diff mbox

[RFC,v2,2/2] hw/pci: handle unassigned pci addresses

Message ID 1378725114-13197-3-git-send-email-marcel.a@redhat.com
State New
Headers show

Commit Message

Marcel Apfelbaum Sept. 9, 2013, 11:11 a.m. UTC
Created a MemoryRegion with negative priority that
spans over all the pci address space.
It "intercepts" the accesses to unassigned pci
address space and will follow the pci spec:
 1. returns -1 on read
 2. does nothing on write
 3. sets the RECEIVED MASTER ABORT bit in the STATUS register
    of the device that initiated the transaction

Note: This implementation assumes that all the reads/writes to
the pci address space are done by the cpu.

Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
---
Changes from v1:
 - "pci-unassigned-mem" MemoryRegion resides now in PCIBus and not on
    various Host Bridges
 - "pci-unassgined-mem" does not have a ".valid.accept" field and
    implements read write methods

 hw/pci/pci.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci/pci_bus.h |  1 +
 2 files changed, 47 insertions(+)

Comments

Michael S. Tsirkin Sept. 9, 2013, 11:40 a.m. UTC | #1
On Mon, Sep 09, 2013 at 02:11:54PM +0300, Marcel Apfelbaum wrote:
> Created a MemoryRegion with negative priority that
> spans over all the pci address space.
> It "intercepts" the accesses to unassigned pci
> address space and will follow the pci spec:
>  1. returns -1 on read
>  2. does nothing on write
>  3. sets the RECEIVED MASTER ABORT bit in the STATUS register
>     of the device that initiated the transaction
> 
> Note: This implementation assumes that all the reads/writes to
> the pci address space are done by the cpu.
> 
> Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
> ---
> Changes from v1:
>  - "pci-unassigned-mem" MemoryRegion resides now in PCIBus and not on
>     various Host Bridges
>  - "pci-unassgined-mem" does not have a ".valid.accept" field and
>     implements read write methods
> 
>  hw/pci/pci.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/pci/pci_bus.h |  1 +
>  2 files changed, 47 insertions(+)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index d00682e..b6a8026 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -283,6 +283,43 @@ const char *pci_root_bus_path(PCIDevice *dev)
>      return rootbus->qbus.name;
>  }
>  
> +static void unassigned_mem_access(PCIBus *bus)
> +{
> +    /* FIXME assumption: memory access to the pci address
> +     * space is always initiated by the host bridge

/* Always
 * like
 * this
 */

/* Not
 * like this */

> +     * (device 0 on the bus) */

Can't we pass the master device in?
We are still left with the assumption that
there's a single master, but at least
we get rid of the assumption that it's
always device 0 function 0.


> +    PCIDevice *d = bus->devices[0];
> +    if (!d) {
> +        return;
> +    }
> +
> +    pci_word_test_and_set_mask(d->config + PCI_STATUS,
> +                               PCI_STATUS_REC_MASTER_ABORT);

Probably should check and set secondary status for
a bridge device.
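
A minimal sketch of that, assuming the is_bridge flag in PCIDeviceClass
and the standard PCI_SEC_STATUS config offset (illustrative only, not
part of this patch):

    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d);
    if (pc->is_bridge) {
        /* Bridges report the abort in their secondary status register. */
        pci_word_test_and_set_mask(d->config + PCI_SEC_STATUS,
                                   PCI_STATUS_REC_MASTER_ABORT);
    } else {
        pci_word_test_and_set_mask(d->config + PCI_STATUS,
                                   PCI_STATUS_REC_MASTER_ABORT);
    }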

> +}
> +
> +static uint64_t unassigned_mem_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    PCIBus *bus = opaque;
> +    unassigned_mem_access(bus);
> +
> +    return -1ULL;
> +}
> +
> +static void unassigned_mem_write(void *opaque, hwaddr addr, uint64_t val,
> +                                unsigned size)
> +{
> +    PCIBus *bus = opaque;
> +    unassigned_mem_access(bus);
> +}
> +
> +static const MemoryRegionOps unassigned_mem_ops = {
> +    .read = unassigned_mem_read,
> +    .write = unassigned_mem_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +};
> +
> +#define UNASSIGNED_MEM_PRIORITY -1
> +
>  static void pci_bus_init(PCIBus *bus, DeviceState *parent,
>                           const char *name,
>                           MemoryRegion *address_space_mem,


I would rename "unassigned" to "pci_master_Abort_" everywhere.

Call things by what they do, not how they are used.

> @@ -294,6 +331,15 @@ static void pci_bus_init(PCIBus *bus, DeviceState *parent,
>      bus->address_space_mem = address_space_mem;
>      bus->address_space_io = address_space_io;
>  
> +
> +    memory_region_init_io(&bus->unassigned_mem, OBJECT(bus),
> +                          &unassigned_mem_ops, bus, "pci-unassigned",
> +                          memory_region_size(bus->address_space_mem));
> +    memory_region_add_subregion_overlap(bus->address_space_mem,
> +                                        bus->address_space_mem->addr,
> +                                        &bus->unassigned_mem,
> +                                        UNASSIGNED_MEM_PRIORITY);
> +
>      /* host bridge */
>      QLIST_INIT(&bus->child);
>

It seems safer to add an API for enabling this functionality.
Something like

pci_set_master_for_master_abort(PCIBus *, PCIDevice *);
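
A sketch of how such an API might look (master_abort_dev here is a
hypothetical new PCIBus field, not existing code):

    void pci_set_master_for_master_abort(PCIBus *bus, PCIDevice *dev)
    {
        bus->master_abort_dev = dev;
    }

    /* unassigned_mem_access() would then use the registered device
     * instead of assuming bus->devices[0]: */
    static void unassigned_mem_access(PCIBus *bus)
    {
        if (bus->master_abort_dev) {
            pci_word_test_and_set_mask(
                bus->master_abort_dev->config + PCI_STATUS,
                PCI_STATUS_REC_MASTER_ABORT);
        }
    }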

  
> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> index 9df1788..4cc25a3 100644
> --- a/include/hw/pci/pci_bus.h
> +++ b/include/hw/pci/pci_bus.h
> @@ -23,6 +23,7 @@ struct PCIBus {
>      PCIDevice *parent_dev;
>      MemoryRegion *address_space_mem;
>      MemoryRegion *address_space_io;
> +    MemoryRegion unassigned_mem;
>  
>      QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
>      QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
> -- 
> 1.8.3.1
Marcel Apfelbaum Sept. 9, 2013, 12:11 p.m. UTC | #2
On Mon, 2013-09-09 at 14:40 +0300, Michael S. Tsirkin wrote:
> On Mon, Sep 09, 2013 at 02:11:54PM +0300, Marcel Apfelbaum wrote:
> > Created a MemoryRegion with negative priority that
> > spans over all the pci address space.
> > It "intercepts" the accesses to unassigned pci
> > address space and will follow the pci spec:
> >  1. returns -1 on read
> >  2. does nothing on write
> >  3. sets the RECEIVED MASTER ABORT bit in the STATUS register
> >     of the device that initiated the transaction
> > 
> > Note: This implementation assumes that all the reads/writes to
> > the pci address space are done by the cpu.
> > 
> > Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
> > ---
> > Changes from v1:
> >  - "pci-unassigned-mem" MemoryRegion resides now in PCIBus and not on
> >     various Host Bridges
> >  - "pci-unassgined-mem" does not have a ".valid.accept" field and
> >     implements read write methods
> > 
> >  hw/pci/pci.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> >  include/hw/pci/pci_bus.h |  1 +
> >  2 files changed, 47 insertions(+)
> > 
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index d00682e..b6a8026 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -283,6 +283,43 @@ const char *pci_root_bus_path(PCIDevice *dev)
> >      return rootbus->qbus.name;
> >  }
> >  
> > +static void unassigned_mem_access(PCIBus *bus)
> > +{
> > +    /* FIXME assumption: memory access to the pci address
> > +     * space is always initiated by the host bridge
> 
> /* Always
>  * like
>  * this
>  */
> 
> /* Not
>  * like this */
OK

> 
> > +     * (device 0 on the bus) */
> 
> Can't we pass the master device in?
> We are still left with the assumption that
> there's a single master, but at least
> we get rid of the assumption that it's
> always device 0 function 0.
> 
> 
> > +    PCIDevice *d = bus->devices[0];
> > +    if (!d) {
> > +        return;
> > +    }
> > +
> > +    pci_word_test_and_set_mask(d->config + PCI_STATUS,
> > +                               PCI_STATUS_REC_MASTER_ABORT);
> 
> Probably should check and set secondary status for
> a bridge device.
I wanted to add the support for bridges later;
I am struggling with some issues with PCI bridges.
 
> 
> > +}
> > +
> > +static uint64_t unassigned_mem_read(void *opaque, hwaddr addr, unsigned size)
> > +{
> > +    PCIBus *bus = opaque;
> > +    unassigned_mem_access(bus);
> > +
> > +    return -1ULL;
> > +}
> > +
> > +static void unassigned_mem_write(void *opaque, hwaddr addr, uint64_t val,
> > +                                unsigned size)
> > +{
> > +    PCIBus *bus = opaque;
> > +    unassigned_mem_access(bus);
> > +}
> > +
> > +static const MemoryRegionOps unassigned_mem_ops = {
> > +    .read = unassigned_mem_read,
> > +    .write = unassigned_mem_write,
> > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > +};
> > +
> > +#define UNASSIGNED_MEM_PRIORITY -1
> > +
> >  static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> >                           const char *name,
> >                           MemoryRegion *address_space_mem,
> 
> 
> I would rename "unassigned" to "pci_master_Abort_" everywhere.
Are you sure about the capital "A"?

For the memory ops I suppose it is OK, but I think the MemoryRegion name
should remain "unassigned_mem"; it will be strange
to name it pci_master_Abort_mem.
> 
> Call things by what they do, not how they are used.
> 
> > @@ -294,6 +331,15 @@ static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> >      bus->address_space_mem = address_space_mem;
> >      bus->address_space_io = address_space_io;
> >  
> > +
> > +    memory_region_init_io(&bus->unassigned_mem, OBJECT(bus),
> > +                          &unassigned_mem_ops, bus, "pci-unassigned",
> > +                          memory_region_size(bus->address_space_mem));
> > +    memory_region_add_subregion_overlap(bus->address_space_mem,
> > +                                        bus->address_space_mem->addr,
> > +                                        &bus->unassigned_mem,
> > +                                        UNASSIGNED_MEM_PRIORITY);
> > +
> >      /* host bridge */
> >      QLIST_INIT(&bus->child);
> >
> 
> It seems safer to add an API for enabling this functionality.
> Something like
> 
> pci_set_master_for_master_abort(PCIBus *, PCIDevice *);
I started to implement it that way, but it would be strange (I think)
to see in the code the above method because almost all the pci devices
can be master devices.

I selected an "implicit" implementation for exactly this reason.
We do not really select a master, but a "master for writes/reads on
pci address space". Selecting device 0 on the bus automatically for
this makes more sense to me.
What do you think?
Marcel 

> 
>   
> > diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> > index 9df1788..4cc25a3 100644
> > --- a/include/hw/pci/pci_bus.h
> > +++ b/include/hw/pci/pci_bus.h
> > @@ -23,6 +23,7 @@ struct PCIBus {
> >      PCIDevice *parent_dev;
> >      MemoryRegion *address_space_mem;
> >      MemoryRegion *address_space_io;
> > +    MemoryRegion unassigned_mem;
> >  
> >      QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
> >      QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
> > -- 
> > 1.8.3.1
>
Michael S. Tsirkin Sept. 9, 2013, 12:23 p.m. UTC | #3
On Mon, Sep 09, 2013 at 03:11:55PM +0300, Marcel Apfelbaum wrote:
> On Mon, 2013-09-09 at 14:40 +0300, Michael S. Tsirkin wrote:
> > On Mon, Sep 09, 2013 at 02:11:54PM +0300, Marcel Apfelbaum wrote:
> > > Created a MemoryRegion with negative priority that
> > > spans over all the pci address space.
> > > It "intercepts" the accesses to unassigned pci
> > > address space and will follow the pci spec:
> > >  1. returns -1 on read
> > >  2. does nothing on write
> > >  3. sets the RECEIVED MASTER ABORT bit in the STATUS register
> > >     of the device that initiated the transaction
> > > 
> > > Note: This implementation assumes that all the reads/writes to
> > > the pci address space are done by the cpu.
> > > 
> > > Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
> > > ---
> > > Changes from v1:
> > >  - "pci-unassigned-mem" MemoryRegion resides now in PCIBus and not on
> > >     various Host Bridges
> > >  - "pci-unassgined-mem" does not have a ".valid.accept" field and
> > >     implements read write methods
> > > 
> > >  hw/pci/pci.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  include/hw/pci/pci_bus.h |  1 +
> > >  2 files changed, 47 insertions(+)
> > > 
> > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > index d00682e..b6a8026 100644
> > > --- a/hw/pci/pci.c
> > > +++ b/hw/pci/pci.c
> > > @@ -283,6 +283,43 @@ const char *pci_root_bus_path(PCIDevice *dev)
> > >      return rootbus->qbus.name;
> > >  }
> > >  
> > > +static void unassigned_mem_access(PCIBus *bus)
> > > +{
> > > +    /* FIXME assumption: memory access to the pci address
> > > +     * space is always initiated by the host bridge
> > 
> > /* Always
> >  * like
> >  * this
> >  */
> > 
> > /* Not
> >  * like this */
> OK
> 
> > 
> > > +     * (device 0 on the bus) */
> > 
> > Can't we pass the master device in?
> > We are still left with the assumption that
> > there's a single master, but at least
> > we get rid of the assumption that it's
> > always device 0 function 0.
> > 
> > 
> > > +    PCIDevice *d = bus->devices[0];
> > > +    if (!d) {
> > > +        return;
> > > +    }
> > > +
> > > +    pci_word_test_and_set_mask(d->config + PCI_STATUS,
> > > +                               PCI_STATUS_REC_MASTER_ABORT);
> > 
> > Probably should check and set secondary status for
> > a bridge device.
> I wanted to add the support for bridges later;
> I am struggling with some issues with PCI bridges.

Shouldn't be very different.

> > 
> > > +}
> > > +
> > > +static uint64_t unassigned_mem_read(void *opaque, hwaddr addr, unsigned size)
> > > +{
> > > +    PCIBus *bus = opaque;
> > > +    unassigned_mem_access(bus);
> > > +
> > > +    return -1ULL;
> > > +}
> > > +
> > > +static void unassigned_mem_write(void *opaque, hwaddr addr, uint64_t val,
> > > +                                unsigned size)
> > > +{
> > > +    PCIBus *bus = opaque;
> > > +    unassigned_mem_access(bus);
> > > +}
> > > +
> > > +static const MemoryRegionOps unassigned_mem_ops = {
> > > +    .read = unassigned_mem_read,
> > > +    .write = unassigned_mem_write,
> > > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > > +};
> > > +
> > > +#define UNASSIGNED_MEM_PRIORITY -1
> > > +
> > >  static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> > >                           const char *name,
> > >                           MemoryRegion *address_space_mem,
> > 
> > 
> > I would rename "unassigned" to "pci_master_Abort_" everywhere.
> Are you sure about the capital "A"?

Ugh no, that's a typo.

> 
> For the memory ops I suppose it is OK, but I think the MemoryRegion name
> should remain "unassigned_mem"; it will be strange
> to name it pci_master_Abort_mem.

Why would it be strange?

> > 
> > Call things by what they do, not how they are used.
> > 
> > > @@ -294,6 +331,15 @@ static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> > >      bus->address_space_mem = address_space_mem;
> > >      bus->address_space_io = address_space_io;
> > >  
> > > +
> > > +    memory_region_init_io(&bus->unassigned_mem, OBJECT(bus),
> > > +                          &unassigned_mem_ops, bus, "pci-unassigned",
> > > +                          memory_region_size(bus->address_space_mem));
> > > +    memory_region_add_subregion_overlap(bus->address_space_mem,
> > > +                                        bus->address_space_mem->addr,
> > > +                                        &bus->unassigned_mem,
> > > +                                        UNASSIGNED_MEM_PRIORITY);
> > > +
> > >      /* host bridge */
> > >      QLIST_INIT(&bus->child);
> > >
> > 
> > It seems safer to add an API for enabling this functionality.
> > Something like
> > 
> > pci_set_master_for_master_abort(PCIBus *, PCIDevice *);
> I started to implement it that way, but it would be strange (I think)
> to see in the code the above method because almost all the pci devices
> can be master devices.

In theory yes; in practice no one programs devices
with addresses that trigger master abort.

Addressing this theoretical issue would require
memory ops to get the bus master pointer.
When this happens the new API can go away,
but we can do this gradually.

> I selected an "implicit" implementation for exactly this reason.
> We do not really select a master, but a "master for writes/reads on
> pci address space". Selecting device 0 on the bus automatically for
> this makes more sense to me.

No because you don't know that device 0 function 0 is the
pci host bridge.

And this won't work for bridges at all.

> What do you think?
> Marcel 
> 
> > 
> >   
> > > diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> > > index 9df1788..4cc25a3 100644
> > > --- a/include/hw/pci/pci_bus.h
> > > +++ b/include/hw/pci/pci_bus.h
> > > @@ -23,6 +23,7 @@ struct PCIBus {
> > >      PCIDevice *parent_dev;
> > >      MemoryRegion *address_space_mem;
> > >      MemoryRegion *address_space_io;
> > > +    MemoryRegion unassigned_mem;
> > >  
> > >      QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
> > >      QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
> > > -- 
> > > 1.8.3.1
> > 
>
Marcel Apfelbaum Sept. 9, 2013, 12:43 p.m. UTC | #4
On Mon, 2013-09-09 at 15:23 +0300, Michael S. Tsirkin wrote:
> On Mon, Sep 09, 2013 at 03:11:55PM +0300, Marcel Apfelbaum wrote:
> > On Mon, 2013-09-09 at 14:40 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Sep 09, 2013 at 02:11:54PM +0300, Marcel Apfelbaum wrote:
> > > > Created a MemoryRegion with negative priority that
> > > > spans over all the pci address space.
> > > > It "intercepts" the accesses to unassigned pci
> > > > address space and will follow the pci spec:
> > > >  1. returns -1 on read
> > > >  2. does nothing on write
> > > >  3. sets the RECEIVED MASTER ABORT bit in the STATUS register
> > > >     of the device that initiated the transaction
> > > > 
> > > > Note: This implementation assumes that all the reads/writes to
> > > > the pci address space are done by the cpu.
> > > > 
> > > > Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
> > > > ---
> > > > Changes from v1:
> > > >  - "pci-unassigned-mem" MemoryRegion resides now in PCIBus and not on
> > > >     various Host Bridges
> > > >  - "pci-unassgined-mem" does not have a ".valid.accept" field and
> > > >     implements read write methods
> > > > 
> > > >  hw/pci/pci.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  include/hw/pci/pci_bus.h |  1 +
> > > >  2 files changed, 47 insertions(+)
> > > > 
> > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > > index d00682e..b6a8026 100644
> > > > --- a/hw/pci/pci.c
> > > > +++ b/hw/pci/pci.c
> > > > @@ -283,6 +283,43 @@ const char *pci_root_bus_path(PCIDevice *dev)
> > > >      return rootbus->qbus.name;
> > > >  }
> > > >  
> > > > +static void unassigned_mem_access(PCIBus *bus)
> > > > +{
> > > > +    /* FIXME assumption: memory access to the pci address
> > > > +     * space is always initiated by the host bridge
> > > 
> > > /* Always
> > >  * like
> > >  * this
> > >  */
> > > 
> > > /* Not
> > >  * like this */
> > OK
> > 
> > > 
> > > > +     * (device 0 on the bus) */
> > > 
> > > Can't we pass the master device in?
> > > We are still left with the assumption that
> > > there's a single master, but at least
> > > we get rid of the assumption that it's
> > > always device 0 function 0.
> > > 
> > > 
> > > > +    PCIDevice *d = bus->devices[0];
> > > > +    if (!d) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    pci_word_test_and_set_mask(d->config + PCI_STATUS,
> > > > +                               PCI_STATUS_REC_MASTER_ABORT);
> > > 
> > > Probably should check and set secondary status for
> > > a bridge device.
> > I wanted to add the support for bridges later;
> > I am struggling with some issues with PCI bridges.
> 
> Shouldn't be very different.
The current implementation uses only one MemoryRegion that spans
over all pci address space. It catches all the accesses both to
primary bus addresses (bus 0) and to the regions covered by the bridges.
The callback ALWAYS receives a pointer to the primary bus.

What would be a better approach?
1. Remain with a single MemoryRegion and check if the address
   belongs to a bridge address range and recursively traverse
   the bridge tree and update the registers
2. Model a MemoryRegion for each bridge that represents
   the address range behind the bridge (not existing today).
   Add a MemoryRegion child to catch accesses to unassigned
   addresses behind the bridge, then recursively traverse the
   bridges to the top and update the registers (see the sketch below).
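
A minimal sketch of that "traverse to the top" step, assuming PCIBus's
parent_dev field and PCI_SEC_STATUS; the helper name is hypothetical:

    static void pci_bus_report_master_abort(PCIBus *bus)
    {
        /* Walk from the bus where the abort happened up to the root,
         * setting the secondary status of every bridge on the way. */
        while (bus->parent_dev) {
            pci_word_test_and_set_mask(bus->parent_dev->config + PCI_SEC_STATUS,
                                       PCI_STATUS_REC_MASTER_ABORT);
            bus = bus->parent_dev->bus;
        }
    }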

> 
> > > 
> > > > +}
> > > > +
> > > > +static uint64_t unassigned_mem_read(void *opaque, hwaddr addr, unsigned size)
> > > > +{
> > > > +    PCIBus *bus = opaque;
> > > > +    unassigned_mem_access(bus);
> > > > +
> > > > +    return -1ULL;
> > > > +}
> > > > +
> > > > +static void unassigned_mem_write(void *opaque, hwaddr addr, uint64_t val,
> > > > +                                unsigned size)
> > > > +{
> > > > +    PCIBus *bus = opaque;
> > > > +    unassigned_mem_access(bus);
> > > > +}
> > > > +
> > > > +static const MemoryRegionOps unassigned_mem_ops = {
> > > > +    .read = unassigned_mem_read,
> > > > +    .write = unassigned_mem_write,
> > > > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > > > +};
> > > > +
> > > > +#define UNASSIGNED_MEM_PRIORITY -1
> > > > +
> > > >  static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> > > >                           const char *name,
> > > >                           MemoryRegion *address_space_mem,
> > > 
> > > 
> > > I would rename "unassigned" to "pci_master_Abort_" everywhere.
> > Are you sure about the capital "A"?
> 
> Ugh no, that's a typo.
> 
> > 
> > For the memory ops I suppose it is OK, but I think the MemoryRegion name
> > should remain "unassigned_mem"; it will be strange
> > to name it pci_master_Abort_mem.
> 
> Why would it be strange?
1. Because it states what happens when there is an access to
   unassigned memory, and not that it IS unassigned memory.
2. This name is already used for unassigned IO ports and
   maybe it is better to follow this convention
> 
> > > 
> > > Call things by what they do, not how they are used.
> > > 
> > > > @@ -294,6 +331,15 @@ static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> > > >      bus->address_space_mem = address_space_mem;
> > > >      bus->address_space_io = address_space_io;
> > > >  
> > > > +
> > > > +    memory_region_init_io(&bus->unassigned_mem, OBJECT(bus),
> > > > +                          &unassigned_mem_ops, bus, "pci-unassigned",
> > > > +                          memory_region_size(bus->address_space_mem));
> > > > +    memory_region_add_subregion_overlap(bus->address_space_mem,
> > > > +                                        bus->address_space_mem->addr,
> > > > +                                        &bus->unassigned_mem,
> > > > +                                        UNASSIGNED_MEM_PRIORITY);
> > > > +
> > > >      /* host bridge */
> > > >      QLIST_INIT(&bus->child);
> > > >
> > > 
> > > It seems safer to add an API for enabling this functionality.
> > > Something like
> > > 
> > > pci_set_master_for_master_abort(PCIBus *, PCIDevice *);
> > I started to implement it that way, but it would be strange (I think)
> > to see in the code the above method because almost all the pci devices
> > can be master devices.
> 
> In theory yes; in practice no one programs devices
> with addresses that trigger master abort.
> 
> Addressing this theoretical issue would require
> memory ops to get the bus master pointer.
> When this happens the new API can go away,
> but we can do this gradually.
> 
> > I selected an "implicit" implementation for exactly this reason.
> > We do not really select a master, but a "master for writes/reads on
> > pci address space". Selecting device 0 on the bus automatically for
> > this makes more sense to me.
> 
> No because you don't know that device 0 function 0 is the
> pci host bridge.
The scenario is covered only for the primary bus and not for buses
behind the PCI bridge (the latter being handled differently).
In this case, isn't the Host Bridge always device 00.0?
If not, can we find a way to scan all bus devices and find
the host bridge so we will not have to manually add it to each
host bridge?

> 
> And this won't work for bridges at all.
For bridges I follow a different path, please see above. 

Thanks!
Marcel

> 
> > What do you think?
> > Marcel 
> > 
> > > 
> > >   
> > > > diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> > > > index 9df1788..4cc25a3 100644
> > > > --- a/include/hw/pci/pci_bus.h
> > > > +++ b/include/hw/pci/pci_bus.h
> > > > @@ -23,6 +23,7 @@ struct PCIBus {
> > > >      PCIDevice *parent_dev;
> > > >      MemoryRegion *address_space_mem;
> > > >      MemoryRegion *address_space_io;
> > > > +    MemoryRegion unassigned_mem;
> > > >  
> > > >      QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
> > > >      QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
> > > > -- 
> > > > 1.8.3.1
> > > 
> >
Peter Maydell Sept. 9, 2013, 12:52 p.m. UTC | #5
On 9 September 2013 13:43, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> The scenario is covered only for the primary bus and not for buses
> behind the PCI bridge (the latter being handled differently).
> In this case, isn't the Host Bridge always device 00.0?

No. For instance the host controller may pass a nonzero
value of devfn_min to pci_bus_new/pci_register_bus (in
which case the host bridge will end up there; example
hw/pci-host/versatile.c) or it might just pass a nonzero
devfn to pci_create_simple when it creates the host controller
PCI device on the bus (I don't think we have anything that
does it that way, though).

> If not, can we find a way to scan all bus devices and find
> the host bridge so we will not have to manually add it to each
> host bridge?

It would be conceptually nicer not to treat host bridges as
a special case but instead to just report the abort back
to whatever the PCI master was (which might be a device
doing DMA). That might be a lot of effort though.

-- PMM
Michael S. Tsirkin Sept. 9, 2013, 12:59 p.m. UTC | #6
On Mon, Sep 09, 2013 at 01:52:11PM +0100, Peter Maydell wrote:
> On 9 September 2013 13:43, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > The scenario is covered only for the primary bus and not for buses
> > behind the PCI bridge (the latter being handled differently).
> > In this case, isn't the Host Bridge always device 00.0?
> 
> No. For instance the host controller may pass a nonzero
> value of devfn_min to pci_bus_new/pci_register_bus (in
> which case the host bridge will end up there; example
> hw/pci-host/versatile.c) or it might just pass a nonzero
> devfn to pci_create_simple when it creates the host controller
> PCI device on the bus (I don't think we have anything that
> does it that way, though).
> 
> > If not, can we find a way to scan all bus devices and find
> > the host bridge so we will not have to manually add it to each
> > host bridge?
> 
> It would be conceptually nicer not to treat host bridges as
> a special case but instead to just report the abort back
> to whatever the PCI master was (which might be a device
> doing DMA). That might be a lot of effort though.
> 
> -- PMM

Yes. As a shortcut, what I suggest is registering the
device that wants to be notified of master aborts with
the bus.
Peter Maydell Sept. 9, 2013, 1:02 p.m. UTC | #7
On 9 September 2013 13:59, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Sep 09, 2013 at 01:52:11PM +0100, Peter Maydell wrote:
>> It would be conceptually nicer not to treat host bridges as
>> a special case but instead to just report the abort back
>> to whatever the PCI master was (which might be a device
>> doing DMA). That might be a lot of effort though.

> Yes. As a shortcut, what I suggest is registering the
> device that wants to be notified of master aborts with
> the bus.

Can you just pick the device which is (a subclass of)
TYPE_PCI_HOST_BRIDGE, or do we have host bridges which
aren't using that class?

-- PMM
Marcel Apfelbaum Sept. 9, 2013, 1:07 p.m. UTC | #8
On Mon, 2013-09-09 at 13:52 +0100, Peter Maydell wrote:
> On 9 September 2013 13:43, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > The scenario is covered only for the primary bus and not for buses
> > behind the PCI bridge (the latter being handled differently).
> > In this case, isn't the Host Bridge always device 00.0?
> 
> No. For instance the host controller may pass a nonzero
> value of devfn_min to pci_bus_new/pci_register_bus (in
> which case the host bridge will end up there; example
> hw/pci-host/versatile.c) or it might just pass a nonzero
> devfn to pci_create_simple when it creates the host controller
> PCI device on the bus (I don't think we have anything that
> does it that way, though).
If my assumption is not generally true, I will not use it.
Thanks!

> 
> > If not, can we find a way to scan all bus devices and find
> > the host bridge so we will not have to manually add it to each
> > host bridge?
> 
> It would be conceptually nicer not to treat host bridges as
> a special case but instead to just report the abort back
> to whatever the PCI master was (which might be a device
> doing DMA). That might be a lot of effort though.
This is exactly my point. ALL devices on the bus can be masters
of a DMA transaction. So adding an interface as suggested by
Michael: pci_set_master_for_master_abort(PCIBus *, PCIDevice *)
for the general case (a device doing DMA) is too far from reality.

However, if we don't want the master device for all accesses
(including DMA) but only for accesses to pci address space,
in this specific case we can appoint the HostBridge
(explicitly/implicitly) as the master device because most
devices do not access pci address space of other devices.

In conclusion, should we call the API
pci_set_master_for_master_abort_on_pci_space?
Marcel




> 
> -- PMM
Marcel Apfelbaum Sept. 9, 2013, 1:15 p.m. UTC | #9
On Mon, 2013-09-09 at 14:02 +0100, Peter Maydell wrote:
> On 9 September 2013 13:59, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Sep 09, 2013 at 01:52:11PM +0100, Peter Maydell wrote:
> >> It would be conceptually nicer not to treat host bridges as
> >> a special case but instead to just report the abort back
> >> to whatever the PCI master was (which might be a device
> >> doing DMA). That might be a lot of effort though.
> 
> > Yes. As a shortcut, what I suggest is registering the
> > device that wants to be notified of master aborts with
> > the bus.
> 
> Can you just pick the device which is (a subclass of)
> TYPE_PCI_HOST_BRIDGE, or do we have host bridges which
> aren't using that class?
This is what I would really want to do, but some HOST Bridge devices
inherit directly from PCI_DEVICE.

TYPE_PCI_HOST_BRIDGE derives from TYPE_SYS_BUS_DEVICE which
is not a PCI device and does not help us here (not a PCI_DEVICE
on the bus)

Strangely TYPE_Q35_HOST_DEVICE derives from TYPE_SYS_BUS_DEVICE
and it holds, by composition, a PCIDevice that will be part of
the bus, as opposed to TYPE_I440FX_PCI_DEVICE which directly
inherits from PCI_DEVICE.

It seems to me like a dead end, or we need to re-factor the current
hierarchy.

Marcel
> 
> -- PMM
Peter Maydell Sept. 9, 2013, 1:16 p.m. UTC | #10
On 9 September 2013 14:07, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> This is exactly my point. ALL devices on the bus can be masters
> of a DMA transaction. So adding an interface as suggested by
> Michael: pci_set_master_for_master_abort(PCIBus *, PCIDevice *)
> for the general case (a device doing DMA) is too far from reality.

Actually I don't think it would be too painful.
At the moment in do_pci_register_device() we do this to
create the memory region used by a device for its bus
master transactions:

    memory_region_init_alias(&pci_dev->bus_master_enable_region,
                             OBJECT(pci_dev), "bus master",
                             dma_as->root, 0,
                             memory_region_size(dma_as->root));

If instead of using this alias directly as the
bus_master_enable region you instead:
 * create a container region
 * create a 'background' region at negative priority
   (ie one per device, and you can make the 'opaque' pointer
   point to the device, not the bus)
 * put the alias and the background region into the container
 * use the container as the bus_master_enable region

then you will get in your callback a pointer to the
device which caused the abort. You can then have your
callback call a method defined on PCIDevice which we
implement:
 * as do-nothing in the PCI device base class
 * as set-the-master-abort bit in the PCI host bridge
   class
(and anybody who wants to get fancy about handling aborts
can override it in their own device implementation)
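
A rough sketch of that construction (bus_master_container,
bus_master_alias, bus_master_abort_mem and master_abort_ops are
illustrative names, not existing PCIDevice members):

    memory_region_init(&pci_dev->bus_master_container, OBJECT(pci_dev),
                       "bus master container",
                       memory_region_size(dma_as->root));
    /* the existing alias, now placed inside the container */
    memory_region_init_alias(&pci_dev->bus_master_alias, OBJECT(pci_dev),
                             "bus master", dma_as->root, 0,
                             memory_region_size(dma_as->root));
    /* background region catching everything the alias doesn't claim;
     * note 'opaque' is the device, not the bus */
    memory_region_init_io(&pci_dev->bus_master_abort_mem, OBJECT(pci_dev),
                          &master_abort_ops, pci_dev, "bus master abort",
                          memory_region_size(dma_as->root));
    memory_region_add_subregion(&pci_dev->bus_master_container, 0,
                                &pci_dev->bus_master_alias);
    memory_region_add_subregion_overlap(&pci_dev->bus_master_container, 0,
                                        &pci_dev->bus_master_abort_mem, -1);
    /* ...and use bus_master_container as the bus_master_enable region */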

That seems achievable without really requiring new
infrastructure. Have I missed something that wouldn't
work if we did this?

thanks
-- PMM
Peter Maydell Sept. 9, 2013, 1:19 p.m. UTC | #11
On 9 September 2013 14:15, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> On Mon, 2013-09-09 at 14:02 +0100, Peter Maydell wrote:
>> Can you just pick the device which is (a subclass of)
>> TYPE_PCI_HOST_BRIDGE, or do we have host bridges which
>> aren't using that class?
> This is what I would really want to do, but some HOST Bridge devices
> inherit directly from PCI_DEVICE.

> TYPE_PCI_HOST_BRIDGE derives from TYPE_SYS_BUS_DEVICE which
> is not a PCI device and does not help us here (not a PCI_DEVICE
> on the bus)

Oops, yes, I get those two the wrong way round a lot. Anyway,
if we need to make all host bridges have a common subclass
we could certainly refactor them accordingly.

> Strangely TYPE_Q35_HOST_DEVICE derives from TYPE_SYS_BUS_DEVICE
> and it holds, by composition, a PCIDevice that will be part of
> the bus, as opposed to TYPE_I440FX_PCI_DEVICE which directly
> inherits from PCI_DEVICE.

This may just be wrong choice of name rather than actually
wrong hierarchy.

-- PMM
Marcel Apfelbaum Sept. 9, 2013, 1:29 p.m. UTC | #12
On Mon, 2013-09-09 at 14:19 +0100, Peter Maydell wrote:
> On 9 September 2013 14:15, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > On Mon, 2013-09-09 at 14:02 +0100, Peter Maydell wrote:
> >> Can you just pick the device which is (a subclass of)
> >> TYPE_PCI_HOST_BRIDGE, or do we have host bridges which
> >> aren't using that class?
> > This is what I would really want to do, but some HOST Bridge devices
> > inherit directly from PCI_DEVICE.
> 
> > TYPE_PCI_HOST_BRIDGE derives from TYPE_SYS_BUS_DEVICE which
> > is a not a PCI device and does not help us here (not a PCI_DEVICE
> > on the bus)
> 
> Oops, yes, I get those two the wrong way round a lot. Anyway,
> if we need to make all host bridges have a common subclass
> we could certainly refactor them accordingly.
> 
> > Strangely TYPE_Q35_HOST_DEVICE derives from TYPE_SYS_BUS_DEVICE
> > and it hold as composition a PCIDevice that will be part of
> > the bus, as opposed to TYPE_I440FX_PCI_DEVICE which directly
> > inherits from PCI_DEVICE.
> 
> This may just be wrong choice of name rather than actually
> wrong hierarchy.
I try not to "judge" the naming convention, so let's leave it
aside for now.
My issue is that we have at least 2 ways to model the bridges:
1. TYPE_PCI_HOST_BRIDGE
   * derives from TYPE_SYS_BUS_DEVICE
   * has a bus
   * one of the bus devices is a TYPE_I440FX_PCI_DEVICE which
    derives from TYPE_PCI_DEVICE
2. TYPE_PCIE_HOST_BRIDGE
   * derives from TYPE_PCI_HOST_BRIDGE which derives
     from TYPE_SYS_BUS_DEVICE
   * has a PciDevice and registers it on the bus in order
     to work as (1)

I would like to implement a hierarchy that will allow
all the host bridge devices to have a common ancestor.
In this way, we can scan the PCI bus to look for the
master...
 

> 
> -- PMM
Peter Maydell Sept. 9, 2013, 1:39 p.m. UTC | #13
On 9 September 2013 14:29, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> My issue is that we have at least 2 ways to model the bridges:
> 1. TYPE_PCI_HOST_BRIDGE
>    * derives from TYPE_SYS_BUS_DEVICE
>    * has a bus
>    * one of the bus devices is a TYPE_I440FX_PCI_DEVICE which
>     derives from TYPE_PCI_DEVICE
> 2. TYPE_PCIE_HOST_BRIDGE
>    * derives from TYPE_PCI_HOST_BRIDGE which derives
>      from TYPE_SYS_BUS_DEVICE
>    * has a PciDevice and registers it on the bus in order
>      to work as (1)

So I think there are two different (and slightly confusing)
things here:
 (1) the model of the host's PCI controller; this is
 typically derived from TYPE_SYS_BUS_DEVICE somehow
 but I guess in theory need not be; often it's a
 TYPE_PCI_HOST_BRIDGE or TYPE_PCIE_HOST_BRIDGE.
 (2) the PCI device which sits on the PCI bus and is
 visible to the guest, usually with a PCI class ID
 of PCI_CLASS_BRIDGE_HOST (but not necessarily; versatile
 is different, for instance). Currently we generally
 make these direct subclasses of TYPE_PCI_DEVICE.

[The naming convention confusion arises because
different controllers are inconsistent about how
they name these classes and which type name ends up
with 'host' in it.]

What you're going to get in the callback is (2)...

> I would like to implement a hierarchy that will allow
> all the host bridge devices to have a common ancestor.
> In this way, we can scan the PCI bus to look for the
> master...

...and yes, we could insert an extra class and make
all those bridge hosts derive from it rather than
directly from TYPE_PCI_DEVICE if we needed to.

-- PMM
Marcel Apfelbaum Sept. 9, 2013, 1:44 p.m. UTC | #14
On Mon, 2013-09-09 at 14:16 +0100, Peter Maydell wrote:
> On 9 September 2013 14:07, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > This is exactly my point. ALL device on the bus can be masters
> > of a DMA transaction. So adding an interface as suggested by
> > Michael: pci_set_master_for_master_abort(PCIBus *, PCIDevice *)
> > for the general case (a device doing DMA) it is too far from reality.
> 
> Actually I don't think it would be too painful.
> At the moment in do_pci_register_device() we do this to
> create the memory region used by a device for its bus
> master transactions:
> 
>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
>                              OBJECT(pci_dev), "bus master",
>                              dma_as->root, 0,
>                              memory_region_size(dma_as->root));
> 
> If instead of using this alias directly as the
> bus_master_enable region you instead:
>  * create a container region
>  * create a 'background' region at negative priority
>    (ie one per device, and you can make the 'opaque' pointer
>    point to the device, not the bus)
>  * put the alias and the background region into the container
>  * use the container as the bus_master_enable region
> 
> then you will get in your callback a pointer to the
> device which caused the abort. You can then have your
> callback call a method defined on PCIDevice which we
> implement:
>  * as do-nothing in the PCI device base class
>  * as set-the-master-abort bit in the PCI host bridge
>    class
The Received Master Abort bit must be set for the initiator.
In the case described here this bit must be set in the
device's register rather than in the host bridge.

> (and anybody who wants to get fancy about handling aborts
> can override it in their own device implementation)
> 
> That seems achievable without really requiring new
> infrastructure. Have I missed something that wouldn't
> work if we did this?
The idea seems correct (and cool!) to me (I'll look deeper),
but it covers only one way: upstream.
We need to unify this with the downstream: cpu accesses to
unassigned memory should result in a callback to the host bridge.

All we need is for all the host bridges to have a common class;
following the same idea we get the downstream (the host bridge initiates
all transactions on the bus on behalf of the cpu).

If this works, we don't need any workaround!
Marcel


> 
> thanks
> -- PMM
Michael S. Tsirkin Sept. 9, 2013, 1:58 p.m. UTC | #15
On Mon, Sep 09, 2013 at 03:43:49PM +0300, Marcel Apfelbaum wrote:
> On Mon, 2013-09-09 at 15:23 +0300, Michael S. Tsirkin wrote:
> > On Mon, Sep 09, 2013 at 03:11:55PM +0300, Marcel Apfelbaum wrote:
> > > On Mon, 2013-09-09 at 14:40 +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Sep 09, 2013 at 02:11:54PM +0300, Marcel Apfelbaum wrote:
> > > > > Created a MemoryRegion with negative priority that
> > > > > spans over all the pci address space.
> > > > > It "intercepts" the accesses to unassigned pci
> > > > > address space and will follow the pci spec:
> > > > >  1. returns -1 on read
> > > > >  2. does nothing on write
> > > > >  3. sets the RECEIVED MASTER ABORT bit in the STATUS register
> > > > >     of the device that initiated the transaction
> > > > > 
> > > > > Note: This implementation assumes that all the reads/writes to
> > > > > the pci address space are done by the cpu.
> > > > > 
> > > > > Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
> > > > > ---
> > > > > Changes from v1:
> > > > >  - "pci-unassigned-mem" MemoryRegion resides now in PCIBus and not on
> > > > >     various Host Bridges
> > > > >  - "pci-unassgined-mem" does not have a ".valid.accept" field and
> > > > >     implements read write methods
> > > > > 
> > > > >  hw/pci/pci.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  include/hw/pci/pci_bus.h |  1 +
> > > > >  2 files changed, 47 insertions(+)
> > > > > 
> > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > > > index d00682e..b6a8026 100644
> > > > > --- a/hw/pci/pci.c
> > > > > +++ b/hw/pci/pci.c
> > > > > @@ -283,6 +283,43 @@ const char *pci_root_bus_path(PCIDevice *dev)
> > > > >      return rootbus->qbus.name;
> > > > >  }
> > > > >  
> > > > > +static void unassigned_mem_access(PCIBus *bus)
> > > > > +{
> > > > > +    /* FIXME assumption: memory access to the pci address
> > > > > +     * space is always initiated by the host bridge
> > > > 
> > > > /* Always
> > > >  * like
> > > >  * this
> > > >  */
> > > > 
> > > > /* Not
> > > >  * like this */
> > > OK
> > > 
> > > > 
> > > > > +     * (device 0 on the bus) */
> > > > 
> > > > Can't we pass the master device in?
> > > > We are still left with the assumption that
> > > > there's a single master, but at least
> > > > we get rid of the assumption that it's
> > > > always device 0 function 0.
> > > > 
> > > > 
> > > > > +    PCIDevice *d = bus->devices[0];
> > > > > +    if (!d) {
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    pci_word_test_and_set_mask(d->config + PCI_STATUS,
> > > > > +                               PCI_STATUS_REC_MASTER_ABORT);
> > > > 
> > > > Probably should check and set secondary status for
> > > > a bridge device.
> > > I wanted to add the support for bridges later;
> > > I am struggling with some issues with PCI bridges.
> > 
> > Shouldn't be very different.
> The current implementation uses only one MemoryRegion that spans
> over all pci address space. It catches all the accesses both to
> primary bus addresses (bus 0) and to the regions covered by the bridges.

That's not what I see in code.

pci_bridge_initfn creates address spaces for IO and memory.
Bridge windows are aliases created by pci_bridge_init_alias.
All they do is forward transactions to the secondary bus,
so these address spaces represent secondary bus addressing.

What did I miss?

> The callback ALWAYS receives a pointer to the primary bus.
> 
> What would be a better approach?
> 1. Remain with a single MemoryRegion and check if the address
>    belongs to a bridge address range and recursively traverse
>    the bridge tree and update the registers
> 2. Model a MemoryRegion for each bridge that represents
>    the address range behind the bridge (not existing today).

I think this is exactly what address_space_XXX represent.

>    Add a MemoryRegion child to catch accesses to unassigned
>    addresses behind the bridge, then recursively traverse the
>    bridges to the top and update the registers. 

This bit sounds right.

> 
> > 
> > > > 
> > > > > +}
> > > > > +
> > > > > +static uint64_t unassigned_mem_read(void *opaque, hwaddr addr, unsigned size)
> > > > > +{
> > > > > +    PCIBus *bus = opaque;
> > > > > +    unassigned_mem_access(bus);
> > > > > +
> > > > > +    return -1ULL;
> > > > > +}
> > > > > +
> > > > > +static void unassigned_mem_write(void *opaque, hwaddr addr, uint64_t val,
> > > > > +                                unsigned size)
> > > > > +{
> > > > > +    PCIBus *bus = opaque;
> > > > > +    unassigned_mem_access(bus);
> > > > > +}
> > > > > +
> > > > > +static const MemoryRegionOps unassigned_mem_ops = {
> > > > > +    .read = unassigned_mem_read,
> > > > > +    .write = unassigned_mem_write,
> > > > > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > > > > +};
> > > > > +
> > > > > +#define UNASSIGNED_MEM_PRIORITY -1
> > > > > +
> > > > >  static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> > > > >                           const char *name,
> > > > >                           MemoryRegion *address_space_mem,
> > > > 
> > > > 
> > > > I would rename "unassigned" to "pci_master_Abort_" everywhere.
> > > Are you sure about the capital "A"?
> > 
> > Ugh no, that's a typo.
> > 
> > > 
> > > For the memory ops I suppose it is OK, but I think the MemoryRegion name
> > > should remain "unassigned_mem"; it will be strange
> > > to name it pci_master_Abort_mem.
> > 
> > Why would it be strange?
> 1. Because it states what happens when there is an access to
>    unassigned memory, and not that it IS unassigned memory.
> 2. This name is already used for unassigned IO ports and
>    maybe it is better to follow this convention
> > 
> > > > 
> > > > Call things by what they do, not how they are used.
> > > > 
> > > > > @@ -294,6 +331,15 @@ static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> > > > >      bus->address_space_mem = address_space_mem;
> > > > >      bus->address_space_io = address_space_io;
> > > > >  
> > > > > +
> > > > > +    memory_region_init_io(&bus->unassigned_mem, OBJECT(bus),
> > > > > +                          &unassigned_mem_ops, bus, "pci-unassigned",
> > > > > +                          memory_region_size(bus->address_space_mem));
> > > > > +    memory_region_add_subregion_overlap(bus->address_space_mem,
> > > > > +                                        bus->address_space_mem->addr,
> > > > > +                                        &bus->unassigned_mem,
> > > > > +                                        UNASSIGNED_MEM_PRIORITY);
> > > > > +
> > > > >      /* host bridge */
> > > > >      QLIST_INIT(&bus->child);
> > > > >
> > > > 
> > > > It seems safer to add an API for enabling this functionality.
> > > > Something like
> > > > 
> > > > pci_set_master_for_master_abort(PCIBus *, PCIDevice *);
> > > I started to implement it that way, but it would be strange (I think)
> > > to see in the code the above method because almost all the pci devices
> > > can be master devices.
> > 
> > In theory yes; in practice no one programs devices
> > with addresses that trigger master abort.
> > 
> > Addressing this theoretical issue would require
> > memory ops to get the bus master pointer.
> > When this happens the new API can go away,
> > but we can do this gradually.
> > 
> > > I selected an "implicit" implementation for exactly this reason.
> > > We do not really select a master, but a "master for writes/reads on
> > > pci address space". Selecting device 0 on the bus automatically for
> > > this makes more sense to me.
> > 
> > No because you don't know that device 0 function 0 is the
> > pci host bridge.
> The scenario is covered only for the primary bus and not for buses
> behind the PCI bridge (the latter being handled differently).
> In this case, isn't the Host Bridge always device 00.0?
> If not, can we find a way to scan all bus devices and find
> the host bridge so we will not have to manually add it to each
> host bridge?
> 
> > 
> > And this won't work for bridges at all.
> For bridges I follow a different path, please see above. 
> 
> Thanks!
> Marcel
> 
> > 
> > > What do you think?
> > > Marcel 
> > > 
> > > > 
> > > >   
> > > > > diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> > > > > index 9df1788..4cc25a3 100644
> > > > > --- a/include/hw/pci/pci_bus.h
> > > > > +++ b/include/hw/pci/pci_bus.h
> > > > > @@ -23,6 +23,7 @@ struct PCIBus {
> > > > >      PCIDevice *parent_dev;
> > > > >      MemoryRegion *address_space_mem;
> > > > >      MemoryRegion *address_space_io;
> > > > > +    MemoryRegion unassigned_mem;
> > > > >  
> > > > >      QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
> > > > >      QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
> > > > > -- 
> > > > > 1.8.3.1
> > > > 
> > > 
>
Michael S. Tsirkin Sept. 9, 2013, 1:59 p.m. UTC | #16
On Mon, Sep 09, 2013 at 02:02:47PM +0100, Peter Maydell wrote:
> On 9 September 2013 13:59, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Sep 09, 2013 at 01:52:11PM +0100, Peter Maydell wrote:
> >> It would be conceptually nicer not to treat host bridges as
> >> a special case but instead to just report the abort back
> >> to whatever the PCI master was (which might be a device
> >> doing DMA). That might be a lot of effort though.
> 
> > Yes. As a shortcut, what I suggest is registering the
> > device that wants to be notified of master aborts with
> > the bus.
> 
> Can you just pick the device which is (a subclass of)
> TYPE_PCI_HOST_BRIDGE, or do we have host bridges which
> aren't using that class?
> 
> -- PMM

Not anymore I think. So yes, I think this will work.
Michael S. Tsirkin Sept. 9, 2013, 2:01 p.m. UTC | #17
On Mon, Sep 09, 2013 at 04:07:53PM +0300, Marcel Apfelbaum wrote:
> On Mon, 2013-09-09 at 13:52 +0100, Peter Maydell wrote:
> > On 9 September 2013 13:43, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > > The scenario is covered only for the primary bus and not for buses
> > > behind the PCI bridge (the latter being handled differently).
> > > In this case, isn't the Host Bridge always device 00.0?
> > 
> > No. For instance the host controller may pass a nonzero
> > value of devfn_min to pci_bus_new/pci_register_bus (in
> > which case the host bridge will end up there; example
> > hw/pci-host/versatile.c) or it might just pass a nonzero
> > devfn to pci_create_simple when it creates the host controller
> > PCI device on the bus (I don't think we have anything that
> > does it that way, though).
> If my assumption is not generally true, I will not use it.
> Thanks!
> 
> > 
> > > If not, can we find a way to scan all bus devices and find
> > > the host bridge so we will not have to manually add it to each
> > > host bridge?
> > 
> > It would be conceptually nicer not to treat host bridges as
> > a special case but instead to just report the abort back
> > to whatever the PCI master was (which might be a device
> > doing DMA). That might be a lot of effort though.
> This is exactly my point. ALL devices on the bus can be masters
> of a DMA transaction. So adding an interface as suggested by
> Michael: pci_set_master_for_master_abort(PCIBus *, PCIDevice *)
> for the general case (a device doing DMA) is too far from reality.
> 
> However, if we don't want the master device for all accesses
> (including DMA) but only for accesses to pci address space,
> in this specific case we can appoint the HostBridge
> (explicitly/implicitly) as the master device because most
> devices do not access pci address space of other devices.
> 
> In conclusion, should we call the API
> pci_set_master_for_master_abort_on_pci_space?
> Marcel

I like Peter's idea of detecting a host bridge
implicitly using device type.
With a big FIXME explaining that we shouldn't
need to do this.


> 
> 
> 
> > 
> > -- PMM
>
Marcel Apfelbaum Sept. 9, 2013, 2:04 p.m. UTC | #18
On Mon, 2013-09-09 at 14:39 +0100, Peter Maydell wrote:
> On 9 September 2013 14:29, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > My issue is that we have at least 2 ways to model the bridges:
> > 1. TYPE_PCI_HOST_BRIDGE
> >    * derives from TYPE_SYS_BUS_DEVICE
> >    * has a bus
> >    * one of the bus devices is a TYPE_I440FX_PCI_DEVICE which
> >     derives from TYPE_PCI_DEVICE
> > 2. TYPE_PCIE_HOST_BRIDGE
> >    * derives from TYPE_PCI_HOST_BRIDGE which derives
> >      from TYPE_SYS_BUS_DEVICE
> >    * has a PciDevice and registers it on the bus in order
> >      to work as (1)
> 
> So I think there are two different (and slightly confusing)
> things here:
>  (1) the model of the host's PCI controller; this is
>  typically derived from TYPE_SYS_BUS_DEVICE somehow
>  but I guess in theory need not be; often it's a
>  TYPE_PCI_HOST_BRIDGE or TYPE_PCIE_HOST_BRIDGE.
>  (2) the PCI device which sits on the PCI bus and is
>  visible to the guest, usually with a PCI class ID
>  of PCI_CLASS_BRIDGE_HOST (but not necessarily; versatile
>  is different, for instance). Currently we generally
>  make these direct subclasses of TYPE_PCI_DEVICE.
> 
> [The naming convention confusion arises because
> different controllers are inconsistent about how
> they name these classes and which type name ends up
> with 'host' in it.]
> 
> What you're going to get in the callback is (2)...
This I have come to understand, thanks!
> 
> > I would like to implement an hierarchy that will allow
> > all the host bridge devices to have a common ancestor
> > In this was, we can scan the PCI bus to look for
> > master...
> 
> ...and yes, we could insert an extra class and make
> all those bridge hosts derive from it rather than
> directly from TYPE_PCI_DEVICE if we needed to.
And we solve the main issue - no need for an API for master abort
- for upstream: use your suggestion (let's hope it works)
- for downstream: look on the bus for a device deriving
  from this class and *this device* will handle the
  master abort
I'll find a way how to handle the PCI bridges.

By the way, I am not sure that the upstream transactions (DMA)
can actually end with a master abort. Master abort would happen
if a transaction is not claimed by any device on the bus.
But in DMA transactions, maybe the host bridge will always claim it
and return an error. Does anybody know something about this?

Thanks for all the help!
Marcel




> 
> -- PMM
Michael S. Tsirkin Sept. 9, 2013, 2:04 p.m. UTC | #19
On Mon, Sep 09, 2013 at 04:29:04PM +0300, Marcel Apfelbaum wrote:
> On Mon, 2013-09-09 at 14:19 +0100, Peter Maydell wrote:
> > On 9 September 2013 14:15, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > > On Mon, 2013-09-09 at 14:02 +0100, Peter Maydell wrote:
> > >> Can you just pick the device which is (a subclass of)
> > >> TYPE_PCI_HOST_BRIDGE, or do we have host bridges which
> > >> aren't using that class?
> > > This is what I would really want to do, but some HOST Bridge devices
> > > inherit directly from PCI_DEVICE.
> > 
> > > TYPE_PCI_HOST_BRIDGE derives from TYPE_SYS_BUS_DEVICE which
> > > is not a PCI device and does not help us here (not a PCI_DEVICE
> > > on the bus)
> > 
> > Oops, yes, I get those two the wrong way round a lot. Anyway,
> > if we need to make all host bridges have a common subclass
> > we could certainly refactor them accordingly.
> > 
> > > Strangely TYPE_Q35_HOST_DEVICE derives from TYPE_SYS_BUS_DEVICE
> > > and it holds, by composition, a PCIDevice that will be part of
> > > the bus, as opposed to TYPE_I440FX_PCI_DEVICE which directly
> > > inherits from PCI_DEVICE.
> > 
> > This may just be wrong choice of name rather than actually
> > wrong hierarchy.
> I try not to "judge" the naming convention, so let's leave it
> aside for now.
> My issue is that we have at least 2 ways to model the bridges:
> 1. TYPE_PCI_HOST_BRIDGE
>    * derives from TYPE_SYS_BUS_DEVICE
>    * has a bus
>    * one of the bus devices is a TYPE_I440FX_PCI_DEVICE which
>     derives from TYPE_PCI_DEVICE
> 2. TYPE_PCIE_HOST_BRIDGE
>    * derives from TYPE_PCI_HOST_BRIDGE which derives
>      from TYPE_SYS_BUS_DEVICE
>    * has a PciDevice and registers it on the bus in order
>      to work as (1)
> 
> I would like to implement a hierarchy that will allow
> all the host bridge devices to have a common ancestor.
> In this way, we can scan the PCI bus to look for the
> master...

I wouldn't object to an is_host field in struct PCIDeviceClass.
That's probably easier than trying to create
a common class for devices that share no common code.


> 
> > 
> > -- PMM
>
Marcel Apfelbaum Sept. 9, 2013, 2:10 p.m. UTC | #20
On Mon, 2013-09-09 at 16:58 +0300, Michael S. Tsirkin wrote:
> On Mon, Sep 09, 2013 at 03:43:49PM +0300, Marcel Apfelbaum wrote:
> > On Mon, 2013-09-09 at 15:23 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Sep 09, 2013 at 03:11:55PM +0300, Marcel Apfelbaum wrote:
> > > > On Mon, 2013-09-09 at 14:40 +0300, Michael S. Tsirkin wrote:
> > > > > On Mon, Sep 09, 2013 at 02:11:54PM +0300, Marcel Apfelbaum wrote:
> > > > > > Created a MemoryRegion with negative priority that
> > > > > > spans over all the pci address space.
> > > > > > It "intercepts" the accesses to unassigned pci
> > > > > > address space and will follow the pci spec:
> > > > > >  1. returns -1 on read
> > > > > >  2. does nothing on write
> > > > > >  3. sets the RECEIVED MASTER ABORT bit in the STATUS register
> > > > > >     of the device that initiated the transaction
> > > > > > 
> > > > > > Note: This implementation assumes that all the reads/writes to
> > > > > > the pci address space are done by the cpu.
> > > > > > 
> > > > > > Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
> > > > > > ---
> > > > > > Changes from v1:
> > > > > >  - "pci-unassigned-mem" MemoryRegion resides now in PCIBus and not on
> > > > > >     various Host Bridges
> > > > > >  - "pci-unassgined-mem" does not have a ".valid.accept" field and
> > > > > >     implements read write methods
> > > > > > 
> > > > > >  hw/pci/pci.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >  include/hw/pci/pci_bus.h |  1 +
> > > > > >  2 files changed, 47 insertions(+)
> > > > > > 
> > > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > > > > index d00682e..b6a8026 100644
> > > > > > --- a/hw/pci/pci.c
> > > > > > +++ b/hw/pci/pci.c
> > > > > > @@ -283,6 +283,43 @@ const char *pci_root_bus_path(PCIDevice *dev)
> > > > > >      return rootbus->qbus.name;
> > > > > >  }
> > > > > >  
> > > > > > +static void unassigned_mem_access(PCIBus *bus)
> > > > > > +{
> > > > > > +    /* FIXME assumption: memory access to the pci address
> > > > > > +     * space is always initiated by the host bridge
> > > > > 
> > > > > /* Always
> > > > >  * like
> > > > >  * this
> > > > >  */
> > > > > 
> > > > > /* Not
> > > > >  * like this */
> > > > OK
> > > > 
> > > > > 
> > > > > > +     * (device 0 on the bus) */
> > > > > 
> > > > > Can't we pass the master device in?
> > > > > We are still left with the assumption that
> > > > > there's a single master, but at least
> > > > > we get rid of the assumption that it's
> > > > > always device 0 function 0.
> > > > > 
> > > > > 
> > > > > > +    PCIDevice *d = bus->devices[0];
> > > > > > +    if (!d) {
> > > > > > +        return;
> > > > > > +    }
> > > > > > +
> > > > > > +    pci_word_test_and_set_mask(d->config + PCI_STATUS,
> > > > > > +                               PCI_STATUS_REC_MASTER_ABORT);
> > > > > 
> > > > > Probably should check and set secondary status for
> > > > > a bridge device.
> > > > I wanted to add the support for bridges later,
> > > > I am struggling with some issues with PCI bridges.
> > > 
> > > Shouldn't be very different.
> > The current implementation uses only one MemoryRegion that spans
> > over the whole pci address space. It catches all the accesses, both to
> > primary bus addresses (bus 0) and to the regions covered by the bridges.
> 
> That's not what I see in code.
> 
> pci_bridge_initfn creates address spaces for IO and memory.
> Bridge windows are aliases created by pci_bridge_init_alias.
> All they do is forward transactions to the secondary bus,
> so these address spaces represent secondary bus addressing.
> 
> What did I miss?
I was referring to *my* implementation of the "unassigned mem" region,
not to the pci bridge's regions.
> 
> > The callback ALWAYS receives a pointer to the primary bus.
> > 
> > What would be a better approach?
> > 1. Remain with a single MemoryRegion and check if the address
> >    belongs to a bridge address range and recursively travel
> >    the bridges tree and update the registers
> > 2. Model a MemoryRegion for each bridge that represents
> >    the address range behind the bridge (not existing today).
> 
> I think this is exactly what address_space_XXX represent.
Today I see (using info mtree) under the bridge:
 1. A container memory region for the whole space 0 - 2^64
 2. Regions for the devices under the bridge
I do not see a region corresponding to the bridge "memory region"
that would be forwarded to the secondary bus.

Thanks,
Marcel

>  
> >    Add a MemoryRegion child to catch accesses to unassigned
> >    addresses behind the bridge, then recursively traverse the
> >    bridges up to the top and update the registers.
> 
> This bit sounds right.
> 
> > 
> > > 
> > > > > 
> > > > > > +}
> > > > > > +
> > > > > > +static uint64_t unassigned_mem_read(void *opaque, hwaddr addr, unsigned size)
> > > > > > +{
> > > > > > +    PCIBus *bus = opaque;
> > > > > > +    unassigned_mem_access(bus);
> > > > > > +
> > > > > > +    return -1ULL;
> > > > > > +}
> > > > > > +
> > > > > > +static void unassigned_mem_write(void *opaque, hwaddr addr, uint64_t val,
> > > > > > +                                unsigned size)
> > > > > > +{
> > > > > > +    PCIBus *bus = opaque;
> > > > > > +    unassigned_mem_access(bus);
> > > > > > +}
> > > > > > +
> > > > > > +static const MemoryRegionOps unassigned_mem_ops = {
> > > > > > +    .read = unassigned_mem_read,
> > > > > > +    .write = unassigned_mem_write,
> > > > > > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > > > > > +};
> > > > > > +
> > > > > > +#define UNASSIGNED_MEM_PRIORITY -1
> > > > > > +
> > > > > >  static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> > > > > >                           const char *name,
> > > > > >                           MemoryRegion *address_space_mem,
> > > > > 
> > > > > 
> > > > > I would rename "unassigned" to "pci_master_Abort_" everywhere.
> > > > Are you sure about the capital "A"?
> > > 
> > > Ugh no, that's a typo.
> > > 
> > > > 
> > > > For the memory ops I suppose is OK, but the MemoryRegion name
> > > > I think it should remain "unassigned_mem"; it will be strange
> > > > to name it pci_master_Abort_mem.
> > > 
> > > Why would it be strange?
> > 1. Because it states what happens when there is an access to
> >    unassigned memory, not that it IS unassigned memory.
> > 2. This name is already used for unassigned IO ports and
> >    maybe it is better to follow this convention
> > > 
> > > > > 
> > > > > Call things by what they do not how they are used.
> > > > > 
> > > > > > @@ -294,6 +331,15 @@ static void pci_bus_init(PCIBus *bus, DeviceState *parent,
> > > > > >      bus->address_space_mem = address_space_mem;
> > > > > >      bus->address_space_io = address_space_io;
> > > > > >  
> > > > > > +
> > > > > > +    memory_region_init_io(&bus->unassigned_mem, OBJECT(bus),
> > > > > > +                          &unassigned_mem_ops, bus, "pci-unassigned",
> > > > > > +                          memory_region_size(bus->address_space_mem));
> > > > > > +    memory_region_add_subregion_overlap(bus->address_space_mem,
> > > > > > +                                        bus->address_space_mem->addr,
> > > > > > +                                        &bus->unassigned_mem,
> > > > > > +                                        UNASSIGNED_MEM_PRIORITY);
> > > > > > +
> > > > > >      /* host bridge */
> > > > > >      QLIST_INIT(&bus->child);
> > > > > >
> > > > > 
> > > > > It seems safer to add an API for enabling this functionality.
> > > > > Something like
> > > > > 
> > > > > pci_set_master_for_master_abort(PCIBus *, PCIDevice *);
> > > > I started to implement it that way, but it would be strange (I think)
> > > > to see in the code the above method because almost all the pci devices
> > > > can be master devices.
> > > 
> > > In theory yes, in practice no one programs devices
> > > with addresses that trigger master abort.
> > > 
> > > Addressing this theoretical issue would require
> > > memory ops to get the bus master pointer.
> > > When this happens the new API can go away,
> > > but we can do this gradually.
> > > 
> > > > I selected an "implicit" implementation for exactly this reason.
> > > > We do not really select a master, but a "master for writes/reads on
> > > > pci address space". Selecting device 0 on the bus automatically for
> > > > this makes more sense to me.
> > > 
> > > No because you don't know that device 0 function 0 is the
> > > pci host bridge.
> > The scenario is covered only for the primary bus and not for buses
> > behind the PCI bridge (the latter being handled differently).
> > In this case, isn't the Host Bridge always device 00.0?
> > If not, can we find a way to scan all bus devices and find
> > the host bridge so we will not have to manually add it to each
> > host bridge?
> > 
> > > 
> > > And this won't work for bridges at all.
> > For bridges I follow a different path, please see above. 
> > 
> > Thanks!
> > Marcel
> > 
> > > 
> > > > What do you think?
> > > > Marcel 
> > > > 
> > > > > 
> > > > >   
> > > > > > diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> > > > > > index 9df1788..4cc25a3 100644
> > > > > > --- a/include/hw/pci/pci_bus.h
> > > > > > +++ b/include/hw/pci/pci_bus.h
> > > > > > @@ -23,6 +23,7 @@ struct PCIBus {
> > > > > >      PCIDevice *parent_dev;
> > > > > >      MemoryRegion *address_space_mem;
> > > > > >      MemoryRegion *address_space_io;
> > > > > > +    MemoryRegion unassigned_mem;
> > > > > >  
> > > > > >      QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
> > > > > >      QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
> > > > > > -- 
> > > > > > 1.8.3.1
> > > > > 
> > > > 
> >
Marcel Apfelbaum Sept. 9, 2013, 2:16 p.m. UTC | #21
On Mon, 2013-09-09 at 17:04 +0300, Michael S. Tsirkin wrote:
> On Mon, Sep 09, 2013 at 04:29:04PM +0300, Marcel Apfelbaum wrote:
> > On Mon, 2013-09-09 at 14:19 +0100, Peter Maydell wrote:
> > > On 9 September 2013 14:15, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > > > On Mon, 2013-09-09 at 14:02 +0100, Peter Maydell wrote:
> > > >> Can you just pick the device which is (a subclass of)
> > > >> TYPE_PCI_HOST_BRIDGE, or do we have host bridges which
> > > >> aren't using that class?
> > > > This is what I would really want to do, but some HOST Bridge devices
> > > > inherit directly from PCI_DEVICE.
> > > 
> > > > TYPE_PCI_HOST_BRIDGE derives from TYPE_SYS_BUS_DEVICE which
> > > > is not a PCI device and does not help us here (not a PCI_DEVICE
> > > > on the bus)
> > > 
> > > Oops, yes, I get those two the wrong way round a lot. Anyway,
> > > if we need to make all host bridges have a common subclass
> > > we could certainly refactor them accordingly.
> > > 
> > > > Strangely TYPE_Q35_HOST_DEVICE derives from TYPE_SYS_BUS_DEVICE
> > > > and it holds, by composition, a PCIDevice that will be part of
> > > > the bus, as opposed to TYPE_I440FX_PCI_DEVICE which directly
> > > > inherits from PCI_DEVICE.
> > > 
> > > This may just be wrong choice of name rather than actually
> > > wrong hierarchy.
> > I try not to "judge" the naming convention, so let's leave it
> > aside for now.
> > My issue is that we have at least 2 ways to model the bridges:
> > 1. TYPE_PCI_HOST_BRIDGE
> >    * derives from TYPE_SYS_BUS_DEVICE
> >    * has a bus
> >    * one of the bus devices is a TYPE_I440FX_PCI_DEVICE which
> >     derives from TYPE_PCI_DEVICE
> > 2. TYPE_PCIE_HOST_BRIDGE
> >    * derives from TYPE_PCI_HOST_BRIDGE which derives
> >      from TYPE_SYS_BUS_DEVICE
> >    * has a PCIDevice and registers it with the bus in order
> >      to work as (1)
> > 
> > I would like to implement an hierarchy that will allow
> > all the host bridge devices to have a common ancestor
> > In this way, we can scan the PCI bus to look for
> > master...
> 
> I wouldn't object to an is_host flag in struct PCIDeviceClass.
> That's probably easier than trying to create
> a common class for devices that share no common code.
This will work too, and with fewer changes to the code.
However, Peter suggested that we can unify both upstream
and downstream handling of the master abort by putting
it all in this class.
But if we have an "is_host" flag, we can differentiate
regular devices from host devices and handle them differently.
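
As a rough sketch (the field and the helper are invented here, not in
the tree), the flag variant could look like:

/* hypothetical addition to PCIDeviceClass:  bool is_host;
 * set to true by host bridge device classes */
static bool pci_is_host_device(PCIDevice *d)
{
    PCIDeviceClass *k = PCI_DEVICE_GET_CLASS(d);

    return k->is_host;
}

A bus scan could then test pci_is_host_device() instead of casting to
a common ancestor type.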

If there are no objections, I will choose this path.
Thanks!
Marcel




> 
> 
> > 
> > > 
> > > -- PMM
> >
Peter Maydell Sept. 9, 2013, 2:21 p.m. UTC | #22
On 9 September 2013 15:04, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> By the way, I am not sure that the upstream transactions (DMA)
> can actually end with a master abort. Master abort would happen
> if a transaction will not be claimed by any device on the bus.
> But in DMA transactions, maybe the host bridge will always claim it
> and return with error. Does anybody know something about this?

No, it's perfectly possible for a bus master transaction
to abort. The PC's host controller happens to be set up so
that bus master DMA covers the whole of the PCI memory space
and so it's probably not possible to get an abort on that
platform, but this isn't necessarily the case. For instance
the versatilePB's PCI controller only responds to accesses
within its programmed MMIO BAR ranges, so if the device
or the controller have been misconfigured you can get an
abort when the device tries to do DMA. (This usually causes
the device to decide something has gone seriously wrong.
For instance an EHCI USB controller device will stop
and set the STS_FATAL bit in its status register:
http://lxr.free-electrons.com/source/include/linux/usb/ehci_def.h#L96

If we wanted to implement this correctly we would need
to be able to return an "access succeeded/failed" indicator
from dma_memory_write(): check hw/usb/hcd-ehci.c:get_dwords()
(which already has the logic for "whoops, DMA failed"
for the extremely simple case of "no DMA address space").
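
As a sketch of what that could enable (everything here is hypothetical:
assume dma_memory_write() returned a success/failure code, and
DEV_STS_FATAL / device_halt() stand in for a device's fatal-error bit
and stop routine):

if (dma_memory_write(dev->as, addr, buf, len) != 0) {
    dev->status |= DEV_STS_FATAL;   /* report a host/system error */
    device_halt(dev);               /* stop processing, as real HW does */
}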

-- PMM
Marcel Apfelbaum Sept. 9, 2013, 2:51 p.m. UTC | #23
On Mon, 2013-09-09 at 15:21 +0100, Peter Maydell wrote:
> On 9 September 2013 15:04, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > By the way, I am not sure that the upstream transactions (DMA)
> > can actually end with a master abort. Master abort would happen
> > if a transaction will not be claimed by any device on the bus.
> > But in DMA transactions, maybe the host bridge will always claim it
> > and return with error. Does anybody know something about this?
> 
> No, it's perfectly possible for a bus master transaction
> to abort. The PC's host controller happens to be set up so
> that bus master DMA covers the whole of the PCI memory space
> and so it's probably not possible to get an abort on that
> platform, but this isn't necessarily the case. For instance
> the versatilePB's PCI controller only responds to accesses
> within its programmed MMIO BAR ranges, so if the device
> or the controller have been misconfigured you can get an
> abort when the device tries to do DMA. (This usually causes
> the device to decide something has gone seriously wrong.
Thanks. I am not familiar with the versatilePB; I may be able
to code it, but I don't know how to test it.
Marcel


> For instance an EHCI USB controller device will stop
> and set the STS_FATAL bit in its status register:
> http://lxr.free-electrons.com/source/include/linux/usb/ehci_def.h#L96
> 
> If we wanted to implement this correctly we would need
> to be able to return an "access succeeded/failed" indicator
> from dma_memory_write(): check hw/usb/hcd-ehci.c:get_dwords()
> (which already has the logic for "whoops, DMA failed"
> for the extremely simple case of "no DMA address space").
> 
> -- PMM
Peter Maydell Sept. 9, 2013, 2:58 p.m. UTC | #24
On 9 September 2013 15:51, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> On Mon, 2013-09-09 at 15:21 +0100, Peter Maydell wrote:
>> No, it's perfectly possible for a bus master transaction
>> to abort. The PC's host controller happens to be set up so
>> that bus master DMA covers the whole of the PCI memory space
>> and so it's probably not possible to get an abort on that
>> platform, but this isn't necessarily the case. For instance
>> the versatilePB's PCI controller only responds to accesses
>> within its programmed MMIO BAR ranges, so if the device
>> or the controller have been misconfigured you can get an
>> abort when the device tries to do DMA. (This usually causes
>> the device to decide something has gone seriously wrong.
> Thanks, I am not familiar with versatilePB, I may be able
> to code it, I don't know how to test it

Don't worry about testing versatilePB particularly; you
just need to make sure your code can cope with master
aborts by device-initiated transactions.

-- PMM
Michael S. Tsirkin Sept. 9, 2013, 3:54 p.m. UTC | #25
On Mon, Sep 09, 2013 at 03:21:44PM +0100, Peter Maydell wrote:
> On 9 September 2013 15:04, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > By the way, I am not sure that the upstream transactions (DMA)
> > can actually end with a master abort. Master abort would happen
> > if a transaction will not be claimed by any device on the bus.
> > But in DMA transactions, maybe the host bridge will always claim it
> > and return with error. Does anybody know something about this?
> 
> No, it's perfectly possible for a bus master transaction
> to abort. The PC's host controller happens to be set up so
> that bus master DMA covers the whole of the PCI memory space
> and so it's probably not possible to get an abort on that
> platform, but this isn't necessarily the case. For instance
> the versatilePB's PCI controller only responds to accesses
> within its programmed MMIO BAR ranges, so if the device
> or the controller have been misconfigured you can get an
> abort when the device tries to do DMA. (This usually causes
> the device to decide something has gone seriously wrong.
> For instance an EHCI USB controller device will stop
> and set the STS_FATAL bit in its status register:
> http://lxr.free-electrons.com/source/include/linux/usb/ehci_def.h#L96
> 
> If we wanted to implement this correctly we would need
> to be able to return an "access succeeded/failed" indicator
> from dma_memory_write(): check hw/usb/hcd-ehci.c:get_dwords()
> (which already has the logic for "whoops, DMA failed"
> for the extremely simple case of "no DMA address space").
> 
> -- PMM

Yes, but that's not enough: you also need to set
the PCI status accordingly (and optionally AER, if enabled...).
Michael S. Tsirkin Sept. 9, 2013, 4 p.m. UTC | #26
On Mon, Sep 09, 2013 at 03:58:36PM +0100, Peter Maydell wrote:
> On 9 September 2013 15:51, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > On Mon, 2013-09-09 at 15:21 +0100, Peter Maydell wrote:
> >> No, it's perfectly possible for a bus master transaction
> >> to abort. The PC's host controller happens to be set up so
> >> that bus master DMA covers the whole of the PCI memory space
> >> and so it's probably not possible to get an abort on that
> >> platform, but this isn't necessarily the case. For instance
> >> the versatilePB's PCI controller only responds to accesses
> >> within its programmed MMIO BAR ranges, so if the device
> >> or the controller have been misconfigured you can get an
> >> abort when the device tries to do DMA. (This usually causes
> >> the device to decide something has gone seriously wrong.
> > Thanks, I am not familiar with versatilePB, I may be able
> > to code it, I don't know how to test it
> 
> Don't worry about testing versatilePB particularly; you
> just need to make sure your code can cope with master
> aborts by device initiated transactions.
> 
> -- PMM

The device in question being the PCI host, right?
Just remove the assumption that it's device 0...
Peter Maydell Sept. 9, 2013, 4:02 p.m. UTC | #27
On 9 September 2013 17:00, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Sep 09, 2013 at 03:58:36PM +0100, Peter Maydell wrote:
>> On 9 September 2013 15:51, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
>> > On Mon, 2013-09-09 at 15:21 +0100, Peter Maydell wrote:
>> >> No, it's perfectly possible for a bus master transaction
>> >> to abort. The PC's host controller happens to be set up so
>> >> that bus master DMA covers the whole of the PCI memory space
>> >> and so it's probably not possible to get an abort on that
>> >> platform, but this isn't necessarily the case. For instance
>> >> the versatilePB's PCI controller only responds to accesses
>> >> within its programmed MMIO BAR ranges, so if the device
>> >> or the controller have been misconfigured you can get an
>> >> abort when the device tries to do DMA. (This usually causes
>> >> the device to decide something has gone seriously wrong.
>> > Thanks, I am not familiar with versatilePB, I may be able
>> > to code it, I don't know how to test it
>>
>> Don't worry about testing versatilePB particularly; you
>> just need to make sure your code can cope with master
>> aborts by device initiated transactions.
>
> Device in question being PCI host right?

No, in the scenario described above the device doing the write
and getting the abort is the EHCI USB controller.

-- PMM
Michael S. Tsirkin Sept. 9, 2013, 4:34 p.m. UTC | #28
On Mon, Sep 09, 2013 at 05:02:15PM +0100, Peter Maydell wrote:
> On 9 September 2013 17:00, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Sep 09, 2013 at 03:58:36PM +0100, Peter Maydell wrote:
> >> On 9 September 2013 15:51, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> >> > On Mon, 2013-09-09 at 15:21 +0100, Peter Maydell wrote:
> >> >> No, it's perfectly possible for a bus master transaction
> >> >> to abort. The PC's host controller happens to be set up so
> >> >> that bus master DMA covers the whole of the PCI memory space
> >> >> and so it's probably not possible to get an abort on that
> >> >> platform, but this isn't necessarily the case. For instance
> >> >> the versatilePB's PCI controller only responds to accesses
> >> >> within its programmed MMIO BAR ranges, so if the device
> >> >> or the controller have been misconfigured you can get an
> >> >> abort when the device tries to do DMA. (This usually causes
> >> >> the device to decide something has gone seriously wrong.
> >> > Thanks, I am not familiar with versatilePB, I may be able
> >> > to code it, I don't know how to test it
> >>
> >> Don't worry about testing versatilePB particularly; you
> >> just need to make sure your code can cope with master
> >> aborts by device initiated transactions.
> >
> > Device in question being PCI host right?
> 
> No, in the scenario described above the device doing the write
> and getting the abort is the EHCI USB controller.
> 
> -- PMM


Well, I think we shouldn't require handling this upfront - these
are very uncommon, typically the result of a driver bug.
OTOH, master aborts from the CPU are common, so a patch fixing
those would be a step in the right direction.
Jan Kiszka Sept. 9, 2013, 4:54 p.m. UTC | #29
On 2013-09-09 18:34, Michael S. Tsirkin wrote:
> On Mon, Sep 09, 2013 at 05:02:15PM +0100, Peter Maydell wrote:
>> On 9 September 2013 17:00, Michael S. Tsirkin <mst@redhat.com> wrote:
>>> On Mon, Sep 09, 2013 at 03:58:36PM +0100, Peter Maydell wrote:
>>>> On 9 September 2013 15:51, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
>>>>> On Mon, 2013-09-09 at 15:21 +0100, Peter Maydell wrote:
>>>>>> No, it's perfectly possible for a bus master transaction
>>>>>> to abort. The PC's host controller happens to be set up so
>>>>>> that bus master DMA covers the whole of the PCI memory space
>>>>>> and so it's probably not possible to get an abort on that
>>>>>> platform, but this isn't necessarily the case. For instance
>>>>>> the versatilePB's PCI controller only responds to accesses
>>>>>> within its programmed MMIO BAR ranges, so if the device
>>>>>> or the controller have been misconfigured you can get an
>>>>>> abort when the device tries to do DMA. (This usually causes
>>>>>> the device to decide something has gone seriously wrong.
>>>>> Thanks, I am not familiar with versatilePB, I may be able
>>>>> to code it, I don't know how to test it
>>>>
>>>> Don't worry about testing versatilePB particularly; you
>>>> just need to make sure your code can cope with master
>>>> aborts by device initiated transactions.
>>>
>>> Device in question being PCI host right?
>>
>> No, in the scenario described above the device doing the write
>> and getting the abort is the EHCI USB controller.
>>
>> -- PMM
> 
> 
> Well I think we shouldn't require handling this upfront - these
> are very uncommon, typically a result of a driver bug.
> OTOH master aborts from CPU are common, so a patch fixing
> that would be a step in the right direction.

DMA requests from one device to another targeting anything other than
RAM-backed regions will have to be rejected by QEMU in the future. We
cannot map this sanely onto a per-device locking model. The filtering will
take place early in the memory core, to avoid any risk of deadlocking.
No idea if reporting them as aborts will be easily feasible then.

Jan
Peter Maydell Sept. 9, 2013, 4:58 p.m. UTC | #30
On 9 September 2013 17:54, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> DMA requests from one device to another targeting anything else but
> RAM-backed regions will have to be rejected by QEMU in the future. We
> cannot map this sanely on a per-device locking model. The filtering will
> take place early in the memory core, to avoid any risk of deadlocking.
> No idea if reporting them as aborts will be easily feasible then.

Why is a DMA request any different from any other communication
between two devices?

-- PMM
Jan Kiszka Sept. 9, 2013, 5:09 p.m. UTC | #31
On 2013-09-09 18:58, Peter Maydell wrote:
> On 9 September 2013 17:54, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> DMA requests from one device to another targeting anything else but
>> RAM-backed regions will have to be rejected by QEMU in the future. We
>> cannot map this sanely on a per-device locking model. The filtering will
>> take place early in the memory core, to avoid any risk of deadlocking.
>> No idea if reporting them as aborts will be easily feasible then.
> 
> Why is a DMA request any different from any other communication
> between two devices?

Other communication between devices that requires taking the target
device's lock while holding the initiator's will be a no-go as
well. But usually these scenarios are clearly defined, not
guest-influenceable, and can be avoided by the initiator. DMA is too
generic. E.g., the guest can easily program a device to "accidentally"
hit another device's MMIO region, creating the precondition for
deadlocks on the host side.

Jan
Peter Maydell Sept. 9, 2013, 5:14 p.m. UTC | #32
On 9 September 2013 18:09, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2013-09-09 18:58, Peter Maydell wrote:
>> Why is a DMA request any different from any other communication
>> between two devices?
>
> Other communication between devices requiring to take the target
> device's lock while holding the one of the initiator will be a no-go as
> well. But usually these scenarios are clearly defined, not
> guest-influenceable and can be avoided by the initiator.

How? If I'm a device and I need to raise a GPIO output line
I have no idea what the other end is connected to. Similarly
for more interesting device-to-device connections than
pure on-or-off signal lines.

> DMA is too
> generic. E.g., the guest can easily program a device to
> "accidentally" hit another device's MMIO region

This is just a special case of generic device-device interaction.

-- PMM
Jan Kiszka Sept. 9, 2013, 5:27 p.m. UTC | #33
On 2013-09-09 19:14, Peter Maydell wrote:
> On 9 September 2013 18:09, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2013-09-09 18:58, Peter Maydell wrote:
>>> Why is a DMA request any different from any other communication
>>> between two devices?
>>
>> Other communication between devices requiring to take the target
>> device's lock while holding the one of the initiator will be a no-go as
>> well. But usually these scenarios are clearly defined, not
>> guest-influenceable and can be avoided by the initiator.
> 
> How? If I'm a device and I need to raise a GPIO output line
> I have no idea what the other end is connected to. Similarly
> for more interesting device-to-device connections than
> pure on-or-off signal lines.

Then you will have to write all devices involved in this in a way that
they preserve a clear locking order or drop locks before triggering such
signals - or stay with this communication completely under the BQL.

> 
>> DMA is too
>> generic. E.g., the guest can easily program a device to
>> "accidentally" hit another device's MMIO region
> 
> This is just a special case of generic device-device interaction.

DMA is dispatched by the memory core which we would like to move out of
the BQL context soon.

But you are right, we will have "fun" with interrupts and maybe some
other performance sensitive inter-device channels as well while breaking
up the BQL. Same rules as above will have to applied there.

Jan
Michael S. Tsirkin Sept. 9, 2013, 5:37 p.m. UTC | #34
On Mon, Sep 09, 2013 at 07:27:48PM +0200, Jan Kiszka wrote:
> On 2013-09-09 19:14, Peter Maydell wrote:
> > On 9 September 2013 18:09, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> >> On 2013-09-09 18:58, Peter Maydell wrote:
> >>> Why is a DMA request any different from any other communication
> >>> between two devices?
> >>
> >> Other communication between devices requiring to take the target
> >> device's lock while holding the one of the initiator will be a no-go as
> >> well. But usually these scenarios are clearly defined, not
> >> guest-influenceable and can be avoided by the initiator.
> > 
> > How? If I'm a device and I need to raise a GPIO output line
> > I have no idea what the other end is connected to. Similarly
> > for more interesting device-to-device connections than
> > pure on-or-off signal lines.
> 
> Then you will have to write all devices involved in this in a way that
> they preserve a clear locking order or drop locks before triggering such
> signals - or stay with this communication completely under the BQL.
> 
> > 
> >> DMA is too
> >> generic. E.g., the guest can easily program a device to
> >> "accidentally" hit another device's MMIO region
> > 
> > This is just a special case of generic device-device interaction.
> 
> DMA is dispatched by the memory core which we would like to move out of
> the BQL context soon.
> 
> But you are right, we will have "fun" with interrupts and maybe some
> other performance sensitive inter-device channels as well while breaking
> up the BQL. Same rules as above will have to applied there.
> 
> Jan

Do we really care about the BQL for emulated devices so much?

The reason I ask: ATM I see no good reason for virtio specifically
to talk to other PCI devices, and since we control
the drivers for virtio we just need to make sure there are no
security issues with malicious guests - no need to
make it really work, trigger master aborts, etc.

So maybe the solution is two APIs: a slower one that
supports device-to-device access, and a fast one that doesn't.

> -- 
> Siemens AG, Corporate Technology, CT RTC ITP SES-DE
> Corporate Competence Center Embedded Linux
Peter Maydell Sept. 9, 2013, 5:41 p.m. UTC | #35
On 9 September 2013 18:27, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2013-09-09 19:14, Peter Maydell wrote:
>> On 9 September 2013 18:09, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> Other communication between devices requiring to take the target
>>> device's lock while holding the one of the initiator will be a no-go as
>>> well. But usually these scenarios are clearly defined, not
>>> guest-influenceable and can be avoided by the initiator.
>>
>> How? If I'm a device and I need to raise a GPIO output line
>> I have no idea what the other end is connected to. Similarly
>> for more interesting device-to-device connections than
>> pure on-or-off signal lines.
>
> Then you will have to write all devices involved in this in a way that
> they preserve a clear locking order or drop locks before triggering such
> signals - or stay with this communication completely under the BQL.

I don't have to do anything -- this already exists and
works fine. If you want to get rid of the big lock
then you need to do something... More to the point,
"all devices involved" is the entire set of QEMU's
device models -- you can't tell what a gpio line is
going to be connected to or restrict it magically to
a subset of devices.

>>> DMA is too
>>> generic. E.g., the guest can easily program a device to
>>> "accidentally" hit another device's MMIO region
>>
>> This is just a special case of generic device-device interaction.
>
> DMA is dispatched by the memory core which we would like to move out of
> the BQL context soon.

I'm not convinced this is sufficient reason to go backwards
on emulation accuracy. You know at flatten-to-address-
space time whether any particular region is going to be
RAM or MMIO, so it should be possible to fast/slowpath
it at that point...
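
For illustration, that distinction is already queryable from the
flattened view (a sketch only; reference counting and the dispatch
plumbing are elided):

static bool dma_target_is_ram(AddressSpace *as, hwaddr addr, uint64_t len)
{
    MemoryRegionSection sec = memory_region_find(as->root, addr, len);

    /* RAM-backed destination -> fast path (plain copy);
     * anything else -> slow path (locked MMIO dispatch) */
    return sec.mr && memory_region_is_ram(sec.mr);
}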

> But you are right, we will have "fun" with interrupts and maybe some
> other performance sensitive inter-device channels as well while breaking
> up the BQL. Same rules as above will have to applied there.

-- PMM
Paolo Bonzini Sept. 9, 2013, 6:03 p.m. UTC | #36
Il 09/09/2013 19:27, Jan Kiszka ha scritto:
> On 2013-09-09 19:14, Peter Maydell wrote:
>> On 9 September 2013 18:09, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> On 2013-09-09 18:58, Peter Maydell wrote:
>>>> Why is a DMA request any different from any other communication
>>>> between two devices?
>>>
>>> Other communication between devices requiring to take the target
>>> device's lock while holding the one of the initiator will be a no-go as
>>> well. But usually these scenarios are clearly defined, not
>>> guest-influenceable and can be avoided by the initiator.
>>
>> How? If I'm a device and I need to raise a GPIO output line
>> I have no idea what the other end is connected to. Similarly
>> for more interesting device-to-device connections than
>> pure on-or-off signal lines.
> 
> Then you will have to write all devices involved in this in a way that
> they preserve a clear locking order or drop locks before triggering such
> signals - or stay with this communication completely under the BQL.

I'm with Peter on this---I'm not sure why DMA-outside-BQL is different
from interrupts-outside-BQL.  If you drop locks before triggering
either, there is no need to forbid DMA between devices.

Yes, it is harder, but I'm not sure why it shouldn't work.

If it is really needed, we could do things such as wait-wound locks that
are used in databases (and in the Linux kernel) to avoid deadlocks.
Databases need to take locks in arbitrary order decided by the query
planner.

Paolo

>>
>>> DMA is too
>>> generic. E.g., the guest can easily program a device to
>>> "accidentally" hit another device's MMIO region
>>
>> This is just a special case of generic device-device interaction.
> 
> DMA is dispatched by the memory core which we would like to move out of
> the BQL context soon.
> 
> But you are right, we will have "fun" with interrupts and maybe some
> other performance sensitive inter-device channels as well while breaking
> up the BQL. Same rules as above will have to applied there.
> 
> Jan
>
Jan Kiszka Sept. 9, 2013, 6:06 p.m. UTC | #37
On 2013-09-09 19:41, Peter Maydell wrote:
> On 9 September 2013 18:27, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2013-09-09 19:14, Peter Maydell wrote:
>>> On 9 September 2013 18:09, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>>> Other communication between devices requiring to take the target
>>>> device's lock while holding the one of the initiator will be a no-go as
>>>> well. But usually these scenarios are clearly defined, not
>>>> guest-influenceable and can be avoided by the initiator.
>>>
>>> How? If I'm a device and I need to raise a GPIO output line
>>> I have no idea what the other end is connected to. Similarly
>>> for more interesting device-to-device connections than
>>> pure on-or-off signal lines.
>>
>> Then you will have to write all devices involved in this in a way that
>> they preserve a clear locking order or drop locks before triggering such
>> signals - or stay with this communication completely under the BQL.
> 
> I don't have to do anything -- this already exists and
> works fine. If you want to get rid of the big lock
> then you need to do something... More to the point,
> "all devices involved" is the entire set of QEMU's
> device models -- you can't tell what a gpio line is
> going to be connected to or restrict it magically to
> a subset of devices.

We need to do something about specific communication channels, where we
do know who can talk to whom - e.g. interrupts. But you can't address
this topic generically. Device models interested in BQL-free (or
reduced) operation will have to be changed. On a case-by-case basis.

> 
>>>> DMA is too
>>>> generic. E.g., the guest can easily program a device to
>>>> "accidentally" hit another device's MMIO region
>>>
>>> This is just a special case of generic device-device interaction.
>>
>> DMA is dispatched by the memory core which we would like to move out of
>> the BQL context soon.
> 
> I'm not convinced this is sufficient reason to go backwards
> on emulation accuracy. You know at flatten-to-address-
> space time whether any particular region is going to be
> to RAM or MMIO, so it should be possible to fast/slowpath
> it at that point...

You also have to know the source in order to tell if a transaction can
be safely forwarded because either the BQL takes care of it or the
initiator is holding no locks. This has nothing to do with fast/slow;
this is about the risk of deadlocking the QEMU process upon guest request.

BTW, this was discussed earlier on the list a few times (need to dig the
archive, was in the context of lock-less MMIO dispatching), and the
consensus back then was that device-to-device DMA is generally a bug
that is not worth supporting in all its beauty. But if you know a
concrete scenario / guest where it matters, that would bring in a new
aspect.

Jan
Paolo Bonzini Sept. 9, 2013, 6:11 p.m. UTC | #38
Il 09/09/2013 20:06, Jan Kiszka ha scritto:
> archive, was in the context of lock-less MMIO dispatching), and the
> consensus back then was that device-to-device DMA is generally a bug
> that is not worth supporting in all its beauty. But if you know a
> concrete scenario / guest where it matters, that would bring in a new
> aspect.

Well, one I know is

10 SCREEN 1
20 COLOR 1: CIRCLE (160, 100), 80
30 PRINT "Hello QEMU!"
40 DEF SEG=&hB800
50 BSAVE "FOO.PIC", 16000
RUN

Not sure if that counts as something that matters, or even if it works
right now with virtio-blk.

Paolo
Jan Kiszka Sept. 9, 2013, 6:49 p.m. UTC | #39
On 2013-09-09 20:03, Paolo Bonzini wrote:
> Il 09/09/2013 19:27, Jan Kiszka ha scritto:
>> On 2013-09-09 19:14, Peter Maydell wrote:
>>> On 9 September 2013 18:09, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>>> On 2013-09-09 18:58, Peter Maydell wrote:
>>>>> Why is a DMA request any different from any other communication
>>>>> between two devices?
>>>>
>>>> Other communication between devices requiring to take the target
>>>> device's lock while holding the one of the initiator will be a no-go as
>>>> well. But usually these scenarios are clearly defined, not
>>>> guest-influenceable and can be avoided by the initiator.
>>>
>>> How? If I'm a device and I need to raise a GPIO output line
>>> I have no idea what the other end is connected to. Similarly
>>> for more interesting device-to-device connections than
>>> pure on-or-off signal lines.
>>
>> Then you will have to write all devices involved in this in a way that
>> they preserve a clear locking order or drop locks before triggering such
>> signals - or stay with this communication completely under the BQL.
> 
> I'm with Peter on this---I'm not sure why DMA-outside-BQL is different
> from interrupts-outside-BQL.  If you drop locks before triggering
> either, there is no need to forbid DMA between devices.
> 
> Yes, it is harder, but I'm not sure why it shouldn't work.

Well, even if you resolve the locking issues in all the interesting
devices (not impossible, just pretty costly in several regards), you
cannot reasonably allow device A talking to device B to trigger a
request on A that issues a command to B... in the general case. If such
recursions are programmable, we need to stop them before QEMU's stack
explodes.

Interrupts do not have the potential to cause this, at least with
existing machines. If a guest can configure GPIO loops between device
models on some machine, this likely has to be addressed as well.

> 
> If it is really needed, we could do things such as wait-wound locks that
> are used in databases (and in the Linux kernel) to avoid deadlocks.
> Databases need to take locks in arbitrary order decided by the query
> planner.

Please, no. Such lock semantics make it very hard - if not impossible -
to apply priority inversion avoidance protocols. Not to speak of the
massive changes to the code base to implement safe rollback. Just
because one domain can benefit from it doesn't make it a generally
useful tool.

Jan
Peter Maydell Sept. 9, 2013, 6:59 p.m. UTC | #40
On 9 September 2013 19:49, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> Well, even if you resolve the locking issues in all the interesting
> devices (not impossible, just pretty costly in several regards), you
> cannot reasonably allow device A talking to device B triggering a
> request on A issuing a command to B... in the general case. If such
> recursions are programmable, we need to stop them before QEMU's stack
> explodes.

Typically on real hardware configuring things this way causes
either (a) a memory transaction abort or (b) a deadlock. I
think we could reasonably model that by deadlocking our
device model, though as you say we should avoid actually
crashing :-)

-- PMM
Jan Kiszka Sept. 9, 2013, 7:04 p.m. UTC | #41
On 2013-09-09 20:59, Peter Maydell wrote:
> On 9 September 2013 19:49, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> Well, even if you resolve the locking issues in all the interesting
>> devices (not impossible, just pretty costly in several regards), you
>> cannot reasonably allow device A talking to device B triggering a
>> request on A issuing a command to B... in the general case. If such
>> recursions are programmable, we need to stop them before QEMU's stack
>> explodes.
> 
> Typically on real hardware configuring things this way causes
> either (a) a memory transaction abort or (b) a deadlock. I
> think we could reasonably model that by deadlocking our
> device model, though as you say we should avoid actually
> crashing :-)

If the deadlock makes the QEMU process unresponsive for management
software that tries to reset the machine, or emulated hardware watchdogs...

Jan
Michael S. Tsirkin Sept. 9, 2013, 7:27 p.m. UTC | #42
On Mon, Sep 09, 2013 at 07:59:06PM +0100, Peter Maydell wrote:
> On 9 September 2013 19:49, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> > Well, even if you resolve the locking issues in all the interesting
> > devices (not impossible, just pretty costly in several regards), you
> > cannot reasonably allow device A talking to device B triggering a
> > request on A issuing a command to B... in the general case. If such
> > recursions are programmable, we need to stop them before QEMU's stack
> > explodes.
> 
> Typically on real hardware configuring things this way causes
> either (a) a memory transaction abort or (b) a deadlock. I
> think we could reasonably model that by deadlocking our
> device model, though as you say we should avoid actually
> crashing :-)
> 
> -- PMM

That's not really true.  The PCI spec says:

The target and master state machines in the PCI interface of a simple
device are completely independent. A device cannot make the completion
of any transaction (either posted or non-posted) as a target contingent
upon the prior completion of any other transaction as a master.


But it is certainly legal for a device to complete
a transaction and then start another transaction
in response. There's no reason this should deadlock
on real hardware if implemented according to the
above spec rule.

The spec-compliant way to emulate this therefore would be
to do exactly as the spec says, and make handling of incoming
IO requests and outgoing DMA independent of each other.
Locking issues would then be solved automatically.
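
In QEMU terms, one way to get that independence is to complete the
target-side access immediately and run the master-side DMA from a
bottom half. A sketch (MyDevState and its fields are invented;
qemu_bh_schedule() and pci_dma_write() are the real APIs):

static void mydev_mmio_write(void *opaque, hwaddr addr,
                             uint64_t val, unsigned size)
{
    MyDevState *s = opaque;

    /* Complete the target-side transaction now; only record
     * the master-side work... */
    s->dma_addr = val;
    qemu_bh_schedule(s->dma_bh);
}

static void mydev_dma_bh(void *opaque)
{
    MyDevState *s = opaque;

    /* ...and do the DMA later, outside the MMIO dispatch path. */
    pci_dma_write(&s->pci_dev, s->dma_addr, s->buf, sizeof(s->buf));
}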
Michael S. Tsirkin Sept. 9, 2013, 7:31 p.m. UTC | #43
On Mon, Sep 09, 2013 at 08:49:53PM +0200, Jan Kiszka wrote:
> On 2013-09-09 20:03, Paolo Bonzini wrote:
> > Il 09/09/2013 19:27, Jan Kiszka ha scritto:
> >> On 2013-09-09 19:14, Peter Maydell wrote:
> >>> On 9 September 2013 18:09, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> >>>> On 2013-09-09 18:58, Peter Maydell wrote:
> >>>>> Why is a DMA request any different from any other communication
> >>>>> between two devices?
> >>>>
> >>>> Other communication between devices requiring to take the target
> >>>> device's lock while holding the one of the initiator will be a no-go as
> >>>> well. But usually these scenarios are clearly defined, not
> >>>> guest-influenceable and can be avoided by the initiator.
> >>>
> >>> How? If I'm a device and I need to raise a GPIO output line
> >>> I have no idea what the other end is connected to. Similarly
> >>> for more interesting device-to-device connections than
> >>> pure on-or-off signal lines.
> >>
> >> Then you will have to write all devices involved in this in a way that
> >> they preserve a clear locking order or drop locks before triggering such
> >> signals - or stay with this communication completely under the BQL.
> > 
> > I'm with Peter on this---I'm not sure why DMA-outside-BQL is different
> > from interrupts-outside-BQL.  If you drop locks before triggering
> > either, there is no need to forbid DMA between devices.
> > 
> > Yes, it is harder, but I'm not sure why it shouldn't work.
> 
> Well, even if you resolve the locking issues in all the interesting
> devices (not impossible, just pretty costly in several regards), you
> cannot reasonably allow device A talking to device B triggering a
> request on A issuing a command to B... in the general case. If such
> recursions are programmable, we need to stop them before QEMU's stack
> explodes.

Actually, in PCI the spec explicitly outlaws hardware
that blocks an incoming request because an outgoing
one is pending.

So I don't think one can get away with doing DMA directly
from a memory op and still claim strict PCI spec compliance.

> Interrupts do not have the potential to cause this, at least with
> existing machines. If a guest can configure GPIO loops between devices
> models on some machine, this likely has to be addressed as well.
> 
> > 
> > If it is really needed, we could do things such as wait-wound locks that
> > are used in databases (and in the Linux kernel) to avoid deadlocks.
> > Databases need to take locks in arbitrary order decided by the query
> > planner.
> 
> Please not. Such lock semantics make it very hard - if not impossible -
> to apply priority inversion avoidance protocols. Not to speak of the
> massive changes on the code base to implement safe rollback. Just
> because one domain can benefit from it doesn't make it a generally
> useful tool.
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT RTC ITP SES-DE
> Corporate Competence Center Embedded Linux
Michael S. Tsirkin Sept. 9, 2013, 7:35 p.m. UTC | #44
On Mon, Sep 09, 2013 at 08:11:25PM +0200, Paolo Bonzini wrote:
> Il 09/09/2013 20:06, Jan Kiszka ha scritto:
> > archive, was in the context of lock-less MMIO dispatching), and the
> > consensus back then was that device-to-device DMA is generally a bug
> > that is not worth supporting in all its beauty. But if you know a
> > concrete scenario / guest where it matters, that would bring in a new
> > aspect.
> 
> Well, one I know is
> 
> 10 SCREEN 1
> 20 COLOR 1: CIRCLE (160, 100), 80
> 30 PRINT "Hello QEMU!"
> 40 DEF SEG=&hB800
> 50 BSAVE "FOO.PIC", 16000
> RUN
> 
> Not sure if that counts as something that matters, or even if it works
> right now with virtio-blk.
> 
> Paolo

I don't think it matters for virtio - it's a PV device and
we can reasonably limit what works and what doesn't work there.
It makes some sense for assigned devices (I think VFIO
supports this to some extent) and we might at some
point start emulating devices that do this.

But mostly it's a driver bug, and all we need to do
is make sure it doesn't cause security issues.
Michael S. Tsirkin Sept. 10, 2013, 12:39 p.m. UTC | #45
On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
> On 9 September 2013 14:07, Marcel Apfelbaum <marcel.a@redhat.com> wrote:
> > This is exactly my point. ALL devices on the bus can be masters
> > of a DMA transaction. So adding an interface as suggested by
> > Michael: pci_set_master_for_master_abort(PCIBus *, PCIDevice *)
> > for the general case (a device doing DMA) it is too far from reality.
> 
> Actually I don't think it would be too painful.
> At the moment in do_pci_register_device() we do this to
> create the memory region used by a device for its bus
> master transactions:
> 
>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
>                              OBJECT(pci_dev), "bus master",
>                              dma_as->root, 0,
>                              memory_region_size(dma_as->root));
> 
> If instead of using this alias directly as the
> bus_master_enable region you instead:
>  * create a container region
>  * create a 'background' region at negative priority
>    (ie one per device, and you can make the 'opaque' pointer
>    point to the device, not the bus)
>  * put the alias and the background region into the container
>  * use the container as the bus_master_enable region

Interesting. There's one thing I don't understand here:
as far as I can see bus_master_enable_region covers the
whole 64 bit memory address space.

It looks like it will always override the background
region in the same container. What did I miss?

> then you will get in your callback a pointer to the
> device which caused the abort. You can then have your
> callback call a method defined on PCIDevice which we
> implement:
>  * as do-nothing in the PCI device base class
>  * as set-the-master-abort bit in the PCI host bridge
>    class
> (and anybody who wants to get fancy about handling aborts
> can override it in their own device implementation)
> 
> That seems achievable without really requiring new
> infrastructure. Have I missed something that wouldn't
> work if we did this?
> 
> thanks
> -- PMM

Actually, I think a base class would have to set the received master abort
bit in the status register.
And it's not even that simple: memory writes are completed by a P2P
bridge, so I think it has to set a bit in the primary status register of
the bridge and not for the device (though I'm speaking from memory;
need to check the spec).
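
Putting those two remarks together, a sketch of such a hook (the
function names and the wiring are invented; only the config-space
helpers and register offsets are real):

static void pci_dev_report_master_abort(PCIDevice *d)
{
    /* base behaviour: record the abort in the device's own status */
    pci_word_test_and_set_mask(d->config + PCI_STATUS,
                               PCI_STATUS_REC_MASTER_ABORT);
}

static void pci_bridge_report_master_abort(PCIDevice *d, bool secondary)
{
    /* A P2P bridge records it on the side where it acted as master:
     * PCI_STATUS for the primary bus, PCI_SEC_STATUS for the secondary. */
    uint8_t *status = d->config + (secondary ? PCI_SEC_STATUS : PCI_STATUS);

    pci_word_test_and_set_mask(status, PCI_STATUS_REC_MASTER_ABORT);
}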
Peter Maydell Sept. 10, 2013, 12:50 p.m. UTC | #46
On 10 September 2013 13:39, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
>>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
>>                              OBJECT(pci_dev), "bus master",
>>                              dma_as->root, 0,
>>                              memory_region_size(dma_as->root));
>>
>> If instead of using this alias directly as the
>> bus_master_enable region you instead:
>>  * create a container region
>>  * create a 'background' region at negative priority
>>    (ie one per device, and you can make the 'opaque' pointer
>>    point to the device, not the bus)
>>  * put the alias and the background region into the container
>>  * use the container as the bus_master_enable region
>
> Interesting. There's one thing I don't understand here:
> as far as I can see bus_master_enable_region covers the
> whole 64 bit memory address space.
>
> It looks like it will always override the background
> region in the same container. What did I miss?

That should itself be a container, so assuming it doesn't
itself have any kind of background region, the "holes"
inside it will still be present when we put it in
our new container. (Basically putting a container,
or an alias to one, inside a region is just saying
"put everything in that container inside this region
at the appropriate place").

>> then you will get in your callback a pointer to the
>> device which caused the abort. You can then have your
>> callback call a method defined on PCIDevice which we
>> implement:
>>  * as do-nothing in the PCI device base class
>>  * as set-the-master-abort bit in the PCI host bridge
>>    class
>> (and anybody who wants to get fancy about handling aborts
>> can override it in their own device implementation)
>>
>> That seems achievable without really requiring new
>> infrastructure. Have I missed something that wouldn't
>> work if we did this?

> Actually, I think a base class would have to set received master abort
> bit in the status register.
> And it's not even that simple: memory writes are completed by a P2P
> bridge so I think it has to set a bit in the primary status register for
> the bridge and not for the device (though I'm speaking from memory,
> need to check the spec).

Yes, I didn't really work through how bridges might
need to be handled. Hopefully we can come up with
a neat trick for those too :-)

-- PMM
Michael S. Tsirkin Sept. 10, 2013, 1:02 p.m. UTC | #47
On Tue, Sep 10, 2013 at 01:50:47PM +0100, Peter Maydell wrote:
> On 10 September 2013 13:39, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
> >>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
> >>                              OBJECT(pci_dev), "bus master",
> >>                              dma_as->root, 0,
> >>                              memory_region_size(dma_as->root));
> >>
> >> If instead of using this alias directly as the
> >> bus_master_enable region you instead:
> >>  * create a container region
> >>  * create a 'background' region at negative priority
> >>    (ie one per device, and you can make the 'opaque' pointer
> >>    point to the device, not the bus)
> >>  * put the alias and the background region into the container
> >>  * use the container as the bus_master_enable region
> >
> > Interesting. There's one thing I don't understand here:
> > as far as I can see bus_master_enable_region covers the
> > whole 64 bit memory address space.
> >
> > It looks like it will always override the background
> > region in the same container. What did I miss?
> 
> That should be itself a container,
> so assuming it doesn't
> itself have any kind of background region the "holes"
> inside it will still be present when we put it in
> our new container. (Basically putting a container,
> or an alias to one, inside a region is just saying
> "put everything in that container inside this region
> at the appropriate place").

Confused.  "That" and "it" here refer to what exactly?

> >> then you will get in your callback a pointer to the
> >> device which caused the abort. You can then have your
> >> callback call a method defined on PCIDevice which we
> >> implement:
> >>  * as do-nothing in the PCI device base class
> >>  * as set-the-master-abort bit in the PCI host bridge
> >>    class
> >> (and anybody who wants to get fancy about handling aborts
> >> can override it in their own device implementation)
> >>
> >> That seems achievable without really requiring new
> >> infrastructure. Have I missed something that wouldn't
> >> work if we did this?
> 
> > Actually, I think a base class would have to set received master abort
> > bit in the status register.
> > And it's not even that simple: memory writes are completed by a P2P
> > bridge so I think it has to set a bit in the primary status register for
> > the bridge and not for the device (though I'm speaking from memory,
> > need to check the spec).
> 
> Yes, I didn't really work through how bridges might
> need to be handled. Hopefully we can come up with
> a neat trick for those too :-)
> 
> -- PMM
Peter Maydell Sept. 10, 2013, 1:12 p.m. UTC | #48
On 10 September 2013 14:02, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Tue, Sep 10, 2013 at 01:50:47PM +0100, Peter Maydell wrote:
>> On 10 September 2013 13:39, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
>> >>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
>> >>                              OBJECT(pci_dev), "bus master",
>> >>                              dma_as->root, 0,
>> >>                              memory_region_size(dma_as->root));
>> >>
>> >> If instead of using this alias directly as the
>> >> bus_master_enable region you instead:
>> >>  * create a container region
>> >>  * create a 'background' region at negative priority
>> >>    (ie one per device, and you can make the 'opaque' pointer
>> >>    point to the device, not the bus)
>> >>  * put the alias and the background region into the container
>> >>  * use the container as the bus_master_enable region
>> >
>> > Interesting. There's one thing I don't understand here:
>> > as far as I can see bus_master_enable_region covers the
>> > whole 64 bit memory address space.
>> >
>> > It looks like it will always override the background
>> > region in the same container. What did I miss?
>>
>> That should be itself a container,
>> so assuming it doesn't
>> itself have any kind of background region the "holes"
>> inside it will still be present when we put it in
>> our new container. (Basically putting a container,
>> or an alias to one, inside a region is just saying
>> "put everything in that container inside this region
>> at the appropriate place").
>
> Confused.  "That" and "it" here refer to what exactly?

Well, I was a bit confused by your talking about
the properties of "bus_master_enable_region" when my
suggestion is effectively that we change what that is.
So let's start again:
 * create a container region
This is 64 bits wide, but totally empty
 * create a 'background' region at negative priority
64 bits wide
 * put the alias and the background region into the container
The alias is 64 bits wide too, but it is an alias of
dma_as->root, which is a container with no background
region.
 * use the container as the bus_master_enable region
 -- all you see in this container is the background region
and anything that was in dma_as->root.

So when I said "that" and "it" I meant dma_as->root.

Hope that is a little less opaque.
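In code the shape would be roughly this (just a sketch: the
bus_master_container/bus_master_abort fields and master_abort_ops
are made-up names for illustration, they don't exist today):

    /* 1. A 64-bit wide container, initially empty. */
    memory_region_init(&pci_dev->bus_master_container, OBJECT(pci_dev),
                       "bus master container", UINT64_MAX);

    /* 2. The background region; opaque points at the device, so the
     *    handler knows which master initiated the access. */
    memory_region_init_io(&pci_dev->bus_master_abort, OBJECT(pci_dev),
                          &master_abort_ops, pci_dev,
                          "bus master abort", UINT64_MAX);

    /* 3. The existing alias of dma_as->root, unchanged. */
    memory_region_init_alias(&pci_dev->bus_master_enable_region,
                             OBJECT(pci_dev), "bus master",
                             dma_as->root, 0,
                             memory_region_size(dma_as->root));

    /* 4. Alias at priority 0, background below it at -1. */
    memory_region_add_subregion_overlap(&pci_dev->bus_master_container, 0,
                                        &pci_dev->bus_master_enable_region,
                                        0);
    memory_region_add_subregion_overlap(&pci_dev->bus_master_container, 0,
                                        &pci_dev->bus_master_abort, -1);

    /* 5. Then use bus_master_container, not the bare alias, as the
     *    root of the device's DMA address space. */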

-- PMM
Michael S. Tsirkin Sept. 10, 2013, 2:11 p.m. UTC | #49
On Tue, Sep 10, 2013 at 02:12:56PM +0100, Peter Maydell wrote:
> On 10 September 2013 14:02, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Tue, Sep 10, 2013 at 01:50:47PM +0100, Peter Maydell wrote:
> >> On 10 September 2013 13:39, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> > On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
> >> >>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
> >> >>                              OBJECT(pci_dev), "bus master",
> >> >>                              dma_as->root, 0,
> >> >>                              memory_region_size(dma_as->root));
> >> >>
> >> >> If instead of using this alias directly as the
> >> >> bus_master_enable region you instead:
> >> >>  * create a container region
> >> >>  * create a 'background' region at negative priority
> >> >>    (ie one per device, and you can make the 'opaque' pointer
> >> >>    point to the device, not the bus)
> >> >>  * put the alias and the background region into the container
> >> >>  * use the container as the bus_master_enable region
> >> >
> >> > Interesting. There's one thing I don't understand here:
> >> > as far as I can see bus_master_enable_region covers the
> >> > whole 64 bit memory address space.
> >> >
> >> > It looks like it will always override the background
> >> > region in the same container. What did I miss?
> >>
> >> That should be itself a container,
> >> so assuming it doesn't
> >> itself have any kind of background region the "holes"
> >> inside it will still be present when we put it in
> >> our new container. (Basically putting a container,
> >> or an alias to one, inside a region is just saying
> >> "put everything in that container inside this region
> >> at the appropriate place").
> >
> > Confused.  "That" and "it" here refer to what exactly?
> 
> Well, I was a bit confused by your talking about
> the properties of "bus_master_enable_region" when my
> suggestion is effectively that we change what that is.
> So let's start again:
>  * create a container region
> This is 64 bits wide, but totally empty
>  * create a 'background' region at negative priority
> 64 bits wide
>  * put the alias and the background region into the container
> The alias is 64 bits wide too, but it is an alias of
> dma_as->root, which is a container with no background
> region.
>  * use the container as the bus_master_enable region
>  -- all you see in this container is the background region
> and anything that was in dma_as->root.
> 
> So when I said "that" and "it" I meant dma_as->root.
> 
> Hope that is a little less opaque.
> 
> -- PMM

Aha. I think I begin to understand.
So it looks like that'll work for the root bus.
For the bridge, however, we have this inverse decoding
of transactions: anything that falls within the bridge windows
behaves like the root bus and sets the secondary status bit.

Anything outside the window behaves as if the bridge is
the originator (for error handling purposes - e.g.
for the IOMMU you must track the original device).
Michael S. Tsirkin Sept. 15, 2013, 7:14 a.m. UTC | #50
On Tue, Sep 10, 2013 at 02:12:56PM +0100, Peter Maydell wrote:
> On 10 September 2013 14:02, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Tue, Sep 10, 2013 at 01:50:47PM +0100, Peter Maydell wrote:
> >> On 10 September 2013 13:39, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> > On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
> >> >>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
> >> >>                              OBJECT(pci_dev), "bus master",
> >> >>                              dma_as->root, 0,
> >> >>                              memory_region_size(dma_as->root));
> >> >>
> >> >> If instead of using this alias directly as the
> >> >> bus_master_enable region you instead:
> >> >>  * create a container region
> >> >>  * create a 'background' region at negative priority
> >> >>    (ie one per device, and you can make the 'opaque' pointer
> >> >>    point to the device, not the bus)
> >> >>  * put the alias and the background region into the container
> >> >>  * use the container as the bus_master_enable region
> >> >
> >> > Interesting. There's one thing I don't understand here:
> >> > as far as I can see bus_master_enable_region covers the
> >> > whole 64 bit memory address space.
> >> >
> >> > It looks like it will always override the background
> >> > region in the same container. What did I miss?
> >>
> >> That should be itself a container,
> >> so assuming it doesn't
> >> itself have any kind of background region the "holes"
> >> inside it will still be present when we put it in
> >> our new container. (Basically putting a container,
> >> or an alias to one, inside a region is just saying
> >> "put everything in that container inside this region
> >> at the appropriate place").
> >
> > Confused.  "That" and "it" here refer to what exactly?
> 
> Well, I was a bit confused by your talking about
> the properties of "bus_master_enable_region" when my
> suggestion is effectively that we change what that is.
> So let's start again:
>  * create a container region
> This is 64 bits wide, but totally empty
>  * create a 'background' region at negative priority
> 64 bits wide
>  * put the alias and the background region into the container
> The alias is 64 bits wide too, but it is an alias of
> dma_as->root, which is a container with no background
> region.
>  * use the container as the bus_master_enable region
>  -- all you see in this container is the background region
> and anything that was in dma_as->root.

This confused me even more.
dma root covers the whole 64 bits, doesn't it?

The doc says:
"This is done with memory_region_add_subregion_overlap(), which
allows the region to overlap any other region in the same container, and
specifies a priority that allows the core to decide which of two regions
at
the same address are visible (highest wins)."

So if I create an alias that also covers the whole 64 bits,
and a background region in the same container
with a negative priority,
won't the alias always win?

> So when I said "that" and "it" I meant dma_as->root.
> 
> Hope that is a little less opaque.
> 
> -- PMM
Marcel Apfelbaum Sept. 15, 2013, 9:29 a.m. UTC | #51
On Tue, 2013-09-10 at 14:12 +0100, Peter Maydell wrote:
> On 10 September 2013 14:02, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Tue, Sep 10, 2013 at 01:50:47PM +0100, Peter Maydell wrote:
> >> On 10 September 2013 13:39, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> > On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
> >> >>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
> >> >>                              OBJECT(pci_dev), "bus master",
> >> >>                              dma_as->root, 0,
> >> >>                              memory_region_size(dma_as->root));
> >> >>
> >> >> If instead of using this alias directly as the
> >> >> bus_master_enable region you instead:
> >> >>  * create a container region
> >> >>  * create a 'background' region at negative priority
> >> >>    (ie one per device, and you can make the 'opaque' pointer
> >> >>    point to the device, not the bus)
> >> >>  * put the alias and the background region into the container
> >> >>  * use the container as the bus_master_enable region
> >> >
> >> > Interesting. There's one thing I don't understand here:
> >> > as far as I can see bus_master_enable_region covers the
> >> > whole 64 bit memory address space.
> >> >
> >> > It looks like it will always override the background
> >> > region in the same container. What did I miss?
> >>
> >> That should be itself a container,
> >> so assuming it doesn't
> >> itself have any kind of background region the "holes"
> >> inside it will still be present when we put it in
> >> our new container. (Basically putting a container,
> >> or an alias to one, inside a region is just saying
> >> "put everything in that container inside this region
> >> at the appropriate place").
> >
> > Confused.  "That" and "it" here refer to what exactly?
> 
> Well, I was a bit confused by your talking about
> the properties of "bus_master_enable_region" when my
> suggestion is effectively that we change what that is.
> So let's start again:
>  * create a container region
> This is 64 bits wide, but totally empty
>  * create a 'background' region at negative priority
> 64 bits wide
>  * put the alias and the background region into the container
> The alias is 64 bits wide too, but it is an alias of
> dma_as->root, which is a container with no background
> region.
Sadly, the priority is a local feature (per container),
so it cannot be used "cross container".
We need to find another way to leverage the fact that each device
has its own address space as the target of transactions.
In the meantime I will submit another version that handles
only downstream transactions.
When we find a way to do it I will submit another series.
Thanks for the help,
Marcel

>  * use the container as the bus_master_enable region
>  -- all you see in this container is the background region
> and anything that was in dma_as->root.
> 
> So when I said "that" and "it" I meant dma_as->root.
> 
> Hope that is a little less opaque.
> 
> -- PMM
Peter Maydell Sept. 15, 2013, 10:56 a.m. UTC | #52
On 15 September 2013 08:14, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Tue, Sep 10, 2013 at 02:12:56PM +0100, Peter Maydell wrote:
>> On 10 September 2013 14:02, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > On Tue, Sep 10, 2013 at 01:50:47PM +0100, Peter Maydell wrote:
>> >> On 10 September 2013 13:39, Michael S. Tsirkin <mst@redhat.com> wrote:
>> >> > On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
>> >> >>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
>> >> >>                              OBJECT(pci_dev), "bus master",
>> >> >>                              dma_as->root, 0,
>> >> >>                              memory_region_size(dma_as->root));
>> >> >>
>> >> >> If instead of using this alias directly as the
>> >> >> bus_master_enable region you instead:
>> >> >>  * create a container region
>> >> >>  * create a 'background' region at negative priority
>> >> >>    (ie one per device, and you can make the 'opaque' pointer
>> >> >>    point to the device, not the bus)
>> >> >>  * put the alias and the background region into the container
>> >> >>  * use the container as the bus_master_enable region
>> >> >
>> >> > Interesting. There's one thing I don't understand here:
>> >> > as far as I can see bus_master_enable_region covers the
>> >> > whole 64 bit memory address space.
>> >> >
>> >> > It looks like it will always override the background
>> >> > region in the same container. What did I miss?
>> >>
>> >> That should be itself a container,
>> >> so assuming it doesn't
>> >> itself have any kind of background region the "holes"
>> >> inside it will still be present when we put it in
>> >> our new container. (Basically putting a container,
>> >> or an alias to one, inside a region is just saying
>> >> "put everything in that container inside this region
>> >> at the appropriate place").
>> >
>> > Confused.  "That" and "it" here refer to what exactly?
>>
>> Well, I was a bit confused by your talking about
>> the properties of "bus_master_enable_region" when my
>> suggestion is effectively that we change what that is.
>> So let's start again:
>>  * create a container region
>> This is 64 bits wide, but totally empty
>>  * create a 'background' region at negative priority
>> 64 bits wide
>>  * put the alias and the background region into the container
>> The alias is 64 bits wide too, but it is an alias of
>> dma_as->root, which is a container with no background
>> region.
>>  * use the container as the bus_master_enable region
>>  -- all you see in this container is the background region
>> and anything that was in dma_as->root.
>
> This confused me even more.
> dma root covers the whole 64 bits, doesn't it?

It has a size of 64 bits but it doesn't necessarily have
actual memory subregions with I/O operations covering the
whole contiguous range from 0 to 2^64-1 (that would only
happen if the things responding to DMA cover the whole
64-bit address space, which corresponds to the situation in hardware
where something will respond to the bus master transaction
for any address. The hardware still has the "if nothing
responds do this" capability, it just never gets used.)

The case where the DMA root is an IOMMU could be
a problem though, because address_space_translate()
hardcodes io_mem_unassigned as the fallback if the
IOMMU says "no, can't write here".

> The doc says:
> "This is done with memory_region_add_subregion_overlap(), which
> allows the region to overlap any other region in the same container, and
> specifies a priority that allows the core to decide which of two regions
> at
> the same address are visible (highest wins)."
>
> So if I create an alias that also covers the whole 64 bits,
> and a background region in the same container
> with a negative priority,
> won't the alias always win?

The alias will win for the addresses it handles. But if
the alias is a container with "holes" then it doesn't handle
the "holes" and the lower priority background region will
get them.

-- PMM
Michael S. Tsirkin Sept. 15, 2013, 11:05 a.m. UTC | #53
On Sun, Sep 15, 2013 at 11:56:40AM +0100, Peter Maydell wrote:
> On 15 September 2013 08:14, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Tue, Sep 10, 2013 at 02:12:56PM +0100, Peter Maydell wrote:
> >> On 10 September 2013 14:02, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> > On Tue, Sep 10, 2013 at 01:50:47PM +0100, Peter Maydell wrote:
> >> >> On 10 September 2013 13:39, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> >> > On Mon, Sep 09, 2013 at 02:16:41PM +0100, Peter Maydell wrote:
> >> >> >>     memory_region_init_alias(&pci_dev->bus_master_enable_region,
> >> >> >>                              OBJECT(pci_dev), "bus master",
> >> >> >>                              dma_as->root, 0,
> >> >> >>                              memory_region_size(dma_as->root));
> >> >> >>
> >> >> >> If instead of using this alias directly as the
> >> >> >> bus_master_enable region you instead:
> >> >> >>  * create a container region
> >> >> >>  * create a 'background' region at negative priority
> >> >> >>    (ie one per device, and you can make the 'opaque' pointer
> >> >> >>    point to the device, not the bus)
> >> >> >>  * put the alias and the background region into the container
> >> >> >>  * use the container as the bus_master_enable region
> >> >> >
> >> >> > Interesting. There's one thing I don't understand here:
> >> >> > as far as I can see bus_master_enable_region covers the
> >> >> > whole 64 bit memory address space.
> >> >> >
> >> >> > It looks like it will always override the background
> >> >> > region in the same container. What did I miss?
> >> >>
> >> >> That should be itself a container,
> >> >> so assuming it doesn't
> >> >> itself have any kind of background region the "holes"
> >> >> inside it will still be present when we put it in
> >> >> our new container. (Basically putting a container,
> >> >> or an alias to one, inside a region is just saying
> >> >> "put everything in that container inside this region
> >> >> at the appropriate place").
> >> >
> >> > Confused.  "That" and "it" here refer to what exactly?
> >>
> >> Well, I was a bit confused by your talking about
> >> the properties of "bus_master_enable_region" when my
> >> suggestion is effectively that we change what that is.
> >> So let's start again:
> >>  * create a container region
> >> This is 64 bits wide, but totally empty
> >>  * create a 'background' region at negative priority
> >> 64 bits wide
> >>  * put the alias and the background region into the container
> >> The alias is 64 bits wide too, but it is an alias of
> >> dma_as->root, which is a container with no background
> >> region.
> >>  * use the container as the bus_master_enable region
> >>  -- all you see in this container is the background region
> >> and anything that was in dma_as->root.
> >
> > This confused me even more.
> > dma root covers the whole 64 bits, doesn't it?
> 
> It has a size of 64 bits but it doesn't necessarily have
> actual memory subregions with I/O operations covering the
> whole contiguous range from 0 to 2^64-1
> (that would only
> happen if the things responding to DMA cover the whole
> 64-bit address space, which corresponds to the situation in hardware
> where something will respond to the bus master transaction
> for any address. The hardware still has the "if nothing
> responds do this" capability, it just never gets used.)
> 
> The case where the DMA root is an IOMMU could be
> a problem though, because address_space_translate()
> hardcodes io_mem_unassigned as the fallback if the
> IOMMU says "no, can't write here".
> 
> > The doc says:
> > "This is done with memory_region_add_subregion_overlap(), which
> > allows the region to overlap any other region in the same container, and
> > specifies a priority that allows the core to decide which of two regions
> > at
> > the same address are visible (highest wins)."
> >
> > So if I create an alias that also covers the whole 64 bits,
> > and a background region in the same container
> > with a negative priority,
> > won't the alias always win?
> 
> The alias will win for the addresses it handles. But if
> the alias is a container with "holes" then it doesn't handle
> the "holes" and the lower priority background region will
> get them.
> 
> -- PMM

Confused. How can there be a container with holes?
I thought the only way to create non-contiguous
configurations was by creating multiple aliases?


Imagine this configuration:

region B - subregion of A, from 0x1000 to 0x3000
region C - subregion of A, from 0x2000 to 0x4000

region D - subregion of B from offset 0 to 0x1000

If B has higher priority than C, then part of C
from 0x2000 to 0x3000 is hidden, even though B
is a container and there's no subregion of B covering
that address range.

Right?
Peter Maydell Sept. 15, 2013, 11:23 a.m. UTC | #54
On 15 September 2013 12:05, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Sep 15, 2013 at 11:56:40AM +0100, Peter Maydell wrote:
>> The alias will win for the addresses it handles. But if
>> the alias is a container with "holes" then it doesn't handle
>> the "holes" and the lower priority background region will
>> get them.

> Confused. How can there be a container with holes?

You just create a container memory region with size,
say 0x8000, and map subregions into it which
cover, say, 0x0-0xfff and 0x2000-0x3fff. Then the
remaining area 0x1000-0x1fff and 0x4000-0x7fff
isn't covered by anything.
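As a sketch (owner, ops0/opaque0 and friends are made-up names,
just to show the shape):

    MemoryRegion holey, sub0, sub1;

    /* Pure container: 0x8000 bytes, no I/O ops of its own. */
    memory_region_init(&holey, owner, "holey", 0x8000);
    memory_region_init_io(&sub0, owner, &ops0, opaque0, "sub0", 0x1000);
    memory_region_init_io(&sub1, owner, &ops1, opaque1, "sub1", 0x2000);

    memory_region_add_subregion(&holey, 0x0000, &sub0); /* 0x0000..0x0fff */
    memory_region_add_subregion(&holey, 0x2000, &sub1); /* 0x2000..0x3fff */

    /* 0x1000..0x1fff and 0x4000..0x7fff are the holes: if "holey" is
     * mapped over a lower priority sibling, the sibling gets them. */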

> Imagine this configuration:
>
> region B - subregion of A, from 0x1000 to 0x3000
> region C - subregion of A, from 0x2000 to 0x4000
>
> region D - subregion of B from offset 0 to 0x1000
>
> If B has higher priority than C, then part of C
> from 0x2000 to 0x3000 is hidden, even though B
> is a container and there's no subregion of B covering
> that address range.

No, unless you've given B itself I/O operations by
creating it with memory_region_init_io() [or _ram,
_rom_device or _iommu, but giving those subregions
is pretty weird]. If it's a "pure container" then it
doesn't respond for areas that none of its subregions
cover (it can't, it has no idea what it should do).

The code that implements this is the recursive
function memory.c:render_memory_region(),
which is what flattens the MemoryRegion hierarchy
into a flat view of "what should each part of this
address space do?".

In your example, we start by calling render_memory_region()
to render A into our FlatView. To do this we render each
subregion of A in priority order, so that's B then C.
To render B, since it's also a container, we render each
of its subregions. That means just D, so we add D's
I/O operations to the FlatView at addresses 0x1000..0x1fff.
Then we're done rendering B, because it has no I/O
ops of its own (mr->terminates is false).
Next up, render C. No subregions, so just render itself
into the FlatView. When we are working out if we can
put it into the FlatView, already claimed areas of the
FlatView take precedence. But the only thing there is
the 0x1000..0x1fff, so all of 0x2000..0x3fff is free and
we put C's I/O ops there.
Then we're done, because there are no I/O ops for A.
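(For reference, your example would be built along these lines --
again with made-up owner/ops names:

    MemoryRegion A, B, C, D;

    memory_region_init(&A, owner, "A", 0x4000);    /* pure container */
    memory_region_init(&B, owner, "B", 0x2000);    /* pure container */
    memory_region_init_io(&C, owner, &c_ops, c_opaque, "C", 0x2000);
    memory_region_init_io(&D, owner, &d_ops, d_opaque, "D", 0x1000);

    memory_region_add_subregion(&B, 0x0, &D);
    memory_region_add_subregion_overlap(&A, 0x1000, &B, 1); /* B above C */
    memory_region_add_subregion_overlap(&A, 0x2000, &C, 0);

so the FlatView ends up with D at 0x1000..0x1fff and C at
0x2000..0x3fff; B itself contributes nothing.)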

The key point I think is that when we're doing the "can
I put this thing here?" check we're checking against the
FlatView as populated so far, not against sibling
MemoryRegions. Note also that we can handle the
case where we have a MemoryRegion that in the
FlatView is split into two pieces because of a preexisting
section which has already been assigned.

Mostly this doesn't come up because you don't need
to play games with overlapping memory regions and
containers very often: the common case is "nothing
overlaps at all". But the facilities are there if you need
them.

-- PMM
Michael S. Tsirkin Sept. 15, 2013, 12:17 p.m. UTC | #55
On Sun, Sep 15, 2013 at 12:23:41PM +0100, Peter Maydell wrote:
> On 15 September 2013 12:05, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Sun, Sep 15, 2013 at 11:56:40AM +0100, Peter Maydell wrote:
> >> The alias will win for the addresses it handles. But if
> >> the alias is a container with "holes" then it doesn't handle
> >> the "holes" and the lower priority background region will
> >> get them.
> 
> > Confused. How can there be a container with holes?
> 
> You just create a container memory region with size,
> say 0x8000, and map subregions into it which
> cover, say, 0x0-0xfff and 0x2000-0x3fff. Then the
> remaining area 0x1000-0x1fff and 0x4000-0x7fff
> isn't covered by anything.
> 
> > Imagine this configuration:
> >
> > region B - subregion of A, from 0x1000 to 0x3000
> > region C - subregion of A, from 0x2000 to 0x4000
> >
> > region D - subregion of B from offset 0 to 0x1000
> >
> > If B has higher priority than C, then part of C
> > from 0x2000 to 0x3000 is hidden, even though B
> > is a container and there's no subregion of B covering
> > that address range.
> 
> No, unless you've given B itself I/O operations by
> creating it with memory_region_init_io() [or _ram,
> _rom_device or _iommu, but giving those subregions
> is pretty weird]

Is this allowed then?
If not, maybe we should add an assert.

>. If it's a "pure container" then it
> doesn't respond for areas that none of its subregions
> cover (it can't, it has no idea what it should do).

Interesting. This really is completely undocumented
though.

documentation merely says:
	specifies a priority that allows the core to decide which of two regions at
	the same address are visible (highest wins)

which makes one think the only thing affecting
visibility is the priority.


I guess it's just like the other weird rule
which says that only the start address of
the transaction is used to select a region.

> The code that implements this is the recursive
> function memory.c:render_memory_region(),
> which is what flattens the MemoryRegion hierarchy
> into a flat view of "what should each part of this
> address space do?".
> 
> In your example, we start by calling render_memory_region()
> to render A into our FlatView. To do this we render each
> subregion of A in priority order, so that's B then C.
> To render B, since it's also a container, we render each
> of its subregions. That means just D, so we add D's
> I/O operations to the FlatView at addresses 0x1000..0x1fff.
> Then we're done rendering B, because it has no I/O
> ops of its own (mr->terminates is false).
> Next up, render C. No subregions, so just render itself
> into the FlatView. When we are working out if we can
> put it into the FlatView, already claimed areas of the
> FlatView take precedence. But the only thing there is
> the 0x1000..0x1fff, so all of 0x2000..0x3fff is free and
> we put C's I/O ops there.
> Then we're done, because there are no I/O ops for A.
> 
> The key point I think is that when we're doing the "can
> I put this thing here?" check we're checking against the
> FlatView as populated so far, not against sibling
> MemoryRegions. Note also that we can handle the
> case where we have a MemoryRegion that in the
> FlatView is split into two pieces because of a preexisting
> section which has already been assigned.

Okay, I missed this part. But we can't put this
in the documentation; I think it is too tied to
a specific implementation.
I wonder whether there's a way to describe this
in terms that don't expose the implementation of
render_memory_region, somehow.

Maybe as follows:
when multiple regions cover the same address only one region is going to
"win" and get invoked for an access.
The winner can be determined as follows:
- "pure container" regions created with memory_region_init(..)
   are ignored
- if multiple non-container regions cover an address, the winner is
  determined using a priority vector, built of priority field values
  from address space down to our region (i.e. region priority, followed by
  a subregion priority, followed by a sub subregion priority etc).

These priority vectors are compared as follows:

- if vectors are identical, which wins is undefined
- otherwise if one vector is a sub-vector of another,
  which wins is undefined
- otherwise the first vector in the lexicographical
  order wins



> Mostly this doesn't come up because you don't need
> to play games with overlapping memory regions and
> containers very often: the common case is "nothing
> overlaps at all". But the facilities are there if you need
> them.
> 
> -- PMM

Dynamic regions like PCI BARs are actually very common;
IMO this means the overlapping case is actually very common.
Peter Maydell Sept. 15, 2013, 1:24 p.m. UTC | #56
On 15 September 2013 13:17, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Sep 15, 2013 at 12:23:41PM +0100, Peter Maydell wrote:
>>. If it's a "pure container" then it
>> doesn't respond for areas that none of its subregions
>> cover (it can't, it has no idea what it should do).
>
> Interesting. This really is completely undocumented
> though.
>
> documentation merely says:
>         specifies a priority that allows the core to decide which of two regions at
>         the same address are visible (highest wins)
>
> which makes one think the only thing affecting
> visibility is the priority.

Yes, we could definitely stand to improve the documentation.

> I wonder whether there's a way to describe this
> in terms that don't expose the implementation of
> render_memory_region, somehow.
>
> Maybe as follows:
> when multiple regions cover the same address only one region is going to
> "win" and get invoked for an access.
> The winner can be determined as follows:
> - "pure container" regions created with memory_region_init(..)
>    are ignored
> - if multiple non-container regions cover an address, the winner is
>   determined using a priority vector, built of priority field values
>   from address space down to our region (i.e. region priority, followed by
>   a subregion priority, followed by a sub subregion priority etc).
>
> These priority vectors are compared as follows:
>
> - if vectors are identical, which wins is undefined
> - otherwise if one vector is a sub-vector of another,
>   which wins is undefined
> - otherwise the first vector in the lexicographical
>   order wins

This just seems to me to be documenting a theoretical
implementation. If we're lucky it has the same semantics
as what we actually do; if we're unlucky it doesn't in
some corner case and is just confusing. And it would
certainly be confusing if you look at the code and it does
absolutely nothing like the documentation's algorithm.

I think we could simply add an extra bullet point in the
'Visibility' section of docs/memory.txt saying:
 - if a search within a container or alias subregion does not find
   a match, we continue with the next subregion in priority order.

plus a note at the bottom saying:
"Notice that these rules mean that if a container subregion
does not handle an address in its range (ie it has "holes"
where it has not mapped its own subregions) then lower
priority siblings of the container will be checked to see if they
should be used for accesses within those "holes"."

We could also do with explicitly mentioning in the section
about priority that priority is container local, since this seems
to be the number one confusion among people looking at
memory region priority.

>> Mostly this doesn't come up because you don't need
>> to play games with overlapping memory regions and
>> containers very often: the common case is "nothing
>> overlaps at all". But the facilities are there if you need
>> them.

> Dynamic regions like PCI BARs are actually very common;
> IMO this means the overlapping case is actually very common.

Yes, that's true, but even then it's usually just "overlaps
of subregions within a single container" and there isn't
a need to worry about within-container versus outside-container
interactions.

-- PMM
Michael S. Tsirkin Sept. 15, 2013, 1:39 p.m. UTC | #57
On Sun, Sep 15, 2013 at 02:24:07PM +0100, Peter Maydell wrote:
> On 15 September 2013 13:17, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Sun, Sep 15, 2013 at 12:23:41PM +0100, Peter Maydell wrote:
> >>. If it's a "pure container" then it
> >> doesn't respond for areas that none of its subregions
> >> cover (it can't, it has no idea what it should do).
> >
> > Interesting. This really is completely undocumented
> > though.
> >
> > documentation merely says:
> >         specifies a priority that allows the core to decide which of two regions at
> >         the same address are visible (highest wins)
> >
> > which makes one think the only thing affecting
> > visibility is the priority.
> 
> Yes, we could definitely stand to improve the documentation.
> 
> > I wonder whether there's a way to describe this
> > in terms that don't expose the implementation of
> > render_memory_region, somehow.
> >
> > Maybe as follows:
> > when multiple regions cover the same address only one region is going to
> > "win" and get invoked for an access.
> > The winner can be determined as follows:
> > - "pure container" regions created with memory_region_init(..)
> >    are ignored
> > - if multiple non-container regions cover an address, the winner is
> >   determined using a priority vector, built of priority field values
> >   from address space down to our region (i.e. region priority, followed by
> >   a subregion priority, followed by a sub subregion priority etc).
> >
> > These priority vectors are compared as follows:
> >
> > - if vectors are identical, which wins is undefined
> > - otherwise if one vector is a sub-vector of another,
> >   which wins is undefined
> > - otherwise the first vector in the lexicographical
> >   order wins
> 
> This just seems to me to be documenting a theoretical
> implementation. If we're lucky it has the same semantics
> as what we actually do; if we're unlucky it doesn't in
> some corner case and is just confusing. And it would
> certainly be confusing if you look at the code and it does
> absolutely nothing like the documentation's algorithm.
> 
> I think we could simply add an extra bullet point in the
> 'Visibility' section of docs/memory.txt saying:
>  - if a search within a container or alias subregion does not find
>    a match, we continue with the next subregion in priority order.
> 
> plus a note at the bottom saying:
> "Notice that these rules mean that if a container subregion
> does not handle an address in its range (ie it has "holes"
> where it has not mapped its own subregions) then lower
> priority siblings of the container will be checked to see if they
> should be used for accesses within those "holes"."
> 
> We could also do with explicitly mentioning in the section
> about priority that priority is container local, since this seems
> to be the number one confusion among people looking at
> memory region priority.
> 
> >> Mostly this doesn't come up because you don't need
> >> to play games with overlapping memory regions and
> >> containers very often: the common case is "nothing
> >> overlaps at all". But the facilities are there if you need
> >> them.
> 
> > Dynamic regions like PCI BARs are actually very common;
> > IMO this means the overlapping case is actually very common.
> 
> Yes, that's true, but even then it's usually just "overlaps
> of subregions within a single container" and there isn't
> a need to worry about within-container versus outside-container
> interactions.
> 
> -- PMM

Well actually if you look at the 'subregion collisions'
mail that I sent, most of the collisions are exactly
of this sort.

For example, mcfg in q35 overlaps the pci hole, so
any pci bar can be made to overlap it.

I guess the documentation should just explain what
happens when regions overlap and not try to
say whether this case is common or not.
Peter Maydell Sept. 15, 2013, 1:49 p.m. UTC | #58
On 15 September 2013 14:39, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Sep 15, 2013 at 02:24:07PM +0100, Peter Maydell wrote:
>> Yes, that's true, but even then it's usually just "overlaps
>> of subregions within a single container" and there isn't
>> a need to worry about within-container versus outside-container
>> interactions.

> Well actually if you look at the 'subregion collisions'
> mail that I sent, most of the collisions are exactly
> of this sort.
>
> For example, mcfg in q35 overlaps the pci hole, so
> any pci bar can be made to overlap it.

Yes, but if we were applying a sensible set of priorities
then you don't need to care at all about the contents
of the pci hole container, because the pci hole will
be at a lower priority than mcfg; so you don't get into
this odd corner case of "what happens if the high
priority container turns out not to have anything there".

-- PMM
Peter Maydell Sept. 15, 2013, 2:08 p.m. UTC | #59
On 15 September 2013 15:08, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Sep 15, 2013 at 02:49:32PM +0100, Peter Maydell wrote:
>> Yes, but if we were applying a sensible set of priorities
>> then you don't need to care at all about the contents
>> of the pci hole container, because the pci hole will
>> be at a lower priority than mcfg; so you don't get into
>> this odd corner case of "what happens if the high
>> priority container turns out not to have anything there".

> So setting sensible priorities in this case would require figuring out
> what happens on real hardware.
> As long as no one has investigated, I think we are better off
> leaving this as undefined behaviour.

Well, that's your choice, but I'd be really surprised if the
PCI controller allowed PCI BARs to get mapped over the
top of builtin devices like that.

> Again, the changes you proposed yourself to support the MA status bit
> mean we will be using this weird feature on each and every
> PCI bus :)

It's still an odd corner case that only the PCI controller
core code needs to care about (compared to the much
larger set of container uses in random other boards and
devices we have).

-- PMM
Michael S. Tsirkin Sept. 15, 2013, 2:08 p.m. UTC | #60
On Sun, Sep 15, 2013 at 02:49:32PM +0100, Peter Maydell wrote:
> On 15 September 2013 14:39, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Sun, Sep 15, 2013 at 02:24:07PM +0100, Peter Maydell wrote:
> >> Yes, that's true, but even then it's usually just "overlaps
> >> of subregions within a single container" and there isn't
> >> a need to worry about within-container versus outside-container
> >> interactions.
> 
> > Well actually if you look at the 'subregion collisions'
> > mail that I sent, most of the collisions are exactly
> > of this sort.
> >
> > For example, mcfg in q35 overlaps the pci hole, so
> > any pci bar can be made to overlap it.
> 
> Yes, but if we were applying a sensible set of priorities
> then you don't need to care at all about the contents
> of the pci hole container, because the pci hole will
> be at a lower priority than mcfg; so you don't get into
> this odd corner case of "what happens if the high
> priority container turns out not to have anything there".
> 
> -- PMM

So setting sensible priorities in this case would require figuring out
what happens on real hardware.
As long as no one has investigated, I think we are better off
leaving this as undefined behaviour.

Again, the changes you proposed yourself to support the MA status bit
mean we will be using this weird feature on each and every
PCI bus :)
Michael S. Tsirkin Sept. 15, 2013, 2:20 p.m. UTC | #61
On Sun, Sep 15, 2013 at 03:08:38PM +0100, Peter Maydell wrote:
> On 15 September 2013 15:08, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Sun, Sep 15, 2013 at 02:49:32PM +0100, Peter Maydell wrote:
> >> Yes, but if we were applying a sensible set of priorities
> >> then you don't need to care at all about the contents
> >> of the pci hole container, because the pci hole will
> >> be at a lower priority than mcfg; so you don't get into
> >> this odd corner case of "what happens if the high
> >> priority container turns out not to have anything there".
> 
> > So setting sensible priorities in this case would require figuring out
> > what happens on real hardware.
> > As long as no one has investigated, I think we are better off
> > leaving this as undefined behaviour.
> 
> Well, that's your choice, but I'd be really surprised if the
> PCI controller allowed PCI BARs to get mapped over the
> top of builtin devices like that.

Well it has no way not to allow this; what happens
in this configuration is another matter.

> > Again, the changes you proposed yourself to support the MA status bit
> > mean we will be using this weird feature on each and every
> > PCI bus :)
> 
> It's still an odd corner case that only the PCI controller
> core code needs to care about

Actually you previously wrote:
	> the versatilePB's PCI controller only responds to accesses
	> within its programmed MMIO BAR ranges, so if the device
	> or the controller have been misconfigured you can get an
	> abort when the device tries to do DMA.
Doesn't this mean versatilePB will have to have
similar code in its PCI controller implementation -
outside PCI controller core?

Also, PCI bridge core code will care about this as well.

> (compared to the much
> larger set of container uses in random other boards and
> devices we have).
> 
> -- PMM

Right. I guess that's because most boards are not
as configurable as PCI.
Peter Maydell Sept. 15, 2013, 2:49 p.m. UTC | #62
On 15 September 2013 15:20, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Sep 15, 2013 at 03:08:38PM +0100, Peter Maydell wrote:
>> Well, that's your choice, but I'd be really surprised if the
>> PCI controller allowed PCI BARs to get mapped over the
>> top of builtin devices like that.
>
> Well it has no way not to allow this

The key phrase there was "over the top". The controller and/or
the bus fabric can choose to direct accesses to these addresses
to the builtin device regardless of what the PCI BARs are mapped
as. It's that choice that we're modelling when we set the priorities of
overlapping regions.

>> It's still an odd corner case that only the PCI controller
>> core code needs to care about
>
> Actually you previously wrote:
>         > the versatilePB's PCI controller only responds to accesses
>         > within its programmed MMIO BAR ranges, so if the device
>         > or the controller have been misconfigured you can get an
>         > abort when the device tries to do DMA.
> Doesn't this mean versatilePB will have to have
> similar code in its PCI controller implementation -
> outside PCI controller core?

If you implement PCI bus mastering sufficiently accurately in the
PCI core code, then maybe not (DMA to the system RAM on
a versatilePB really does look exactly like any PCI device doing
a bus master access to any other); we'll see.

-- PMM
Michael S. Tsirkin Sept. 15, 2013, 3:05 p.m. UTC | #63
On Sun, Sep 15, 2013 at 03:49:00PM +0100, Peter Maydell wrote:
> On 15 September 2013 15:20, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Sun, Sep 15, 2013 at 03:08:38PM +0100, Peter Maydell wrote:
> >> Well, that's your choice, but I'd be really surprised if the
> >> PCI controller allowed PCI BARs to get mapped over the
> >> top of builtin devices like that.
> >
> > Well it has no way not to allow this
> 
> The key phrase there was "over the top". The controller and/or
> the bus fabric can choose to direct accesses to these addresses
> to the builtin device regardless of what the PCI BARs are mapped
> as. It's that choice that we're modelling when we set the priorities of
> overlapping regions.
> 
> >> It's still an odd corner case that only the PCI controller
> >> core code needs to care about
> >
> > Actually you previously wrote:
> >         > the versatilePB's PCI controller only responds to accesses
> >         > within its programmed MMIO BAR ranges, so if the device
> >         > or the controller have been misconfigured you can get an
> >         > abort when the device tries to do DMA.
> > Doesn't this mean versatilePB will have to have
> > similar code in its PCI controller implementation -
> > outside PCI controller core?
> 
> If you implement PCI bus mastering sufficiently accurately in the
> PCI core code, then maybe not

Well it's not about the implementation, I think:
RAM is not on the PCI bus, is it?
If it's not on the PCI bus I doubt RAM accesses
can cause master aborts on the PCI bus: the PCI host
would have to claim and then possibly discard them.

> (DMA to the system RAM on
> a versatilePB really does look exactly like any PCI device doing
> a bus master access to any other); we'll see.
> 
> -- PMM

Well, PCI device access to another PCI device looks
different depending on whether the other device
is on the same bus or not.

When it's on the same bus, the device detects the master abort.
When it's on a different bus (across a bridge),
the bridge detects the master abort and the device sees a completion.
Peter Maydell Sept. 15, 2013, 3:08 p.m. UTC | #64
On 15 September 2013 16:05, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Sep 15, 2013 at 03:49:00PM +0100, Peter Maydell wrote:
>> On 15 September 2013 15:20, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > Actually you previously wrote:
>> >         > the versatilePB's PCI controller only responds to accesses
>> >         > within its programmed MMIO BAR ranges, so if the device
>> >         > or the controller have been misconfigured you can get an
>> >         > abort when the device tries to do DMA.
>> > Doesn't this mean versatilePB will have to have
>> > similar code in its PCI controller implementation -
>> > outside PCI controller core?
>>
>> If you implement PCI bus mastering sufficiently accurately in the
>> PCI core code, then maybe not
>
> Well it's not about the implementation, I think:
> RAM is not on the PCI bus, is it?

A PCI device can only do DMA to RAM if the versatile
host controller is configured so that (a) it has mapped its
BARs into the PCI memory space somewhere and (b)
the host controller's mapping from BARs to system
addresses means device accesses to the host controller
BARs turn into accesses to system addresses where
the RAM is.

> If it's not on the PCI bus I doubt RAM accesses
> can cause master aborts on the PCI bus: the PCI host
> would have to claim and then possibly discard them.

The aborts I refer to above are if you misprogram the
device to try to do a bus master access to some part of
PCI memory space other than where the host controller's
BARs are (or if you misprogram the controller not to
map its BARs at all).

> Well, PCI device access to another PCI device looks
> different depending on whether the other device
> is on the same bus or not.
>
> When it's on the same bus, the device detects the master abort.
> When it's on a different bus (across a bridge),
> the bridge detects the master abort and the device sees a completion.

Yeah, but this is all PCI core code, not controller specific.

-- PMM
Michael S. Tsirkin Sept. 15, 2013, 3:31 p.m. UTC | #65
On Sun, Sep 15, 2013 at 04:08:26PM +0100, Peter Maydell wrote:
> On 15 September 2013 16:05, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Sun, Sep 15, 2013 at 03:49:00PM +0100, Peter Maydell wrote:
> >> On 15 September 2013 15:20, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> > Actually you previously wrote:
> >> >         > the versatilePB's PCI controller only responds to accesses
> >> >         > within its programmed MMIO BAR ranges, so if the device
> >> >         > or the controller have been misconfigured you can get an
> >> >         > abort when the device tries to do DMA.
> >> > Doesn't this mean versatilePB will have to have
> >> > similar code in its PCI controller implementation -
> >> > outside PCI controller core?
> >>
> >> If you implement PCI bus mastering sufficiently accurately in the
> >> PCI core code, then maybe not
> >
> > Well it's not about the implementation, I think:
> > RAM is not on the PCI bus, is it?
> 
> A PCI device can only do DMA to RAM if the versatile
> host controller is configured so that (a) it has mapped its
> BARs into the PCI memory space somewhere and (b)
> the host controller's mapping from BARs to system
> > addresses means device accesses to the host controller
> BARs turn into accesses to system addresses where
> the RAM is.
> 
> > If it's not on the PCI bus I doubt RAM accesses
> > can cause master aborts on the PCI bus: the PCI host
> > would have to claim and then possibly discard them.
> 
> The aborts I refer to above are if you misprogram the
> device to try to do a bus master access to some part of
> PCI memory space other than where the host controller's
> BARs are

So the controller won't claim this transaction.
In that case you are right: they likely
trigger PCI master aborts.

> (or if you misprogram the controller not to
> map its BARs at all).

I'm not sure about this one. If the BAR is enabled
but part of it maps somewhere outside system RAM,
you likely won't get an error on the PCI bus.

> > Well, PCI device access to another PCI device looks
> > different depending on whether the other device
> > is on the same bus or not.
> >
> > When it's on the same bus, the device detects the master abort.
> > When it's on a different bus (across a bridge),
> > the bridge detects the master abort and the device sees a completion.
> 
> Yeah, but this is all PCI core code, not controller specific.
> 
> -- PMM
Peter Maydell Sept. 15, 2013, 5:12 p.m. UTC | #66
On 15 September 2013 16:31, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Sep 15, 2013 at 04:08:26PM +0100, Peter Maydell wrote:
>> The aborts I refer to above are if you misprogram the
>> device to try to do a bus master access to some part of
>> PCI memory space other than where the host controller's
>> BARs are
>
> So the controller won't claim this transaction.
> In that case you are right: they likely
> trigger PCI master aborts.
>
>> (or if you misprogram the controller not to
>> map its BARs at all).
>
> I'm not sure about this one. If the BAR is enabled
> but part of it maps somewhere outside system RAM,
> you likely won't get an error on the PCI bus.

By "not mapping the BAR" I meant "not enabling the BAR".

-- PMM
diff mbox

Patch

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index d00682e..b6a8026 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -283,6 +283,43 @@  const char *pci_root_bus_path(PCIDevice *dev)
     return rootbus->qbus.name;
 }
 
+static void unassigned_mem_access(PCIBus *bus)
+{
+    /* FIXME assumption: memory access to the pci address
+     * space is always initiated by the host bridge
+     * (device 0 on the bus) */
+    PCIDevice *d = bus->devices[0];
+    if (!d) {
+        return;
+    }
+
+    pci_word_test_and_set_mask(d->config + PCI_STATUS,
+                               PCI_STATUS_REC_MASTER_ABORT);
+}
+
+static uint64_t unassigned_mem_read(void *opaque, hwaddr addr, unsigned size)
+{
+    PCIBus *bus = opaque;
+    unassigned_mem_access(bus);
+
+    return -1ULL;
+}
+
+static void unassigned_mem_write(void *opaque, hwaddr addr, uint64_t val,
+                                unsigned size)
+{
+    PCIBus *bus = opaque;
+    unassigned_mem_access(bus);
+}
+
+static const MemoryRegionOps unassigned_mem_ops = {
+    .read = unassigned_mem_read,
+    .write = unassigned_mem_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+#define UNASSIGNED_MEM_PRIORITY -1
+
 static void pci_bus_init(PCIBus *bus, DeviceState *parent,
                          const char *name,
                          MemoryRegion *address_space_mem,
@@ -294,6 +331,15 @@  static void pci_bus_init(PCIBus *bus, DeviceState *parent,
     bus->address_space_mem = address_space_mem;
     bus->address_space_io = address_space_io;
 
+
+    memory_region_init_io(&bus->unassigned_mem, OBJECT(bus),
+                          &unassigned_mem_ops, bus, "pci-unassigned",
+                          memory_region_size(bus->address_space_mem));
+    memory_region_add_subregion_overlap(bus->address_space_mem,
+                                        bus->address_space_mem->addr,
+                                        &bus->unassigned_mem,
+                                        UNASSIGNED_MEM_PRIORITY);
+
     /* host bridge */
     QLIST_INIT(&bus->child);
 
diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
index 9df1788..4cc25a3 100644
--- a/include/hw/pci/pci_bus.h
+++ b/include/hw/pci/pci_bus.h
@@ -23,6 +23,7 @@  struct PCIBus {
     PCIDevice *parent_dev;
     MemoryRegion *address_space_mem;
     MemoryRegion *address_space_io;
+    MemoryRegion unassigned_mem;
 
     QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
     QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */