Patchwork [1/9] Add stub functions for PCI device models to do PCI DMA

login
register
mail settings
Submitter David Gibson
Date Sept. 5, 2011, 4:34 a.m.
Message ID <1315197304-22469-2-git-send-email-david@gibson.dropbear.id.au>
Download mbox | patch
Permalink /patch/113314/
State New
Headers show

Comments

David Gibson - Sept. 5, 2011, 4:34 a.m.
This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
present, these are just stubs which directly perform cpu physical memory
accesses.

Using these stubs, however, distinguishes PCI device DMA transactions from
other accesses to physical memory, which will allow PCI IOMMU support to
be added in one place, rather than updating every PCI driver at that time.

That is, it allows us to update individual PCI drivers to support an IOMMU
without having yet determined the details of how the IOMMU emulation will
operate.  This will let us remove the most bitrot-sensitive part of an
IOMMU patch in advance.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 dma.h    |    2 ++
 hw/pci.c |   31 +++++++++++++++++++++++++++++++
 hw/pci.h |   33 +++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 0 deletions(-)
Michael S. Tsirkin - Oct. 2, 2011, 10:25 a.m.
On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> present, these are just stubs which perform directly cpu physical memory
> accesses.
> 
> Using these stubs, however, distinguishes PCI device DMA transactions from
> other accesses to physical memory, which will allow PCI IOMMU support to
> be added in one place, rather than updating every PCI driver at that time.
> 
> That is, it allows us to update individual PCI drivers to support an IOMMU
> without having yet determined the details of how the IOMMU emulation will
> operate.  This will let us remove the most bitrot-sensitive part of an
> IOMMU patch in advance.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>

So something I just thought about:

all wrappers now go through cpu_physical_memory_rw.
This is a problem as e.g. virtio assumes that
accesses such as stw are atomic. cpu_physical_memory_rw
is a memcpy which makes no such guarantees.

> ---
>  dma.h    |    2 ++
>  hw/pci.c |   31 +++++++++++++++++++++++++++++++
>  hw/pci.h |   33 +++++++++++++++++++++++++++++++++
>  3 files changed, 66 insertions(+), 0 deletions(-)
> 
> diff --git a/dma.h b/dma.h
> index a6db5ba..06e91cb 100644
> --- a/dma.h
> +++ b/dma.h
> @@ -15,6 +15,8 @@
>  #include "hw/hw.h"
>  #include "block.h"
>  
> +typedef target_phys_addr_t dma_addr_t;
> +
>  typedef struct {
>      target_phys_addr_t base;
>      target_phys_addr_t len;
> diff --git a/hw/pci.c b/hw/pci.c
> index 1cdcbb7..0be7611 100644
> --- a/hw/pci.c
> +++ b/hw/pci.c
> @@ -2211,3 +2211,34 @@ MemoryRegion *pci_address_space(PCIDevice *dev)
>  {
>      return dev->bus->address_space_mem;
>  }
> +
> +#define PCI_DMA_DEFINE_LDST(_lname, _sname, _bits) \
> +    uint##_bits##_t ld##_lname##_pci_dma(PCIDevice *dev, dma_addr_t addr) \
> +    { \
> +        uint##_bits##_t val; \
> +        pci_dma_read(dev, addr, &val, sizeof(val)); \
> +        return le##_bits##_to_cpu(val); \
> +    } \
> +    void st##_sname##_pci_dma(PCIDevice *dev, \
> +                              dma_addr_t addr, uint##_bits##_t val) \
> +    { \
> +        val = cpu_to_le##_bits(val); \
> +        pci_dma_write(dev, addr, &val, sizeof(val)); \
> +    }
> +

I am still not 100% positive why do we do the LE conversions here.
st4_phys and friends don't seem to do it ...
Has something to do with the fact we pass a value as an array?
Probably worth a comment.

> +uint8_t ldub_pci_dma(PCIDevice *dev, dma_addr_t addr)
> +{
> +    uint8_t val;
> +
> +    pci_dma_read(dev, addr, &val, sizeof(val));
> +    return val;
> +}
> +
> +void stb_pci_dma(PCIDevice *dev, dma_addr_t addr, uint8_t val)
> +{
> +    pci_dma_write(dev, addr, &val, sizeof(val));
> +}
> +

pci_ XXX would be better names?

> +PCI_DMA_DEFINE_LDST(uw, w, 16);
> +PCI_DMA_DEFINE_LDST(l, l, 32);
> +PCI_DMA_DEFINE_LDST(q, q, 64);
> diff --git a/hw/pci.h b/hw/pci.h
> index 391217e..4426e9d 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -6,6 +6,7 @@
>  
>  #include "qdev.h"
>  #include "memory.h"
> +#include "dma.h"
>  
>  /* PCI includes legacy ISA access.  */
>  #include "isa.h"
> @@ -492,4 +493,36 @@ static inline uint32_t pci_config_size(const PCIDevice *d)
>      return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
>  }
>  
> +/* DMA access functions */
> +static inline int pci_dma_rw(PCIDevice *dev, dma_addr_t addr,
> +                             void *buf, dma_addr_t len, int is_write)
> +{
> +    cpu_physical_memory_rw(addr, buf, len, is_write);
> +    return 0;
> +}
> +
> +static inline int pci_dma_read(PCIDevice *dev, dma_addr_t addr,
> +                               void *buf, dma_addr_t len)
> +{
> +    return pci_dma_rw(dev, addr, buf, len, 0);
> +}
> +
> +static inline int pci_dma_write(PCIDevice *dev, dma_addr_t addr,
> +                                const void *buf, dma_addr_t len)
> +{
> +    return pci_dma_rw(dev, addr, (void *) buf, len, 1);
> +}
> +
> +#define PCI_DMA_DECLARE_LDST(_lname, _sname, _bits) \
> +    uint##_bits##_t ld##_lname##_pci_dma(PCIDevice *dev, dma_addr_t addr); \
> +    void st##_sname##_pci_dma(PCIDevice *dev, dma_addr_t addr, \
> +                              uint##_bits##_t val);            \
> +
> +PCI_DMA_DECLARE_LDST(ub, b, 8);
> +PCI_DMA_DECLARE_LDST(uw, w, 16);
> +PCI_DMA_DECLARE_LDST(l, l, 32);
> +PCI_DMA_DECLARE_LDST(q, q, 64);
> +
> +#undef PCI_DMA_DECLARE_LDST
> +

I think macros should just create stX_phys/ldX_phys calls
directly, in the .h file. This will also make it clearer what is going on,
with less levels of indirection.



>  #endif
> -- 
> 1.7.5.4
Avi Kivity - Oct. 2, 2011, 10:29 a.m.
On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> >  This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> >  present, these are just stubs which perform directly cpu physical memory
> >  accesses.
> >
> >  Using these stubs, however, distinguishes PCI device DMA transactions from
> >  other accesses to physical memory, which will allow PCI IOMMU support to
> >  be added in one place, rather than updating every PCI driver at that time.
> >
> >  That is, it allows us to update individual PCI drivers to support an IOMMU
> >  without having yet determined the details of how the IOMMU emulation will
> >  operate.  This will let us remove the most bitrot-sensitive part of an
> >  IOMMU patch in advance.
> >
> >  Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>
> So something I just thought about:
>
> all wrappers now go through cpu_physical_memory_rw.
> This is a problem as e.g. virtio assumes that
> accesses such as stw are atomic. cpu_physical_memory_rw
> is a memcpy which makes no such guarantees.
>

Let's change cpu_physical_memory_rw() to provide that guarantee for 
aligned two and four byte accesses.  Having separate paths just for that 
is not maintainable.
Michael S. Tsirkin - Oct. 2, 2011, 10:52 a.m.
On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
> On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> >On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> >>  This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> >>  present, these are just stubs which perform directly cpu physical memory
> >>  accesses.
> >>
> >>  Using these stubs, however, distinguishes PCI device DMA transactions from
> >>  other accesses to physical memory, which will allow PCI IOMMU support to
> >>  be added in one place, rather than updating every PCI driver at that time.
> >>
> >>  That is, it allows us to update individual PCI drivers to support an IOMMU
> >>  without having yet determined the details of how the IOMMU emulation will
> >>  operate.  This will let us remove the most bitrot-sensitive part of an
> >>  IOMMU patch in advance.
> >>
> >>  Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
> >
> >So something I just thought about:
> >
> >all wrappers now go through cpu_physical_memory_rw.
> >This is a problem as e.g. virtio assumes that
> >accesses such as stw are atomic. cpu_physical_memory_rw
> >is a memcpy which makes no such guarantees.
> >
> 
> Let's change cpu_physical_memory_rw() to provide that guarantee for
> aligned two and four byte accesses.  Having separate paths just for
> that is not maintainable.

Well, we also have stX_phys convert to target native endian-ness
(nop for KVM but not necessarily for qemu).

So if we do what you suggest, this patch will become more correct, but
it would still need to duplicate the endian-ness work.

For that reason, I think calling stX_phys and friends from pci
makes more sense - we get more simple inline wrappers
but that code duplication worries me much less than tricky
endian-ness hidden within a macro.

> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - Oct. 2, 2011, 10:58 a.m.
On 10/02/2011 12:52 PM, Michael S. Tsirkin wrote:
> On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
> >  On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> >  >On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> >  >>   This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> >  >>   present, these are just stubs which perform directly cpu physical memory
> >  >>   accesses.
> >  >>
> >  >>   Using these stubs, however, distinguishes PCI device DMA transactions from
> >  >>   other accesses to physical memory, which will allow PCI IOMMU support to
> >  >>   be added in one place, rather than updating every PCI driver at that time.
> >  >>
> >  >>   That is, it allows us to update individual PCI drivers to support an IOMMU
> >  >>   without having yet determined the details of how the IOMMU emulation will
> >  >>   operate.  This will let us remove the most bitrot-sensitive part of an
> >  >>   IOMMU patch in advance.
> >  >>
> >  >>   Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
> >  >
> >  >So something I just thought about:
> >  >
> >  >all wrappers now go through cpu_physical_memory_rw.
> >  >This is a problem as e.g. virtio assumes that
> >  >accesses such as stw are atomic. cpu_physical_memory_rw
> >  >is a memcpy which makes no such guarantees.
> >  >
> >
> >  Let's change cpu_physical_memory_rw() to provide that guarantee for
> >  aligned two and four byte accesses.  Having separate paths just for
> >  that is not maintainable.
>
> Well, we also have stX_phys convert to target native endian-ness
> (nop for KVM but not necessarily for qemu).
>
> So if we do what you suggest, this patch will become more correct, but
> it would still need to duplicate the endian-ness work.
>
> For that reason, I think calling stX_phys and friends from pci
> makes more sense - we get more simple inline wrappers
> but that code duplication worries me much less than tricky
> endian-ness hidden within a macro.
>

Good point.  Though this is really a virtio specific issue since other 
devices have explicit endianness (not guest dependent).

I think endian conversion is best made explicit in virtio (like e1000 
does explicit conversions to little endian).
Michael S. Tsirkin - Oct. 2, 2011, 11:17 a.m.
On Sun, Oct 02, 2011 at 12:58:35PM +0200, Avi Kivity wrote:
> On 10/02/2011 12:52 PM, Michael S. Tsirkin wrote:
> >On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
> >>  On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> >>  >On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> >>  >>   This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> >>  >>   present, these are just stubs which perform directly cpu physical memory
> >>  >>   accesses.
> >>  >>
> >>  >>   Using these stubs, however, distinguishes PCI device DMA transactions from
> >>  >>   other accesses to physical memory, which will allow PCI IOMMU support to
> >>  >>   be added in one place, rather than updating every PCI driver at that time.
> >>  >>
> >>  >>   That is, it allows us to update individual PCI drivers to support an IOMMU
> >>  >>   without having yet determined the details of how the IOMMU emulation will
> >>  >>   operate.  This will let us remove the most bitrot-sensitive part of an
> >>  >>   IOMMU patch in advance.
> >>  >>
> >>  >>   Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
> >>  >
> >>  >So something I just thought about:
> >>  >
> >>  >all wrappers now go through cpu_physical_memory_rw.
> >>  >This is a problem as e.g. virtio assumes that
> >>  >accesses such as stw are atomic. cpu_physical_memory_rw
> >>  >is a memcpy which makes no such guarantees.
> >>  >
> >>
> >>  Let's change cpu_physical_memory_rw() to provide that guarantee for
> >>  aligned two and four byte accesses.  Having separate paths just for
> >>  that is not maintainable.
> >
> >Well, we also have stX_phys convert to target native endian-ness
> >(nop for KVM but not necessarily for qemu).
> >
> >So if we do what you suggest, this patch will become more correct, but
> >it would still need to duplicate the endian-ness work.
> >
> >For that reason, I think calling stX_phys and friends from pci
> >makes more sense - we get more simple inline wrappers
> >but that code duplication worries me much less than tricky
> >endian-ness hidden within a macro.
> >
> 
> Good point.  Though this is really a virtio specific issue since
> other devices have explicit endianness (not guest dependent).

Hmm, not entirely virtio specific, some devices use stX macros to do the
conversion.  E.g. stw_be_phys and stl_le_phys are used in several
places.

> I think endian conversion is best made explicit in virtio (like
> e1000 does explicit conversions to little endian).

That's certainly possible. Though it's hard to see why duplicating e.g.

static void e100_stw_le_phys(target_phys_addr_t addr, uint16_t val)
{
    val = cpu_to_le16(val);
    cpu_physical_memory_write(addr, &val, sizeof(val));
}

is a better idea than a central utility that does this.
Maybe the address is not guaranteed to be aligned in the e100
case.


> -- 
> error compiling committee.c: too many arguments to function
Alexander Graf - Oct. 2, 2011, 11:28 a.m.
On 02.10.2011, at 13:17, Michael S. Tsirkin wrote:

> On Sun, Oct 02, 2011 at 12:58:35PM +0200, Avi Kivity wrote:
>> On 10/02/2011 12:52 PM, Michael S. Tsirkin wrote:
>>> On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
>>>> On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
>>>>> On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
>>>>>>  This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
>>>>>>  present, these are just stubs which perform directly cpu physical memory
>>>>>>  accesses.
>>>>>> 
>>>>>>  Using these stubs, however, distinguishes PCI device DMA transactions from
>>>>>>  other accesses to physical memory, which will allow PCI IOMMU support to
>>>>>>  be added in one place, rather than updating every PCI driver at that time.
>>>>>> 
>>>>>>  That is, it allows us to update individual PCI drivers to support an IOMMU
>>>>>>  without having yet determined the details of how the IOMMU emulation will
>>>>>>  operate.  This will let us remove the most bitrot-sensitive part of an
>>>>>>  IOMMU patch in advance.
>>>>>> 
>>>>>>  Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
>>>>> 
>>>>> So something I just thought about:
>>>>> 
>>>>> all wrappers now go through cpu_physical_memory_rw.
>>>>> This is a problem as e.g. virtio assumes that
>>>>> accesses such as stw are atomic. cpu_physical_memory_rw
>>>>> is a memcpy which makes no such guarantees.
>>>>> 
>>>> 
>>>> Let's change cpu_physical_memory_rw() to provide that guarantee for
>>>> aligned two and four byte accesses.  Having separate paths just for
>>>> that is not maintainable.
>>> 
>>> Well, we also have stX_phys convert to target native endian-ness
>>> (nop for KVM but not necessarily for qemu).
>>> 
>>> So if we do what you suggest, this patch will become more correct, but
>>> it would still need to duplicate the endian-ness work.
>>> 
>>> For that reason, I think calling stX_phys and friends from pci
>>> makes more sense - we get more simple inline wrappers
>>> but that code duplication worries me much less than tricky
>>> endian-ness hidden within a macro.
>>> 
>> 
>> Good point.  Though this is really a virtio specific issue since
>> other devices have explicit endianness (not guest dependent).
> 
> Hmm, not entirely virtio specific, some devices use stX macros to do the
> conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> places.

Yes, explicit endianness. Virtio is the only device type we support in QEMU that changes its endianness depending on the guest CPU. All other devices are independent of the guest CPU we're targeting.


Alex
Michael S. Tsirkin - Oct. 2, 2011, 11:43 a.m.
On Sun, Oct 02, 2011 at 01:28:37PM +0200, Alexander Graf wrote:
> >> Good point.  Though this is really a virtio specific issue since
> >> other devices have explicit endianness (not guest dependent).
> > 
> > Hmm, not entirely virtio specific, some devices use stX macros to do the
> > conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> > places.
> 
> Yes, explicit endianness. Virtio is the only device type we support in QEMU that changes its endianness depending on the guest CPU. All other devices are independent of the guest CPU we're targeting.
> 
> 
> Alex

True I think, for pci devices. And virtio bypasses the iommu
anyway, so we don't need to worry about it. But the point is that it
makes sense to support endian-ness handling in the core.
Avi Kivity - Oct. 2, 2011, 12:01 p.m.
On 10/02/2011 01:17 PM, Michael S. Tsirkin wrote:
> On Sun, Oct 02, 2011 at 12:58:35PM +0200, Avi Kivity wrote:
> >  On 10/02/2011 12:52 PM, Michael S. Tsirkin wrote:
> >  >On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
> >  >>   On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> >  >>   >On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> >  >>   >>    This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> >  >>   >>    present, these are just stubs which perform directly cpu physical memory
> >  >>   >>    accesses.
> >  >>   >>
> >  >>   >>    Using these stubs, however, distinguishes PCI device DMA transactions from
> >  >>   >>    other accesses to physical memory, which will allow PCI IOMMU support to
> >  >>   >>    be added in one place, rather than updating every PCI driver at that time.
> >  >>   >>
> >  >>   >>    That is, it allows us to update individual PCI drivers to support an IOMMU
> >  >>   >>    without having yet determined the details of how the IOMMU emulation will
> >  >>   >>    operate.  This will let us remove the most bitrot-sensitive part of an
> >  >>   >>    IOMMU patch in advance.
> >  >>   >>
> >  >>   >>    Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
> >  >>   >
> >  >>   >So something I just thought about:
> >  >>   >
> >  >>   >all wrappers now go through cpu_physical_memory_rw.
> >  >>   >This is a problem as e.g. virtio assumes that
> >  >>   >accesses such as stw are atomic. cpu_physical_memory_rw
> >  >>   >is a memcpy which makes no such guarantees.
> >  >>   >
> >  >>
> >  >>   Let's change cpu_physical_memory_rw() to provide that guarantee for
> >  >>   aligned two and four byte accesses.  Having separate paths just for
> >  >>   that is not maintainable.
> >  >
> >  >Well, we also have stX_phys convert to target native endian-ness
> >  >(nop for KVM but not necessarily for qemu).
> >  >
> >  >So if we do what you suggest, this patch will become more correct, but
> >  >it would still need to duplicate the endian-ness work.
> >  >
> >  >For that reason, I think calling stX_phys and friends from pci
> >  >makes more sense - we get more simple inline wrappers
> >  >but that code duplication worries me much less than tricky
> >  >endian-ness hidden within a macro.
> >  >
> >
> >  Good point.  Though this is really a virtio specific issue since
> >  other devices have explicit endianness (not guest dependent).
>
> Hmm, not entirely virtio specific, some devices use stX macros to do the
> conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> places.

These are fine - explicit endianness.

> >  I think endian conversion is best made explicit in virtio (like
> >  e1000 does explicit conversions to little endian).
>
> That's certainly possible. Though it's hard to see why duplicating e.g.
>
> static void e100_stw_le_phys(target_phys_addr_t addr, uint16_t val)
> {
>      val = cpu_to_le16(val);
>      cpu_physical_memory_write(addr, &val, sizeof(val));
> }
>
> is a better idea than a central utility that does this.
> Maybe the address is not guaranteed to be aligned in the e100
> case.

The general case is dma'ing a structure, not a single field.  That 
doesn't mean we shouldn't have a helper.
Michael S. Tsirkin - Oct. 2, 2011, 12:14 p.m.
On Sun, Oct 02, 2011 at 02:01:10PM +0200, Avi Kivity wrote:
> On 10/02/2011 01:17 PM, Michael S. Tsirkin wrote:
> >On Sun, Oct 02, 2011 at 12:58:35PM +0200, Avi Kivity wrote:
> >>  On 10/02/2011 12:52 PM, Michael S. Tsirkin wrote:
> >>  >On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
> >>  >>   On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> >>  >>   >On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> >>  >>   >>    This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> >>  >>   >>    present, these are just stubs which perform directly cpu physical memory
> >>  >>   >>    accesses.
> >>  >>   >>
> >>  >>   >>    Using these stubs, however, distinguishes PCI device DMA transactions from
> >>  >>   >>    other accesses to physical memory, which will allow PCI IOMMU support to
> >>  >>   >>    be added in one place, rather than updating every PCI driver at that time.
> >>  >>   >>
> >>  >>   >>    That is, it allows us to update individual PCI drivers to support an IOMMU
> >>  >>   >>    without having yet determined the details of how the IOMMU emulation will
> >>  >>   >>    operate.  This will let us remove the most bitrot-sensitive part of an
> >>  >>   >>    IOMMU patch in advance.
> >>  >>   >>
> >>  >>   >>    Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
> >>  >>   >
> >>  >>   >So something I just thought about:
> >>  >>   >
> >>  >>   >all wrappers now go through cpu_physical_memory_rw.
> >>  >>   >This is a problem as e.g. virtio assumes that
> >>  >>   >accesses such as stw are atomic. cpu_physical_memory_rw
> >>  >>   >is a memcpy which makes no such guarantees.
> >>  >>   >
> >>  >>
> >>  >>   Let's change cpu_physical_memory_rw() to provide that guarantee for
> >>  >>   aligned two and four byte accesses.  Having separate paths just for
> >>  >>   that is not maintainable.
> >>  >
> >>  >Well, we also have stX_phys convert to target native endian-ness
> >>  >(nop for KVM but not necessarily for qemu).
> >>  >
> >>  >So if we do what you suggest, this patch will become more correct, but
> >>  >it would still need to duplicate the endian-ness work.
> >>  >
> >>  >For that reason, I think calling stX_phys and friends from pci
> >>  >makes more sense - we get more simple inline wrappers
> >>  >but that code duplication worries me much less than tricky
> >>  >endian-ness hidden within a macro.
> >>  >
> >>
> >>  Good point.  Though this is really a virtio specific issue since
> >>  other devices have explicit endianness (not guest dependent).
> >
> >Hmm, not entirely virtio specific, some devices use stX macros to do the
> >conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> >places.
> 
> These are fine - explicit endianness.

Right. So changing these to e.g. stl_dma and assuming
LE is default seems like a step backwards.

> >>  I think endian conversion is best made explicit in virtio (like
> >>  e1000 does explicit conversions to little endian).
> >
> >That's certainly possible. Though it's hard to see why duplicating e.g.
> >
> >static void e100_stw_le_phys(target_phys_addr_t addr, uint16_t val)
> >{
> >     val = cpu_to_le16(val);
> >     cpu_physical_memory_write(addr, &val, sizeof(val));
> >}
> >
> >is a better idea than a central utility that does this.
> >Maybe the address is not guaranteed to be aligned in the e100
> >case.
> 
> The general case is dma'ing a structure, not a single field.  That
> doesn't mean we shouldn't have a helper.
> 
> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - Oct. 2, 2011, 12:26 p.m.
On 10/02/2011 02:14 PM, Michael S. Tsirkin wrote:
> >
> >  These are fine - explicit endianness.
>
> Right. So changing these to e.g. stl_dma and assuming
> LE is default seems like a step backwards.

Agree.  "l" implies a word with some endianness, not "4 unstructured bytes".
Anthony Liguori - Oct. 3, 2011, 1:10 p.m.
On 10/02/2011 05:25 AM, Michael S. Tsirkin wrote:
> On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
>> This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
>> present, these are just stubs which perform directly cpu physical memory
>> accesses.
>>
>> Using these stubs, however, distinguishes PCI device DMA transactions from
>> other accesses to physical memory, which will allow PCI IOMMU support to
>> be added in one place, rather than updating every PCI driver at that time.
>>
>> That is, it allows us to update individual PCI drivers to support an IOMMU
>> without having yet determined the details of how the IOMMU emulation will
>> operate.  This will let us remove the most bitrot-sensitive part of an
>> IOMMU patch in advance.
>>
>> Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
>
> So something I just thought about:
>
> all wrappers now go through cpu_physical_memory_rw.
> This is a problem as e.g. virtio assumes that
> accesses such as stw are atomic. cpu_physical_memory_rw
> is a memcpy which makes no such guarantees.

That's why I asked for David to add map/unmap functions to the API.

Regards,

Anthony Liguori
Anthony Liguori - Oct. 3, 2011, 1:17 p.m.
On 10/02/2011 07:14 AM, Michael S. Tsirkin wrote:
> On Sun, Oct 02, 2011 at 02:01:10PM +0200, Avi Kivity wrote:
>>> Hmm, not entirely virtio specific, some devices use stX macros to do the
>>> conversion.  E.g. stw_be_phys and stl_le_phys are used in several
>>> places.
>>
>> These are fine - explicit endianness.
>
> Right. So changing these to e.g. stl_dma and assuming
> LE is default seems like a step backwards.

We're generalizing too much.

In general, the device model doesn't need atomic access functions.  That's 
because device model RAM access is not coherent with CPU RAM access.

Virtio is a very, very special case.  virtio requires coherent RAM access.

What we really ought to do is have a variant of the map API that allows for an 
invalidation callback.  That would allow us to map the ring for longer periods 
of time and then we could access the memory directly.

That would not only solve this problem but also improve overall performance.

Regards,

Anthony Liguori
David Gibson - Oct. 12, 2011, 3:07 a.m.
On Sun, Oct 02, 2011 at 02:14:28PM +0200, Michael S. Tsirkin wrote:
> On Sun, Oct 02, 2011 at 02:01:10PM +0200, Avi Kivity wrote:
> > On 10/02/2011 01:17 PM, Michael S. Tsirkin wrote:
> > >On Sun, Oct 02, 2011 at 12:58:35PM +0200, Avi Kivity wrote:
> > >>  On 10/02/2011 12:52 PM, Michael S. Tsirkin wrote:
> > >>  >On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
> > >>  >>   On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> > >>  >>   >On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> > >>  >>   >>    This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> > >>  >>   >>    present, these are just stubs which perform directly cpu physical memory
> > >>  >>   >>    accesses.
> > >>  >>   >>
> > >>  >>   >>    Using these stubs, however, distinguishes PCI device DMA transactions from
> > >>  >>   >>    other accesses to physical memory, which will allow PCI IOMMU support to
> > >>  >>   >>    be added in one place, rather than updating every PCI driver at that time.
> > >>  >>   >>
> > >>  >>   >>    That is, it allows us to update individual PCI drivers to support an IOMMU
> > >>  >>   >>    without having yet determined the details of how the IOMMU emulation will
> > >>  >>   >>    operate.  This will let us remove the most bitrot-sensitive part of an
> > >>  >>   >>    IOMMU patch in advance.
> > >>  >>   >>
> > >>  >>   >>    Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
> > >>  >>   >
> > >>  >>   >So something I just thought about:
> > >>  >>   >
> > >>  >>   >all wrappers now go through cpu_physical_memory_rw.
> > >>  >>   >This is a problem as e.g. virtio assumes that
> > >>  >>   >accesses such as stw are atomic. cpu_physical_memory_rw
> > >>  >>   >is a memcpy which makes no such guarantees.
> > >>  >>   >
> > >>  >>
> > >>  >>   Let's change cpu_physical_memory_rw() to provide that guarantee for
> > >>  >>   aligned two and four byte accesses.  Having separate paths just for
> > >>  >>   that is not maintainable.
> > >>  >
> > >>  >Well, we also have stX_phys convert to target native endian-ness
> > >>  >(nop for KVM but not necessarily for qemu).
> > >>  >
> > >>  >So if we do what you suggest, this patch will become more correct, but
> > >>  >it would still need to duplicate the endian-ness work.
> > >>  >
> > >>  >For that reason, I think calling stX_phys and friends from pci
> > >>  >makes more sense - we get more simple inline wrappers
> > >>  >but that code duplication worries me much less than tricky
> > >>  >endian-ness hidden within a macro.
> > >>  >
> > >>
> > >>  Good point.  Though this is really a virtio specific issue since
> > >>  other devices have explicit endianness (not guest dependent).
> > >
> > >Hmm, not entirely virtio specific, some devices use stX macros to do the
> > >conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> > >places.
> > 
> > These are fine - explicit endianness.
> 
> Right. So changing these to e.g. stl_dma and assuming
> LE is default seems like a step backwards.

Um.. why?  PCI is defined by the spec to be LE, so I don't see that we
need explicit endianness versions for PCI helpers.
David Gibson - Oct. 12, 2011, 3:09 a.m.
On Mon, Oct 03, 2011 at 08:17:05AM -0500, Anthony Liguori wrote:
> On 10/02/2011 07:14 AM, Michael S. Tsirkin wrote:
> >On Sun, Oct 02, 2011 at 02:01:10PM +0200, Avi Kivity wrote:
> >>>Hmm, not entirely virtio specific, some devices use stX macros to do the
> >>>conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> >>>places.
> >>
> >>These are fine - explicit endianness.
> >
> >Right. So changing these to e.g. stl_dma and assuming
> >LE is default seems like a step backwards.
> 
> We're generalizing too much.
> 
> In general, the device model doesn't need atomic access functions.
> That's because device model RAM access is not coherent with CPU RAM
> access.
> 
> Virtio is a very, very special case.  virtio requires coherent RAM
> access.

Right, but it should only need that for the actual rings in the virtio
core.  I was expecting that those would remain as direct physical
memory accesses - precisely because virtio is special - rather than
accesses through any kind of DMA interface.
David Gibson - Oct. 12, 2011, 3:11 a.m.
On Sun, Oct 02, 2011 at 12:52:39PM +0200, Michael S. Tsirkin wrote:
> On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
> > On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> > >On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> > >>  This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> > >>  present, these are just stubs which perform directly cpu physical memory
> > >>  accesses.
> > >>
> > >>  Using these stubs, however, distinguishes PCI device DMA transactions from
> > >>  other accesses to physical memory, which will allow PCI IOMMU support to
> > >>  be added in one place, rather than updating every PCI driver at that time.
> > >>
> > >>  That is, it allows us to update individual PCI drivers to support an IOMMU
> > >>  without having yet determined the details of how the IOMMU emulation will
> > >>  operate.  This will let us remove the most bitrot-sensitive part of an
> > >>  IOMMU patch in advance.
> > >>
> > >>  Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
> > >
> > >So something I just thought about:
> > >
> > >all wrappers now go through cpu_physical_memory_rw.
> > >This is a problem as e.g. virtio assumes that
> > >accesses such as stw are atomic. cpu_physical_memory_rw
> > >is a memcpy which makes no such guarantees.
> > >
> > 
> > Let's change cpu_physical_memory_rw() to provide that guarantee for
> > aligned two and four byte accesses.  Having separate paths just for
> > that is not maintainable.
> 
> Well, we also have stX_phys convert to target native endian-ness
> (nop for KVM but not necessarily for qemu).

Yes.. as do the stX_pci_dma() helpers.  They assume LE, rather than
having two variants, because PCI is an LE spec, and all normal PCI
devices work in LE.  If we need to model some perverse BE PCI device,
it can reswap itself.
Michael S. Tsirkin - Oct. 12, 2011, 7:22 a.m.
On Wed, Oct 12, 2011 at 02:07:46PM +1100, David Gibson wrote:
> Um.. why?  PCI is defined by the spec to be LE, so I don't see that we
> need explicit endianness versions for PCI helpers.

LE in the spec only applies to structures defined by the spec,
that is pci configuration and msix tables in device memory.
Michael S. Tsirkin - Oct. 12, 2011, 8:44 a.m.
On Wed, Oct 12, 2011 at 02:11:37PM +1100, David Gibson wrote:
> On Sun, Oct 02, 2011 at 12:52:39PM +0200, Michael S. Tsirkin wrote:
> > On Sun, Oct 02, 2011 at 12:29:08PM +0200, Avi Kivity wrote:
> > > On 10/02/2011 12:25 PM, Michael S. Tsirkin wrote:
> > > >On Mon, Sep 05, 2011 at 02:34:56PM +1000, David Gibson wrote:
> > > >>  This patch adds functions to pci.[ch] to perform PCI DMA operations.  At
> > > >>  present, these are just stubs which perform directly cpu physical memory
> > > >>  accesses.
> > > >>
> > > >>  Using these stubs, however, distinguishes PCI device DMA transactions from
> > > >>  other accesses to physical memory, which will allow PCI IOMMU support to
> > > >>  be added in one place, rather than updating every PCI driver at that time.
> > > >>
> > > >>  That is, it allows us to update individual PCI drivers to support an IOMMU
> > > >>  without having yet determined the details of how the IOMMU emulation will
> > > >>  operate.  This will let us remove the most bitrot-sensitive part of an
> > > >>  IOMMU patch in advance.
> > > >>
> > > >>  Signed-off-by: David Gibson<david@gibson.dropbear.id.au>
> > > >
> > > >So something I just thought about:
> > > >
> > > >all wrappers now go through cpu_physical_memory_rw.
> > > >This is a problem as e.g. virtio assumes that
> > > >accesses such as stw are atomic. cpu_physical_memory_rw
> > > >is a memcpy which makes no such guarantees.
> > > >
> > > 
> > > Let's change cpu_physical_memory_rw() to provide that guarantee for
> > > aligned two and four byte accesses.  Having separate paths just for
> > > that is not maintainable.
> > 
> > Well, we also have stX_phys convert to target native endian-ness
> > (nop for KVM but not necessarily for qemu).
> 
> Yes.. as do the stX_pci_dma() helpers.  They assume LE, rather than
> having two variants, because PCI is an LE spec, and all normal PCI
> devices work in LE.

IMO, not really. PCI devices do DMA any way they like.  LE is
probably more common because both ARM and x86 processors are LE.

>  If we need to model some perverse BE PCI device,
> it can reswap itself.

An explicit API for this would be cleaner.
Gerd Hoffmann - Oct. 12, 2011, 9:08 a.m.
Hi,

>> Yes.. as do the stX_pci_dma() helpers.  They assume LE, rather than
>> having two variants, because PCI is an LE spec, and all normal PCI
>> devices work in LE.
>
> IMO, not really. PCI devices do DMA any way they like.  LE is
> probably more common because both ARM and x86 processors are LE.

Also having _le_ in the function name makes explicitly clear that the 
functions read/write little endian values and byteswaps if needed, which 
makes the code more readable.  I'd suggest to add it even if there is no 
need for a _be_ companion as devices needing that are rare.

cheers,
   Gerd
Michael S. Tsirkin - Oct. 12, 2011, 9:12 a.m.
On Wed, Oct 12, 2011 at 02:09:26PM +1100, David Gibson wrote:
> On Mon, Oct 03, 2011 at 08:17:05AM -0500, Anthony Liguori wrote:
> > On 10/02/2011 07:14 AM, Michael S. Tsirkin wrote:
> > >On Sun, Oct 02, 2011 at 02:01:10PM +0200, Avi Kivity wrote:
> > >>>Hmm, not entirely virtio specific, some devices use stX macros to do the
> > >>>conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> > >>>places.
> > >>
> > >>These are fine - explicit endianness.
> > >
> > >Right. So changing these to e.g. stl_dma and assuming
> > >LE is default seems like a step backwards.
> > 
> > We're generalizing too much.
> > 
> > In general, the device model doesn't need atomic access functions.

Anthony, are you sure? PCI provides atomic operations for devices
(though their use is likely uncommon), and the PCI Express spec strongly
recommends at least dword update granularity for both reads and writes.
Some guests might depend on this.

> > That's because device model RAM access is not coherent with CPU RAM
> > access.
> > Virtio is a very, very special case.  virtio requires coherent RAM
> > access.

E.g., e1000 driver seems to allocate its rings in coherent memory.
Required? Your guess is as good as mine. It seems to work fine
ATM without these guarantees.

> Right, but it should only need that for the actual rings in the virtio
> core.  I was expecting that those would remain as direct physical
> memory accesses - precisely because virtio is special - rather than
> accesses through any kind of DMA interface.

At the moment, yes. Further, that was just an example I know about.
How about msi/msix? We don't want to
split these writes as that would confuse the APIC.

> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
David Gibson - Oct. 12, 2011, 3:43 p.m.
On Wed, Oct 12, 2011 at 09:22:01AM +0200, Michael S. Tsirkin wrote:
> On Wed, Oct 12, 2011 at 02:07:46PM +1100, David Gibson wrote:
> > Um.. why?  PCI is defined by the spec to be LE, so I don't see that we
> > need explicit endianness versions for PCI helpers.
> 
> LE in the spec only applies to structures defined by the spec,
> that is pci configuration and msix tables in device memory.

Well, true.  But when was the last time you saw a PCI device with BE
registers?  I think it's a rare enough case that the individual device
models can reswap themselves.
David Gibson - Oct. 12, 2011, 3:45 p.m.
On Thu, Oct 13, 2011 at 02:43:06AM +1100, David Gibson wrote:
> On Wed, Oct 12, 2011 at 09:22:01AM +0200, Michael S. Tsirkin wrote:
> > On Wed, Oct 12, 2011 at 02:07:46PM +1100, David Gibson wrote:
> > > Um.. why?  PCI is defined by the spec to be LE, so I don't see that we
> > > need explicit endianness versions for PCI helpers.
> > 
> > LE in the spec only applies to structures defined by the spec,
> > that is pci configuration and msix tables in device memory.
> 
> Well, true.  But when was the last time you saw a PCI device with BE
> registers?  I think it's a rare enough case that the individual device
> models can reswap themselves.

s/BE registers/BE DMA accessed structures/
David Gibson - Oct. 14, 2011, 2:14 a.m.
On Mon, Oct 03, 2011 at 08:17:05AM -0500, Anthony Liguori wrote:
> On 10/02/2011 07:14 AM, Michael S. Tsirkin wrote:
> >On Sun, Oct 02, 2011 at 02:01:10PM +0200, Avi Kivity wrote:
> >>>Hmm, not entirely virtio specific, some devices use stX macros to do the
> >>>conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> >>>places.
> >>
> >>These are fine - explicit endianness.
> >
> >Right. So changing these to e.g. stl_dma and assuming
> >LE is default seems like a step backwards.
> 
> We're generalizing too much.
> 
> In general, the device model doesn't need atomic access functions.
> That's because device model RAM access is not coherent with CPU RAM
> access.

Ok, so the next spin of these patches will have explicit LE and BE
versions of the accessors by popular demand.  I'm still using
cpu_physical_memory_rw() as the backend though, because I can't see a
case where a device could safely _require_ an emulated DMA access to
be atomic.

> Virtio is a very, very special case.  virtio requires coherent RAM access.

Right.  Virtio's access to memory is *not* emulated PCI DMA, it's
god-like hypervisor access to guest system memory.  It should
correctly bypass any IOMMU, and so should remain as
cpu_physical_memory_rw() or the atomic accessors, rather than being
converted to this new API.
Michael S. Tsirkin - Oct. 16, 2011, 12:50 p.m.
On Fri, Oct 14, 2011 at 01:14:07PM +1100, David Gibson wrote:
> On Mon, Oct 03, 2011 at 08:17:05AM -0500, Anthony Liguori wrote:
> > On 10/02/2011 07:14 AM, Michael S. Tsirkin wrote:
> > >On Sun, Oct 02, 2011 at 02:01:10PM +0200, Avi Kivity wrote:
> > >>>Hmm, not entirely virtio specific, some devices use stX macros to do the
> > >>>conversion.  E.g. stw_be_phys and stl_le_phys are used in several
> > >>>places.
> > >>
> > >>These are fine - explicit endianness.
> > >
> > >Right. So changing these to e.g. stl_dma and assuming
> > >LE is default seems like a step backwards.
> > 
> > We're generalizing too much.
> > 
> > In general, the device model doesn't need atomic access functions.
> > That's because device model RAM access is not coherent with CPU RAM
> > access.
> 
> Ok, so the next spin of these patches will have explicit LE and BE
> versions of the accessors by popular demand.  I'm still using
> cpu_physical_memory_rw() as the backend though, because I can't see a
> case where a device could safely _require_ an emulated DMA access to
> be atomic.

You don't?
PCI spec supports atomic operations. It also strongly recommends
not splitting accesses below dword boundary.

> > Virtio is a very, very special case.  virtio requires coherent RAM access.
> 
> Right.  Virtio's access to memory is *not* emulated PCI DMA, it's
> god-like hypervisor access to guest system memory.  It should
> correctly bypass any IOMMU, and so should remain as
> cpu_physical_memory_rw() or the atomic accessors, rather than being
> converted to this new API.
> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
Avi Kivity - Oct. 16, 2011, 1:15 p.m.
On 10/14/2011 04:14 AM, David Gibson wrote:
> > Virtio is a very, very special case.  virtio requires coherent RAM access.
>
> Right.  Virtio's access to memory is *not* emulated PCI DMA, it's
> god-like hypervisor access to guest system memory.  It should
> correctly bypass any IOMMU, and so should remain as
> cpu_physical_memory_rw() or the atomic accessors, rather than being
> converted to this new API.

virtio should definitely not bypass an iommu.  A guest may assign a
virtio device to nested guests, and would wish it confined by the
emulated iommu.

More generally, a guest sees a virtio device as just another pci device,
and has no way to tell that it bypasses the iommu.
David Gibson - Oct. 18, 2011, 1:46 a.m.
On Sun, Oct 16, 2011 at 03:15:53PM +0200, Avi Kivity wrote:
> On 10/14/2011 04:14 AM, David Gibson wrote:
> > > Virtio is a very, very special case.  virtio requires coherent RAM access.
> >
> > Right.  Virtio's access to memory is *not* emulated PCI DMA, it's
> > god-like hypervisor access to guest system memory.  It should
> > correctly bypass any IOMMU, and so should remain as
> > cpu_physical_memory_rw() or the atomic accessors, rather than being
> > converted to this new API.
> 
> virtio should definitely not bypass an iommu.

So, I just had a chat with Rusty about this.  Perhaps it shouldn't,
but it does.  The spec is in terms of guest physical addresses, not
bus/DMA addresses, and more to the point the Linux driver does *not*
do the necessary dma_map() and unmap operations to treat this as a PCI
DMA.  So like it or not, god-like hypervisor access rather than
emulated PCI DMA is what it does.

>  A guest may assign a
> virtio device to nested guests, and would wish it confined by the
> emulated iommu.

Well, that would be nice, but it can't be done.  It could be fixed,
but it would be an incompatible change, so it would need a new feature
bit and corresponding changes in the Linux driver to do the dma
map/unmap if it accepts the "respect IOMMU" feature.

> More generally, a guest sees a virtio device as just another pci device,
> and has no way to tell that it bypasses the iommu.

Well, except the fact that the driver knows its a virtio device,
because it's a virtio driver.  It's not like you can write a driver
that uses PCI DMA without knowing the particulars of the device you're
using.
Rusty Russell - Oct. 19, 2011, 12:31 a.m.
On Tue, 18 Oct 2011 12:46:50 +1100, David Gibson <dwg@au1.ibm.com> wrote:
> On Sun, Oct 16, 2011 at 03:15:53PM +0200, Avi Kivity wrote:
> > On 10/14/2011 04:14 AM, David Gibson wrote:
> > > > Virtio is a very, very special case.  virtio requires coherent RAM access.
> > >
> > > Right.  Virtio's access to memory is *not* emulated PCI DMA, it's
> > > god-like hypervisor access to guest system memory.  It should
> > > correctly bypass any IOMMU, and so should remain as
> > > cpu_physical_memory_rw() or the atomic accessors, rather than being
> > > converted to this new API.
> > 
> > virtio should definitely not bypass an iommu.
> 
> So, I just had a chat with Rusty about this.  Perhaps it shouldn't,
> but it does.  The spec is in terms of guest physical addresses, not
> bus/DMA addresses, and more to the point the Linux driver does *not*
> do the necessary dma_map() and unmap operations to treat this as a PCI
> DMA.  So like it or not, god-like hypervisor access rather than
> emulated PCI DMA is what it does.

Yep, it shouldn't but it does.  Can't break it now without a feature
bit, and there's no particular reason to add it until someone really
wants it.

Cheers,
Rusty.
Michael S. Tsirkin - Oct. 19, 2011, 1:22 a.m.
On Tue, Oct 18, 2011 at 12:46:50PM +1100, David Gibson wrote:
> On Sun, Oct 16, 2011 at 03:15:53PM +0200, Avi Kivity wrote:
> > On 10/14/2011 04:14 AM, David Gibson wrote:
> > > > Virtio is a very, very special case.  virtio requires coherent RAM access.
> > >
> > > Right.  Virtio's access to memory is *not* emulated PCI DMA, it's
> > > god-like hypervisor access to guest system memory.  It should
> > > correctly bypass any IOMMU, and so should remain as
> > > cpu_physical_memory_rw() or the atomic accessors, rather than being
> > > converted to this new API.
> > 
> > virtio should definitely not bypass an iommu.
> 
> So, I just had a chat with Rusty about this.  Perhaps it shouldn't,
> but it does.  The spec is in terms of guest physical addresses, not
> bus/DMA addresses, and more to the point the Linux driver does *not*
> do the necessary dma_map() and unmap operations to treat this as a PCI
> DMA.  So like it or not, god-like hypervisor access rather than
> emulated PCI DMA is what it does.

Fine, but I'm convinced virtio is not unique in that
it wants atomic accesses.

I just looked at hw/rtl8139.c as one example.
It uses a high bit in a 32 bit register to signal
descriptor ownership. Thus we need to read that
bit first, or read the register atomically.

Current code does cpu_physical_memory_read
which does neither of these things, but
it seems to be a bug. An atomic load would
be the best solution.
Avi Kivity - Oct. 19, 2011, 9:10 a.m.
On 10/18/2011 03:46 AM, David Gibson wrote:
> On Sun, Oct 16, 2011 at 03:15:53PM +0200, Avi Kivity wrote:
> > On 10/14/2011 04:14 AM, David Gibson wrote:
> > > > Virtio is a very, very special case.  virtio requires coherent RAM access.
> > >
> > > Right.  Virtio's access to memory is *not* emulated PCI DMA, it's
> > > god-like hypervisor access to guest system memory.  It should
> > > correctly bypass any IOMMU, and so should remain as
> > > cpu_physical_memory_rw() or the atomic accessors, rather than being
> > > converted to this new API.
> > 
> > virtio should definitely not bypass an iommu.
>
> So, I just had a chat with Rusty about this.  Perhaps it shouldn't,
> but it does.  The spec is in terms of guest physical addresses, not
> bus/DMA addresses, and more to the point the Linux driver does *not*
> do the necessary dma_map() and unmap operations to treat this as a PCI
> DMA.  So like it or not, god-like hypervisor access rather than
> emulated PCI DMA is what it does.

Wow, how did we manage to break virtio in so many different ways?

Is there a way to unbreak it?  On x86 it will continue to work if we
rewrite the spec in terms of pci dma, what about non-x86?

>
> >  A guest may assign a
> > virtio device to nested guests, and would wish it confined by the
> > emulated iommu.
>
> Well, that would be nice, but it can't be done.  It could be fixed,
> but it would be an incompatible change so it would need a new feature
> bit corresponding changes in the Linux driver to do the dma map/unmap
> if it accepts the "respect IOMMU" feature.

Needs to be done IMO.

>
> > More generally, a guest sees a virtio device as just another pci device,
> > and has no way to tell that it bypasses the iommu.
>
> Well, except the fact that the driver knows its a virtio device,
> because it's a virtio driver.  It's not like you can write a driver
> that uses PCI DMA without knowing the particulars of the device you're
> using.

virtio-pci knows it's pci, there's no excuse.
David Gibson - Oct. 20, 2011, 2:58 a.m.
On Wed, Oct 19, 2011 at 11:10:15AM +0200, Avi Kivity wrote:
> On 10/18/2011 03:46 AM, David Gibson wrote:
> > On Sun, Oct 16, 2011 at 03:15:53PM +0200, Avi Kivity wrote:
> > > On 10/14/2011 04:14 AM, David Gibson wrote:
> > > > > Virtio is a very, very special case.  virtio requires coherent RAM access.
> > > >
> > > > Right.  Virtio's access to memory is *not* emulated PCI DMA, it's
> > > > god-like hypervisor access to guest system memory.  It should
> > > > correctly bypass any IOMMU, and so should remain as
> > > > cpu_physical_memory_rw() or the atomic accessors, rather than being
> > > > converted to this new API.
> > > 
> > > virtio should definitely not bypass an iommu.
> >
> > So, I just had a chat with Rusty about this.  Perhaps it shouldn't,
> > but it does.  The spec is in terms of guest physical addresses, not
> > bus/DMA addresses, and more to the point the Linux driver does *not*
> > do the necessary dma_map() and unmap operations to treat this as a PCI
> > DMA.  So like it or not, god-like hypervisor access rather than
> > emulated PCI DMA is what it does.
> 
> Wow, how did we manage to break virtio in so many different ways?
> 
> Is there a way to unbreak it?

Yes, using a feature bit.

>  On x86 it will continue to work if we
> rewrite the spec in terms of pci dma, what about non-x86?

No, anything with a non-optional IOMMU will break horribly.  That's
why we need a feature bit.

> > >  A guest may assign a
> > > virtio device to nested guests, and would wish it confined by the
> > > emulated iommu.
> >
> > Well, that would be nice, but it can't be done.  It could be fixed,
> > but it would be an incompatible change so it would need a new feature
> > bit corresponding changes in the Linux driver to do the dma map/unmap
> > if it accepts the "respect IOMMU" feature.
> 
> Needs to be done IMO.

Well, sure, but my point is that I'm not volunteering for it.  Someone
who actually needs the feature can do the work.

Patch

diff --git a/dma.h b/dma.h
index a6db5ba..06e91cb 100644
--- a/dma.h
+++ b/dma.h
@@ -15,6 +15,8 @@ 
 #include "hw/hw.h"
 #include "block.h"
 
+typedef target_phys_addr_t dma_addr_t; /* bus address as seen by a DMA-capable device; 1:1 with guest physical addresses until IOMMU support lands */
+
 typedef struct {
     target_phys_addr_t base;
     target_phys_addr_t len;
diff --git a/hw/pci.c b/hw/pci.c
index 1cdcbb7..0be7611 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -2211,3 +2211,34 @@  MemoryRegion *pci_address_space(PCIDevice *dev)
 {
     return dev->bus->address_space_mem;
 }
+
+#define PCI_DMA_DEFINE_LDST(_lname, _sname, _bits) \
+    uint##_bits##_t ld##_lname##_pci_dma(PCIDevice *dev, dma_addr_t addr) \
+    { \
+        uint##_bits##_t val; \
+        pci_dma_read(dev, addr, &val, sizeof(val)); \
+        return le##_bits##_to_cpu(val); \
+    } \
+    void st##_sname##_pci_dma(PCIDevice *dev, \
+                              dma_addr_t addr, uint##_bits##_t val) \
+    { \
+        val = cpu_to_le##_bits(val); \
+        pci_dma_write(dev, addr, &val, sizeof(val)); \
+    }
+
+uint8_t ldub_pci_dma(PCIDevice *dev, dma_addr_t addr) /* open-coded: a single byte needs no endian swap */
+{
+    uint8_t val;
+
+    pci_dma_read(dev, addr, &val, sizeof(val));
+    return val;
+}
+
+void stb_pci_dma(PCIDevice *dev, dma_addr_t addr, uint8_t val)
+{
+    pci_dma_write(dev, addr, &val, sizeof(val));
+}
+
+PCI_DMA_DEFINE_LDST(uw, w, 16) /* no ';': the expansion already ends in '}' */
+PCI_DMA_DEFINE_LDST(l, l, 32)
+PCI_DMA_DEFINE_LDST(q, q, 64)
diff --git a/hw/pci.h b/hw/pci.h
index 391217e..4426e9d 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -6,6 +6,7 @@ 
 
 #include "qdev.h"
 #include "memory.h"
+#include "dma.h"
 
 /* PCI includes legacy ISA access.  */
 #include "isa.h"
@@ -492,4 +493,36 @@  static inline uint32_t pci_config_size(const PCIDevice *d)
     return pci_is_express(d) ? PCIE_CONFIG_SPACE_SIZE : PCI_CONFIG_SPACE_SIZE;
 }
 
+/* DMA access stubs: currently a straight pass-through to guest physical memory; the single point where PCI IOMMU translation will later hook in */
+static inline int pci_dma_rw(PCIDevice *dev, dma_addr_t addr,
+                             void *buf, dma_addr_t len, int is_write)
+{
+    cpu_physical_memory_rw(addr, buf, len, is_write); /* dev unused until an IOMMU exists */
+    return 0; /* cannot fail yet; return code reserved for future translation faults */
+}
+
+static inline int pci_dma_read(PCIDevice *dev, dma_addr_t addr,
+                               void *buf, dma_addr_t len)
+{
+    return pci_dma_rw(dev, addr, buf, len, 0);
+}
+
+static inline int pci_dma_write(PCIDevice *dev, dma_addr_t addr,
+                                const void *buf, dma_addr_t len)
+{
+    return pci_dma_rw(dev, addr, (void *) buf, len, 1); /* cast drops const: underlying API takes void * */
+}
+
+#define PCI_DMA_DECLARE_LDST(_lname, _sname, _bits) \
+    uint##_bits##_t ld##_lname##_pci_dma(PCIDevice *dev, dma_addr_t addr); \
+    void st##_sname##_pci_dma(PCIDevice *dev, dma_addr_t addr, \
+                              uint##_bits##_t val);
+
+PCI_DMA_DECLARE_LDST(ub, b, 8)  /* no ';': the expansion already ends in ';' */
+PCI_DMA_DECLARE_LDST(uw, w, 16)
+PCI_DMA_DECLARE_LDST(l, l, 32)
+PCI_DMA_DECLARE_LDST(q, q, 64)
+
+#undef PCI_DMA_DECLARE_LDST /* was '#undef DECLARE_LDST_DMA', which named nothing and left the helper macro defined */
+
 #endif