Patchwork [RFC] dma_rw.h (was Re: [PATCH 0/7] AMD IOMMU emulation patchset v4)

login
register
mail settings
Submitter Michael S. Tsirkin
Date Sept. 13, 2010, 8:01 p.m.
Message ID <20100913200120.GA20918@redhat.com>
Download mbox | patch
Permalink /patch/64645/
State New
Headers show

Comments

Michael S. Tsirkin - Sept. 13, 2010, 8:01 p.m.
So I think the following will give the idea of what an API
might look like that will let us avoid the scary hacks in
e.g. the ide layer and other generic layers that need to do DMA,
without either binding us to pci, adding more complexity with
callbacks, or losing type safety with casts and void*.

Basically we have DMADevice that we can use container_of on
to get a PCIDevice from, and DMAMmu that will get instanciated
in a specific MMU.

This is not complete code - just a header - I might complete
this later if/when there's interest or hopefully someone interested
in iommu emulation will.

Notes:
the IOMMU_PERM_RW code seem unused, so I replaced
this with plain is_write. Is it ever useful?

It seems that invalidate callback should be able to
get away with just a device, so I switched to that
from a void pointer for type safety.
Seems enough for the users I saw.

I saw devices do stl_le_phys and such, these
might need to be wrapped as well.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---
Anthony Liguori - Sept. 13, 2010, 8:45 p.m.
On 09/13/2010 03:01 PM, Michael S. Tsirkin wrote:
> So I think the following will give the idea of what an API
> might look like that will let us avoid the scary hacks in
> e.g. the ide layer and other generic layers that need to do DMA,
> without either binding us to pci, adding more complexity with
> callbacks, or losing type safety with casts and void*.
>
> Basically we have DMADevice that we can use container_of on
> to get a PCIDevice from, and DMAMmu that will get instanciated
> in a specific MMU.
>
> This is not complete code - just a header - I might complete
> this later if/when there's interest or hopefully someone interested
> in iommu emulation will.
>
> Notes:
> the IOMMU_PERM_RW code seem unused, so I replaced
> this with plain is_write. Is it ever useful?
>
> It seems that invalidate callback should be able to
> get away with just a device, so I switched to that
> from a void pointer for type safety.
> Seems enough for the users I saw.
>
> I saw devices do stl_le_phys and such, these
> might need to be wrapped as well.
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>    

One of the troubles with an interface like this is that I'm not sure a 
generic model universally works.

For instance, I know some PCI busses do transparent byte swapping.  For 
this to work, there has to be a notion of generic memory reads/writes 
vs. reads of a 32-bit, 16-bit, and 8-bit value.

With a generic API, we lose the flexibility to do this type of bus 
interface.

Regards,

Anthony Liguori

> ---
>
> diff --git a/hw/dma_rw.h b/hw/dma_rw.h
> new file mode 100644
> index 0000000..d63fd17
> --- /dev/null
> +++ b/hw/dma_rw.h
> @@ -0,0 +1,122 @@
> +#ifndef DMA_RW_H
> +#define DMA_RW_H
> +
> +#include "qemu-common.h"
> +
> +/* We currently only have pci mmus, but using
> +   a generic type makes it possible to use this
> +   e.g. from the generic ide code without callbacks. */
> +typedef uint64_t dma_addr_t;
> +
> +typedef struct DMAMmu DMAMmu;
> +typedef struct DMADevice DMADevice;
> +
> +typedef int DMATranslateFunc(DMAMmu *mmu,
> +                             DMADevice *dev,
> +                             dma_addr_t addr,
> +                             dma_addr_t *paddr,
> +                             dma_addr_t *len,
> +                             int is_write);
> +
> +typedef int DMAInvalidateMapFunc(DMADevice *);
> +struct DMAMmu {
> +	/* invalidate, etc. */
> +	DmaTranslateFunc *translate;
> +};
> +
> +struct DMADevice {
> +	DMAMmu *mmu;
> +	DMAInvalidateMapFunc *invalidate;
> +};
> +
> +void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *);
> +
> +static inline void dma_memory_rw(DMADevice *dev,
> +				 dma_addr_t addr,
> +				 void *buf,
> +				 uint32_t len,
> +				 int is_write)
> +{
> +    uint32_t plen;
> +    /* Fast-path non-iommu.
> +     * More importantly, makes it obvious what this function does. */
> +    if (!dev->mmu) {
> +    	cpu_physical_memory_rw(paddr, buf, plen, is_write);
> +    	return;
> +    }
> +    while (len) {
> +        err = dev->mmu->translate(iommu, dev, addr,&paddr,&plen, is_write);
> +        if (err) {
> +            return;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen>  len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_rw(paddr, buf, plen, is_write);
> +
> +        len -= plen;
> +        addr += plen;
> +        buf += plen;
> +    }
> +}
> +
> +void *dma_memory_map(DMADevice *dev,
> +                            dma_addr_t addr,
> +                            uint32_t *len,
> +                            int is_write);
> +void dma_memory_unmap(DMADevice *dev,
> +		      void *buffer,
> +		      uint32_t len,
> +		      int is_write,
> +		      uint32_t access_len);
> +
> +
> ++#define DEFINE_DMA_LD(suffix, size)                                       \
> ++uint##size##_t dma_ld##suffix(DMADevice *dev, dma_addr_t addr)            \
> ++{                                                                         \
> ++    int err;                                                              \
> ++    target_phys_addr_t paddr, plen;                                       \
> ++    if (!dev->mmu) {                                                      \
> ++        return ld##suffix##_phys(addr, val);                              \
> ++    }                                                                     \
> ++                                                                          \
> ++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
> ++                              addr,&paddr,&plen, IOMMU_PERM_READ);      \
> ++    if (err || (plen<  size / 8))                                         \
> ++        return 0;                                                         \
> ++                                                                          \
> ++    return ld##suffix##_phys(paddr);                                      \
> ++}
> ++
> ++#define DEFINE_DMA_ST(suffix, size)                                       \
> ++void dma_st##suffix(DMADevice *dev, dma_addr_t addr, uint##size##_t val)  \
> ++{                                                                         \
> ++    int err;                                                              \
> ++    target_phys_addr_t paddr, plen;                                       \
> ++                                                                          \
> ++    if (!dev->mmu) {                                                      \
> ++        st##suffix##_phys(addr, val);                                     \
> ++        return;                                                           \
> ++    }                                                                     \
> ++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
> ++                              addr,&paddr,&plen, IOMMU_PERM_WRITE);     \
> ++    if (err || (plen<  size / 8))                                         \
> ++        return;                                                           \
> ++                                                                          \
> ++    st##suffix##_phys(paddr, val);                                        \
> ++}
> +
> +DEFINE_DMA_LD(ub, 8)
> +DEFINE_DMA_LD(uw, 16)
> +DEFINE_DMA_LD(l, 32)
> +DEFINE_DMA_LD(q, 64)
> +
> +DEFINE_DMA_ST(b, 8)
> +DEFINE_DMA_ST(w, 16)
> +DEFINE_DMA_ST(l, 32)
> +DEFINE_DMA_ST(q, 64)
> +
> +#endif
> diff --git a/hw/pci.h b/hw/pci.h
> index 1c6075e..9737f0e 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -5,6 +5,7 @@
>   #include "qobject.h"
>
>   #include "qdev.h"
> +#include "dma_rw.h"
>
>   /* PCI includes legacy ISA access.  */
>   #include "isa.h"
> @@ -119,6 +120,10 @@ enum {
>
>   struct PCIDevice {
>       DeviceState qdev;
> +
> +    /* For devices that do DMA. */
> +    DMADevice dma;
> +
>       /* PCI config space */
>       uint8_t *config;
>
>
>
Eduard - Gabriel Munteanu - Sept. 16, 2010, 7:06 a.m.
On Mon, Sep 13, 2010 at 10:01:20PM +0200, Michael S. Tsirkin wrote:
> So I think the following will give the idea of what an API
> might look like that will let us avoid the scary hacks in
> e.g. the ide layer and other generic layers that need to do DMA,
> without either binding us to pci, adding more complexity with
> callbacks, or losing type safety with casts and void*.
> 
> Basically we have DMADevice that we can use container_of on
> to get a PCIDevice from, and DMAMmu that will get instanciated
> in a specific MMU.
> 
> This is not complete code - just a header - I might complete
> this later if/when there's interest or hopefully someone interested
> in iommu emulation will.

Hi,

I personally like this approach better. It also seems to make poisoning
cpu_physical_memory_*() easier if we convert every device to this API.
We could then ban cpu_physical_memory_*(), perhaps by requiring a
#define and #ifdef-ing those declarations.

> Notes:
> the IOMMU_PERM_RW code seem unused, so I replaced
> this with plain is_write. Is it ever useful?

The original idea made provisions for stuff like full R/W memory maps.
In that case cpu_physical_memory_map() would call the translation /
checking function with perms == IOMMU_PERM_RW. That's not there yet so
it can be removed at the moment, especially since it only affects these
helpers.

Also, I'm not sure if there are other sorts of accesses besides reads
and writes we want to check or translate.

> It seems that invalidate callback should be able to
> get away with just a device, so I switched to that
> from a void pointer for type safety.
> Seems enough for the users I saw.

I think this makes matters too complicated. Normally, a single DMADevice
should be embedded within a <bus>Device, so doing this makes it really
hard to invalidate a specific map when there are more of them. It forces
device code to act as a bus, provide fake 'DMADevice's for each map and
dispatch translation to the real DMATranslateFunc. I see no other way.

If you really want more type-safety (although I think this is a case of
a true opaque identifying something only device code understands), I
have another proposal: have a DMAMap embedded in the opaque. Example
from dma-helpers.c:

typedef struct {
	DMADevice *owner;
	[...]
} DMAMap;

typedef struct {
	[...]
	DMAMap map;
	[...]
} DMAAIOCB;

/* The callback. */
static void dma_bdrv_cancel(DMAMap *map)
{
	DMAAIOCB *dbs = container_of(map, DMAAIOCB, map);

	[...]
}

The upside is we only need to pass the DMAMap. That can also contain
details of the actual map in case the device wants to release only the
relevant range and remap the rest.

> I saw devices do stl_le_phys and such, these
> might need to be wrapped as well.

stl_le_phys() is defined and used only by hw/eepro100.c. That's already
dealt with by converting the device.


	Thanks,
	Eduard

> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> ---
> 
> diff --git a/hw/dma_rw.h b/hw/dma_rw.h
> new file mode 100644
> index 0000000..d63fd17
> --- /dev/null
> +++ b/hw/dma_rw.h
> @@ -0,0 +1,122 @@
> +#ifndef DMA_RW_H
> +#define DMA_RW_H
> +
> +#include "qemu-common.h"
> +
> +/* We currently only have pci mmus, but using
> +   a generic type makes it possible to use this
> +   e.g. from the generic ide code without callbacks. */
> +typedef uint64_t dma_addr_t;
> +
> +typedef struct DMAMmu DMAMmu;
> +typedef struct DMADevice DMADevice;
> +
> +typedef int DMATranslateFunc(DMAMmu *mmu,
> +                             DMADevice *dev,
> +                             dma_addr_t addr,
> +                             dma_addr_t *paddr,
> +                             dma_addr_t *len,
> +                             int is_write);
> +
> +typedef int DMAInvalidateMapFunc(DMADevice *);
> +struct DMAMmu {
> +	/* invalidate, etc. */
> +	DmaTranslateFunc *translate;
> +};
> +
> +struct DMADevice {
> +	DMAMmu *mmu;
> +	DMAInvalidateMapFunc *invalidate;
> +};
> +
> +void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *);
> +
> +static inline void dma_memory_rw(DMADevice *dev,
> +				 dma_addr_t addr,
> +				 void *buf,
> +				 uint32_t len,
> +				 int is_write)
> +{
> +    uint32_t plen;
> +    /* Fast-path non-iommu.
> +     * More importantly, makes it obvious what this function does. */
> +    if (!dev->mmu) {
> +    	cpu_physical_memory_rw(paddr, buf, plen, is_write);
> +    	return;
> +    }
> +    while (len) {
> +        err = dev->mmu->translate(iommu, dev, addr, &paddr, &plen, is_write);
> +        if (err) {
> +            return;
> +        }
> +                                      
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +    
> +        cpu_physical_memory_rw(paddr, buf, plen, is_write);
> +
> +        len -= plen;
> +        addr += plen;
> +        buf += plen;
> +    }
> +}
> +
> +void *dma_memory_map(DMADevice *dev,
> +                            dma_addr_t addr,
> +                            uint32_t *len,
> +                            int is_write);
> +void dma_memory_unmap(DMADevice *dev,
> +		      void *buffer,
> +		      uint32_t len,
> +		      int is_write,
> +		      uint32_t access_len);
> +
> +
> ++#define DEFINE_DMA_LD(suffix, size)                                       \
> ++uint##size##_t dma_ld##suffix(DMADevice *dev, dma_addr_t addr)            \
> ++{                                                                         \
> ++    int err;                                                              \
> ++    target_phys_addr_t paddr, plen;                                       \
> ++    if (!dev->mmu) {                                                      \
> ++        return ld##suffix##_phys(addr, val);                              \
> ++    }                                                                     \
> ++                                                                          \
> ++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
> ++                              addr, &paddr, &plen, IOMMU_PERM_READ);      \
> ++    if (err || (plen < size / 8))                                         \
> ++        return 0;                                                         \
> ++                                                                          \
> ++    return ld##suffix##_phys(paddr);                                      \
> ++}
> ++
> ++#define DEFINE_DMA_ST(suffix, size)                                       \
> ++void dma_st##suffix(DMADevice *dev, dma_addr_t addr, uint##size##_t val)  \
> ++{                                                                         \
> ++    int err;                                                              \
> ++    target_phys_addr_t paddr, plen;                                       \
> ++                                                                          \
> ++    if (!dev->mmu) {                                                      \
> ++        st##suffix##_phys(addr, val);                                     \
> ++        return;                                                           \
> ++    }                                                                     \
> ++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
> ++                              addr, &paddr, &plen, IOMMU_PERM_WRITE);     \
> ++    if (err || (plen < size / 8))                                         \
> ++        return;                                                           \
> ++                                                                          \
> ++    st##suffix##_phys(paddr, val);                                        \
> ++}
> +
> +DEFINE_DMA_LD(ub, 8)
> +DEFINE_DMA_LD(uw, 16)
> +DEFINE_DMA_LD(l, 32)
> +DEFINE_DMA_LD(q, 64)
> +
> +DEFINE_DMA_ST(b, 8)
> +DEFINE_DMA_ST(w, 16)
> +DEFINE_DMA_ST(l, 32)
> +DEFINE_DMA_ST(q, 64)
> +
> +#endif
> diff --git a/hw/pci.h b/hw/pci.h
> index 1c6075e..9737f0e 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -5,6 +5,7 @@
>  #include "qobject.h"
>  
>  #include "qdev.h"
> +#include "dma_rw.h"
>  
>  /* PCI includes legacy ISA access.  */
>  #include "isa.h"
> @@ -119,6 +120,10 @@ enum {
>  
>  struct PCIDevice {
>      DeviceState qdev;
> +
> +    /* For devices that do DMA. */
> +    DMADevice dma;
> +
>      /* PCI config space */
>      uint8_t *config;
>
Eduard - Gabriel Munteanu - Sept. 16, 2010, 7:12 a.m.
On Mon, Sep 13, 2010 at 03:45:34PM -0500, Anthony Liguori wrote:
> On 09/13/2010 03:01 PM, Michael S. Tsirkin wrote:
> > So I think the following will give the idea of what an API
> > might look like that will let us avoid the scary hacks in
> > e.g. the ide layer and other generic layers that need to do DMA,
> > without either binding us to pci, adding more complexity with
> > callbacks, or losing type safety with casts and void*.
> >
> > Basically we have DMADevice that we can use container_of on
> > to get a PCIDevice from, and DMAMmu that will get instanciated
> > in a specific MMU.
> >
> > This is not complete code - just a header - I might complete
> > this later if/when there's interest or hopefully someone interested
> > in iommu emulation will.
> >
> > Notes:
> > the IOMMU_PERM_RW code seem unused, so I replaced
> > this with plain is_write. Is it ever useful?
> >
> > It seems that invalidate callback should be able to
> > get away with just a device, so I switched to that
> > from a void pointer for type safety.
> > Seems enough for the users I saw.
> >
> > I saw devices do stl_le_phys and such, these
> > might need to be wrapped as well.
> >
> > Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> >    
> 
> One of the troubles with an interface like this is that I'm not sure a 
> generic model universally works.
> 
> For instance, I know some PCI busses do transparent byte swapping.  For 
> this to work, there has to be a notion of generic memory reads/writes 
> vs. reads of a 32-bit, 16-bit, and 8-bit value.
> 
> With a generic API, we lose the flexibility to do this type of bus 
> interface.
> 
> Regards,
> 
> Anthony Liguori
> 

[snip]

I suppose additional callbacks that do the actual R/W could solve this.
If those aren't present, default to cpu_physical_memory_*().

It should be easy for such a callback to decide on a case-by-case basis
depending on the R/W transaction size, if this is ever needed.


	Eduard
Michael S. Tsirkin - Sept. 16, 2010, 9:20 a.m.
On Thu, Sep 16, 2010 at 10:06:16AM +0300, Eduard - Gabriel Munteanu wrote:
> On Mon, Sep 13, 2010 at 10:01:20PM +0200, Michael S. Tsirkin wrote:
> > So I think the following will give the idea of what an API
> > might look like that will let us avoid the scary hacks in
> > e.g. the ide layer and other generic layers that need to do DMA,
> > without either binding us to pci, adding more complexity with
> > callbacks, or losing type safety with casts and void*.
> > 
> > Basically we have DMADevice that we can use container_of on
> > to get a PCIDevice from, and DMAMmu that will get instanciated
> > in a specific MMU.
> > 
> > This is not complete code - just a header - I might complete
> > this later if/when there's interest or hopefully someone interested
> > in iommu emulation will.
> 
> Hi,
> 
> I personally like this approach better. It also seems to make poisoning
> cpu_physical_memory_*() easier if we convert every device to this API.
> We could then ban cpu_physical_memory_*(), perhaps by requiring a
> #define and #ifdef-ing those declarations.
> 
> > Notes:
> > the IOMMU_PERM_RW code seem unused, so I replaced
> > this with plain is_write. Is it ever useful?
> 
> The original idea made provisions for stuff like full R/W memory maps.
> In that case cpu_physical_memory_map() would call the translation /
> checking function with perms == IOMMU_PERM_RW. That's not there yet so
> it can be removed at the moment, especially since it only affects these
> helpers.
> 
> Also, I'm not sure if there are other sorts of accesses besides reads
> and writes we want to check or translate.
> 
> > It seems that invalidate callback should be able to
> > get away with just a device, so I switched to that
> > from a void pointer for type safety.
> > Seems enough for the users I saw.
> 
> I think this makes matters too complicated. Normally, a single DMADevice
> should be embedded within a <bus>Device,

No, DMADevice is a device that does DMA.
So e.g. a PCI device would embed one.
Remember, traslations are per device, right?
DMAMmu is part of the iommu object.

> so doing this makes it really
> hard to invalidate a specific map when there are more of them. It forces
> device code to act as a bus, provide fake 'DMADevice's for each map and
> dispatch translation to the real DMATranslateFunc. I see no other way.
> 
> If you really want more type-safety (although I think this is a case of
> a true opaque identifying something only device code understands), I
> have another proposal: have a DMAMap embedded in the opaque. Example
> from dma-helpers.c:
> 
> typedef struct {
> 	DMADevice *owner;
> 	[...]
> } DMAMap;
> 
> typedef struct {
> 	[...]
> 	DMAMap map;
> 	[...]
> } DMAAIOCB;
> 
> /* The callback. */
> static void dma_bdrv_cancel(DMAMap *map)
> {
> 	DMAAIOCB *dbs = container_of(map, DMAAIOCB, map);
> 
> 	[...]
> }
> 
> The upside is we only need to pass the DMAMap. That can also contain
> details of the actual map in case the device wants to release only the
> relevant range and remap the rest.

Fine.
Or maybe DMAAIOCB (just make some letters lower case: DMAIocb?).
Everyone will use it anyway, right?

> > I saw devices do stl_le_phys and such, these
> > might need to be wrapped as well.
> 
> stl_le_phys() is defined and used only by hw/eepro100.c. That's already
> dealt with by converting the device.
> 

I see. Need to get around to adding some prefix to it to make this clear.

> 	Thanks,
> 	Eduard
> 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > ---
> > 
> > diff --git a/hw/dma_rw.h b/hw/dma_rw.h
> > new file mode 100644
> > index 0000000..d63fd17
> > --- /dev/null
> > +++ b/hw/dma_rw.h
> > @@ -0,0 +1,122 @@
> > +#ifndef DMA_RW_H
> > +#define DMA_RW_H
> > +
> > +#include "qemu-common.h"
> > +
> > +/* We currently only have pci mmus, but using
> > +   a generic type makes it possible to use this
> > +   e.g. from the generic ide code without callbacks. */
> > +typedef uint64_t dma_addr_t;
> > +
> > +typedef struct DMAMmu DMAMmu;
> > +typedef struct DMADevice DMADevice;
> > +
> > +typedef int DMATranslateFunc(DMAMmu *mmu,
> > +                             DMADevice *dev,
> > +                             dma_addr_t addr,
> > +                             dma_addr_t *paddr,
> > +                             dma_addr_t *len,
> > +                             int is_write);
> > +
> > +typedef int DMAInvalidateMapFunc(DMADevice *);
> > +struct DMAMmu {
> > +	/* invalidate, etc. */
> > +	DmaTranslateFunc *translate;
> > +};
> > +
> > +struct DMADevice {
> > +	DMAMmu *mmu;
> > +	DMAInvalidateMapFunc *invalidate;
> > +};
> > +
> > +void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *);
> > +
> > +static inline void dma_memory_rw(DMADevice *dev,
> > +				 dma_addr_t addr,
> > +				 void *buf,
> > +				 uint32_t len,
> > +				 int is_write)
> > +{
> > +    uint32_t plen;
> > +    /* Fast-path non-iommu.
> > +     * More importantly, makes it obvious what this function does. */
> > +    if (!dev->mmu) {
> > +    	cpu_physical_memory_rw(paddr, buf, plen, is_write);
> > +    	return;
> > +    }
> > +    while (len) {
> > +        err = dev->mmu->translate(iommu, dev, addr, &paddr, &plen, is_write);
> > +        if (err) {
> > +            return;
> > +        }
> > +                                      
> > +        /* The translation might be valid for larger regions. */
> > +        if (plen > len) {
> > +            plen = len;
> > +        }
> > +    
> > +        cpu_physical_memory_rw(paddr, buf, plen, is_write);
> > +
> > +        len -= plen;
> > +        addr += plen;
> > +        buf += plen;
> > +    }
> > +}
> > +
> > +void *dma_memory_map(DMADevice *dev,
> > +                            dma_addr_t addr,
> > +                            uint32_t *len,
> > +                            int is_write);
> > +void dma_memory_unmap(DMADevice *dev,
> > +		      void *buffer,
> > +		      uint32_t len,
> > +		      int is_write,
> > +		      uint32_t access_len);
> > +
> > +
> > ++#define DEFINE_DMA_LD(suffix, size)                                       \
> > ++uint##size##_t dma_ld##suffix(DMADevice *dev, dma_addr_t addr)            \
> > ++{                                                                         \
> > ++    int err;                                                              \
> > ++    target_phys_addr_t paddr, plen;                                       \
> > ++    if (!dev->mmu) {                                                      \
> > ++        return ld##suffix##_phys(addr, val);                              \
> > ++    }                                                                     \
> > ++                                                                          \
> > ++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
> > ++                              addr, &paddr, &plen, IOMMU_PERM_READ);      \
> > ++    if (err || (plen < size / 8))                                         \
> > ++        return 0;                                                         \
> > ++                                                                          \
> > ++    return ld##suffix##_phys(paddr);                                      \
> > ++}
> > ++
> > ++#define DEFINE_DMA_ST(suffix, size)                                       \
> > ++void dma_st##suffix(DMADevice *dev, dma_addr_t addr, uint##size##_t val)  \
> > ++{                                                                         \
> > ++    int err;                                                              \
> > ++    target_phys_addr_t paddr, plen;                                       \
> > ++                                                                          \
> > ++    if (!dev->mmu) {                                                      \
> > ++        st##suffix##_phys(addr, val);                                     \
> > ++        return;                                                           \
> > ++    }                                                                     \
> > ++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
> > ++                              addr, &paddr, &plen, IOMMU_PERM_WRITE);     \
> > ++    if (err || (plen < size / 8))                                         \
> > ++        return;                                                           \
> > ++                                                                          \
> > ++    st##suffix##_phys(paddr, val);                                        \
> > ++}
> > +
> > +DEFINE_DMA_LD(ub, 8)
> > +DEFINE_DMA_LD(uw, 16)
> > +DEFINE_DMA_LD(l, 32)
> > +DEFINE_DMA_LD(q, 64)
> > +
> > +DEFINE_DMA_ST(b, 8)
> > +DEFINE_DMA_ST(w, 16)
> > +DEFINE_DMA_ST(l, 32)
> > +DEFINE_DMA_ST(q, 64)
> > +
> > +#endif
> > diff --git a/hw/pci.h b/hw/pci.h
> > index 1c6075e..9737f0e 100644
> > --- a/hw/pci.h
> > +++ b/hw/pci.h
> > @@ -5,6 +5,7 @@
> >  #include "qobject.h"
> >  
> >  #include "qdev.h"
> > +#include "dma_rw.h"
> >  
> >  /* PCI includes legacy ISA access.  */
> >  #include "isa.h"
> > @@ -119,6 +120,10 @@ enum {
> >  
> >  struct PCIDevice {
> >      DeviceState qdev;
> > +
> > +    /* For devices that do DMA. */
> > +    DMADevice dma;
> > +
> >      /* PCI config space */
> >      uint8_t *config;
> >
Michael S. Tsirkin - Sept. 16, 2010, 9:35 a.m.
On Mon, Sep 13, 2010 at 03:45:34PM -0500, Anthony Liguori wrote:
> On 09/13/2010 03:01 PM, Michael S. Tsirkin wrote:
> >So I think the following will give the idea of what an API
> >might look like that will let us avoid the scary hacks in
> >e.g. the ide layer and other generic layers that need to do DMA,
> >without either binding us to pci, adding more complexity with
> >callbacks, or losing type safety with casts and void*.
> >
> >Basically we have DMADevice that we can use container_of on
> >to get a PCIDevice from, and DMAMmu that will get instanciated
> >in a specific MMU.
> >
> >This is not complete code - just a header - I might complete
> >this later if/when there's interest or hopefully someone interested
> >in iommu emulation will.
> >
> >Notes:
> >the IOMMU_PERM_RW code seem unused, so I replaced
> >this with plain is_write. Is it ever useful?
> >
> >It seems that invalidate callback should be able to
> >get away with just a device, so I switched to that
> >from a void pointer for type safety.
> >Seems enough for the users I saw.
> >
> >I saw devices do stl_le_phys and such, these
> >might need to be wrapped as well.
> >
> >Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> 
> One of the troubles with an interface like this is that I'm not sure
> a generic model universally works.
> 
> For instance, I know some PCI busses do transparent byte swapping.
> For this to work, there has to be a notion of generic memory
> reads/writes vs. reads of a 32-bit, 16-bit, and 8-bit value.
> 
> With a generic API, we lose the flexibility to do this type of bus
> interface.
> 
> Regards,
> 
> Anthony Liguori

Surely only PCI root can do such tricks.
Anyway, I suspect what you refer to is byte swapping of config cycles
and similar IO done by driver.  If a bus byteswapped a DMA transaction,
this basically breaks DMA as driver will have to go and fix up all data
before passing it up to the OS. Right?

We'd have to add more wrappers to emulate such insanity,
as MMU intentionally only handles translation.


> >---
> >
> >diff --git a/hw/dma_rw.h b/hw/dma_rw.h
> >new file mode 100644
> >index 0000000..d63fd17
> >--- /dev/null
> >+++ b/hw/dma_rw.h
> >@@ -0,0 +1,122 @@
> >+#ifndef DMA_RW_H
> >+#define DMA_RW_H
> >+
> >+#include "qemu-common.h"
> >+
> >+/* We currently only have pci mmus, but using
> >+   a generic type makes it possible to use this
> >+   e.g. from the generic ide code without callbacks. */
> >+typedef uint64_t dma_addr_t;
> >+
> >+typedef struct DMAMmu DMAMmu;
> >+typedef struct DMADevice DMADevice;
> >+
> >+typedef int DMATranslateFunc(DMAMmu *mmu,
> >+                             DMADevice *dev,
> >+                             dma_addr_t addr,
> >+                             dma_addr_t *paddr,
> >+                             dma_addr_t *len,
> >+                             int is_write);
> >+
> >+typedef int DMAInvalidateMapFunc(DMADevice *);
> >+struct DMAMmu {
> >+	/* invalidate, etc. */
> >+	DmaTranslateFunc *translate;
> >+};
> >+
> >+struct DMADevice {
> >+	DMAMmu *mmu;
> >+	DMAInvalidateMapFunc *invalidate;
> >+};
> >+
> >+void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *);
> >+
> >+static inline void dma_memory_rw(DMADevice *dev,
> >+				 dma_addr_t addr,
> >+				 void *buf,
> >+				 uint32_t len,
> >+				 int is_write)
> >+{
> >+    uint32_t plen;
> >+    /* Fast-path non-iommu.
> >+     * More importantly, makes it obvious what this function does. */
> >+    if (!dev->mmu) {
> >+    	cpu_physical_memory_rw(paddr, buf, plen, is_write);
> >+    	return;
> >+    }
> >+    while (len) {
> >+        err = dev->mmu->translate(iommu, dev, addr,&paddr,&plen, is_write);
> >+        if (err) {
> >+            return;
> >+        }
> >+
> >+        /* The translation might be valid for larger regions. */
> >+        if (plen>  len) {
> >+            plen = len;
> >+        }
> >+
> >+        cpu_physical_memory_rw(paddr, buf, plen, is_write);
> >+
> >+        len -= plen;
> >+        addr += plen;
> >+        buf += plen;
> >+    }
> >+}
> >+
> >+void *dma_memory_map(DMADevice *dev,
> >+                            dma_addr_t addr,
> >+                            uint32_t *len,
> >+                            int is_write);
> >+void dma_memory_unmap(DMADevice *dev,
> >+		      void *buffer,
> >+		      uint32_t len,
> >+		      int is_write,
> >+		      uint32_t access_len);
> >+
> >+
> >++#define DEFINE_DMA_LD(suffix, size)                                       \
> >++uint##size##_t dma_ld##suffix(DMADevice *dev, dma_addr_t addr)            \
> >++{                                                                         \
> >++    int err;                                                              \
> >++    target_phys_addr_t paddr, plen;                                       \
> >++    if (!dev->mmu) {                                                      \
> >++        return ld##suffix##_phys(addr, val);                              \
> >++    }                                                                     \
> >++                                                                          \
> >++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
> >++                              addr,&paddr,&plen, IOMMU_PERM_READ);      \
> >++    if (err || (plen<  size / 8))                                         \
> >++        return 0;                                                         \
> >++                                                                          \
> >++    return ld##suffix##_phys(paddr);                                      \
> >++}
> >++
> >++#define DEFINE_DMA_ST(suffix, size)                                       \
> >++void dma_st##suffix(DMADevice *dev, dma_addr_t addr, uint##size##_t val)  \
> >++{                                                                         \
> >++    int err;                                                              \
> >++    target_phys_addr_t paddr, plen;                                       \
> >++                                                                          \
> >++    if (!dev->mmu) {                                                      \
> >++        st##suffix##_phys(addr, val);                                     \
> >++        return;                                                           \
> >++    }                                                                     \
> >++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
> >++                              addr,&paddr,&plen, IOMMU_PERM_WRITE);     \
> >++    if (err || (plen<  size / 8))                                         \
> >++        return;                                                           \
> >++                                                                          \
> >++    st##suffix##_phys(paddr, val);                                        \
> >++}
> >+
> >+DEFINE_DMA_LD(ub, 8)
> >+DEFINE_DMA_LD(uw, 16)
> >+DEFINE_DMA_LD(l, 32)
> >+DEFINE_DMA_LD(q, 64)
> >+
> >+DEFINE_DMA_ST(b, 8)
> >+DEFINE_DMA_ST(w, 16)
> >+DEFINE_DMA_ST(l, 32)
> >+DEFINE_DMA_ST(q, 64)
> >+
> >+#endif
> >diff --git a/hw/pci.h b/hw/pci.h
> >index 1c6075e..9737f0e 100644
> >--- a/hw/pci.h
> >+++ b/hw/pci.h
> >@@ -5,6 +5,7 @@
> >  #include "qobject.h"
> >
> >  #include "qdev.h"
> >+#include "dma_rw.h"
> >
> >  /* PCI includes legacy ISA access.  */
> >  #include "isa.h"
> >@@ -119,6 +120,10 @@ enum {
> >
> >  struct PCIDevice {
> >      DeviceState qdev;
> >+
> >+    /* For devices that do DMA. */
> >+    DMADevice dma;
> >+
> >      /* PCI config space */
> >      uint8_t *config;
> >
> >
Eduard - Gabriel Munteanu - Sept. 16, 2010, 11:15 a.m.
On Thu, Sep 16, 2010 at 11:20:43AM +0200, Michael S. Tsirkin wrote:
> On Thu, Sep 16, 2010 at 10:06:16AM +0300, Eduard - Gabriel Munteanu wrote:

[snip]

> 
> No, DMADevice is a device that does DMA.
> So e.g. a PCI device would embed one.
> Remember, traslations are per device, right?
> DMAMmu is part of the iommu object.
> 

I agree, all I'm saying is a DMADevice is insufficient to invalidate one
of the many maps a given device holds, at least without resorting to
horrible tricks or destroying them all.

> > so doing this makes it really
> > hard to invalidate a specific map when there are more of them. It forces
> > device code to act as a bus, provide fake 'DMADevice's for each map and
> > dispatch translation to the real DMATranslateFunc. I see no other way.
> > 
> > If you really want more type-safety (although I think this is a case of
> > a true opaque identifying something only device code understands), I
> > have another proposal: have a DMAMap embedded in the opaque. Example
> > from dma-helpers.c:
> > 
> > typedef struct {
> > 	DMADevice *owner;
> > 	[...]
> > } DMAMap;
> > 
> > typedef struct {
> > 	[...]
> > 	DMAMap map;
> > 	[...]
> > } DMAAIOCB;
> > 
> > /* The callback. */
> > static void dma_bdrv_cancel(DMAMap *map)
> > {
> > 	DMAAIOCB *dbs = container_of(map, DMAAIOCB, map);
> > 
> > 	[...]
> > }
> > 
> > The upside is we only need to pass the DMAMap. That can also contain
> > details of the actual map in case the device wants to release only the
> > relevant range and remap the rest.
> 
> Fine.
> Or maybe DMAAIOCB (just make some letters lower case: DMAIocb?).
> Everyone will use it anyway, right?

No, you misunderstood me. DMAAIOCB is already there in dma-helpers.c,
it's the opaque I used and it was already there before my patches.

The idea was to define DMAMap and embed it in DMAAIOCB, so we can
upcast from the former to the latter (which is what we actually need).

IDE DMA uses that, but other devices would use whatever they want, even
nothing except the DMAMap. The only requirement is that the container
struct allows device code to acknowledge the invalidation (in case of
AIO, we simply kill that thread and release resources).

Well, perhaps DMAMap isn't the best name, but you get the idea.

> > > I saw devices do stl_le_phys and such, these
> > > might need to be wrapped as well.
> > 
> > stl_le_phys() is defined and used only by hw/eepro100.c. That's already
> > dealt with by converting the device.
> > 
> 
> I see. Need to get around to adding some prefix to it to make this clear.
> 
> > 	Thanks,
> > 	Eduard
> > 

[snip]


	Eduard

Patch

diff --git a/hw/dma_rw.h b/hw/dma_rw.h
new file mode 100644
index 0000000..d63fd17
--- /dev/null
+++ b/hw/dma_rw.h
@@ -0,0 +1,122 @@ 
+#ifndef DMA_RW_H
+#define DMA_RW_H
+
+#include "qemu-common.h"
+
+/* We currently only have pci mmus, but using
+   a generic type makes it possible to use this
+   e.g. from the generic ide code without callbacks. */
+typedef uint64_t dma_addr_t;
+
+typedef struct DMAMmu DMAMmu;
+typedef struct DMADevice DMADevice;
+
+typedef int DMATranslateFunc(DMAMmu *mmu,
+                             DMADevice *dev,
+                             dma_addr_t addr,
+                             dma_addr_t *paddr,
+                             dma_addr_t *len,
+                             int is_write);
+
+typedef int DMAInvalidateMapFunc(DMADevice *);
+struct DMAMmu {
+	/* invalidate, etc. */
+	DmaTranslateFunc *translate;
+};
+
+struct DMADevice {
+	DMAMmu *mmu;
+	DMAInvalidateMapFunc *invalidate;
+};
+
+void dma_device_init(DMADevice *, DMAMmu *, DMAInvalidateMapFunc *);
+
+static inline void dma_memory_rw(DMADevice *dev,
+				 dma_addr_t addr,
+				 void *buf,
+				 uint32_t len,
+				 int is_write)
+{
+    uint32_t plen;
+    /* Fast-path non-iommu.
+     * More importantly, makes it obvious what this function does. */
+    if (!dev->mmu) {
+    	cpu_physical_memory_rw(paddr, buf, plen, is_write);
+    	return;
+    }
+    while (len) {
+        err = dev->mmu->translate(iommu, dev, addr, &paddr, &plen, is_write);
+        if (err) {
+            return;
+        }
+                                      
+        /* The translation might be valid for larger regions. */
+        if (plen > len) {
+            plen = len;
+        }
+    
+        cpu_physical_memory_rw(paddr, buf, plen, is_write);
+
+        len -= plen;
+        addr += plen;
+        buf += plen;
+    }
+}
+
+void *dma_memory_map(DMADevice *dev,
+                            dma_addr_t addr,
+                            uint32_t *len,
+                            int is_write);
+void dma_memory_unmap(DMADevice *dev,
+		      void *buffer,
+		      uint32_t len,
+		      int is_write,
+		      uint32_t access_len);
+
+
++#define DEFINE_DMA_LD(suffix, size)                                       \
++uint##size##_t dma_ld##suffix(DMADevice *dev, dma_addr_t addr)            \
++{                                                                         \
++    int err;                                                              \
++    target_phys_addr_t paddr, plen;                                       \
++    if (!dev->mmu) {                                                      \
++        return ld##suffix##_phys(addr, val);                              \
++    }                                                                     \
++                                                                          \
++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
++                              addr, &paddr, &plen, IOMMU_PERM_READ);      \
++    if (err || (plen < size / 8))                                         \
++        return 0;                                                         \
++                                                                          \
++    return ld##suffix##_phys(paddr);                                      \
++}
++
++#define DEFINE_DMA_ST(suffix, size)                                       \
++void dma_st##suffix(DMADevice *dev, dma_addr_t addr, uint##size##_t val)  \
++{                                                                         \
++    int err;                                                              \
++    target_phys_addr_t paddr, plen;                                       \
++                                                                          \
++    if (!dev->mmu) {                                                      \
++        st##suffix##_phys(addr, val);                                     \
++        return;                                                           \
++    }                                                                     \
++    err = dev->mmu->translate(dev->bus->iommu, dev,                       \
++                              addr, &paddr, &plen, IOMMU_PERM_WRITE);     \
++    if (err || (plen < size / 8))                                         \
++        return;                                                           \
++                                                                          \
++    st##suffix##_phys(paddr, val);                                        \
++}
+
+DEFINE_DMA_LD(ub, 8)
+DEFINE_DMA_LD(uw, 16)
+DEFINE_DMA_LD(l, 32)
+DEFINE_DMA_LD(q, 64)
+
+DEFINE_DMA_ST(b, 8)
+DEFINE_DMA_ST(w, 16)
+DEFINE_DMA_ST(l, 32)
+DEFINE_DMA_ST(q, 64)
+
+#endif
diff --git a/hw/pci.h b/hw/pci.h
index 1c6075e..9737f0e 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -5,6 +5,7 @@ 
 #include "qobject.h"
 
 #include "qdev.h"
+#include "dma_rw.h"
 
 /* PCI includes legacy ISA access.  */
 #include "isa.h"
@@ -119,6 +120,10 @@  enum {
 
 struct PCIDevice {
     DeviceState qdev;
+
+    /* For devices that do DMA. */
+    DMADevice dma;
+
     /* PCI config space */
     uint8_t *config;