Patchwork [4/7] ide: use the PCI memory access interface

Submitter Eduard - Gabriel Munteanu
Date Aug. 28, 2010, 2:54 p.m.
Message ID <1283007298-10942-5-git-send-email-eduard.munteanu@linux360.ro>
Permalink /patch/62908/
State New

Comments

Eduard - Gabriel Munteanu - Aug. 28, 2010, 2:54 p.m.
Emulated PCI IDE controllers now use the memory access interface. This
also allows an emulated IOMMU to translate and check accesses.

Map invalidation results in cancelling DMA transfers. Since the guest OS
can't properly recover the DMA results in case the mapping is changed,
this is a fairly good approximation.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
---
 dma-helpers.c     |   46 +++++++++++++++++++++++++++++++++++++++++-----
 dma.h             |   21 ++++++++++++++++++++-
 hw/ide/core.c     |   15 ++++++++-------
 hw/ide/internal.h |   39 +++++++++++++++++++++++++++++++++++++++
 hw/ide/macio.c    |    4 ++--
 hw/ide/pci.c      |    7 +++++++
 6 files changed, 117 insertions(+), 15 deletions(-)
Michael S. Tsirkin - Sept. 2, 2010, 5:19 a.m.
On Sat, Aug 28, 2010 at 05:54:55PM +0300, Eduard - Gabriel Munteanu wrote:
> Emulated PCI IDE controllers now use the memory access interface. This
> also allows an emulated IOMMU to translate and check accesses.
> 
> Map invalidation results in cancelling DMA transfers. Since the guest OS
> can't properly recover the DMA results in case the mapping is changed,
> this is a fairly good approximation.
> 
> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> ---
>  dma-helpers.c     |   46 +++++++++++++++++++++++++++++++++++++++++-----
>  dma.h             |   21 ++++++++++++++++++++-
>  hw/ide/core.c     |   15 ++++++++-------
>  hw/ide/internal.h |   39 +++++++++++++++++++++++++++++++++++++++
>  hw/ide/macio.c    |    4 ++--
>  hw/ide/pci.c      |    7 +++++++
>  6 files changed, 117 insertions(+), 15 deletions(-)
> 
> diff --git a/dma-helpers.c b/dma-helpers.c
> index 712ed89..a0dcdb8 100644
> --- a/dma-helpers.c
> +++ b/dma-helpers.c
> @@ -10,12 +10,36 @@
>  #include "dma.h"
>  #include "block_int.h"
>  
> -void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint)
> +static void *qemu_sglist_default_map(void *opaque,
> +                                     QEMUSGInvalMapFunc *inval_cb,
> +                                     void *inval_opaque,
> +                                     target_phys_addr_t addr,
> +                                     target_phys_addr_t *len,
> +                                     int is_write)
> +{
> +    return cpu_physical_memory_map(addr, len, is_write);
> +}
> +
> +static void qemu_sglist_default_unmap(void *opaque,
> +                                      void *buffer,
> +                                      target_phys_addr_t len,
> +                                      int is_write,
> +                                      target_phys_addr_t access_len)
> +{
> +    cpu_physical_memory_unmap(buffer, len, is_write, access_len);
> +}
> +
> +void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint,
> +                      QEMUSGMapFunc *map, QEMUSGUnmapFunc *unmap, void *opaque)
>  {
>      qsg->sg = qemu_malloc(alloc_hint * sizeof(ScatterGatherEntry));
>      qsg->nsg = 0;
>      qsg->nalloc = alloc_hint;
>      qsg->size = 0;
> +
> +    qsg->map = map ? map : qemu_sglist_default_map;
> +    qsg->unmap = unmap ? unmap : qemu_sglist_default_unmap;
> +    qsg->opaque = opaque;
>  }
>  
>  void qemu_sglist_add(QEMUSGList *qsg, target_phys_addr_t base,
> @@ -73,12 +97,23 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs)
>      int i;
>  
>      for (i = 0; i < dbs->iov.niov; ++i) {
> -        cpu_physical_memory_unmap(dbs->iov.iov[i].iov_base,
> -                                  dbs->iov.iov[i].iov_len, !dbs->is_write,
> -                                  dbs->iov.iov[i].iov_len);
> +        dbs->sg->unmap(dbs->sg->opaque,
> +                       dbs->iov.iov[i].iov_base,
> +                       dbs->iov.iov[i].iov_len, !dbs->is_write,
> +                       dbs->iov.iov[i].iov_len);
>      }
>  }
>  
> +static void dma_bdrv_cancel(void *opaque)
> +{
> +    DMAAIOCB *dbs = opaque;
> +
> +    bdrv_aio_cancel(dbs->acb);
> +    dma_bdrv_unmap(dbs);
> +    qemu_iovec_destroy(&dbs->iov);
> +    qemu_aio_release(dbs);
> +}
> +
>  static void dma_bdrv_cb(void *opaque, int ret)
>  {
>      DMAAIOCB *dbs = (DMAAIOCB *)opaque;
> @@ -100,7 +135,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
>      while (dbs->sg_cur_index < dbs->sg->nsg) {
>          cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
>          cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
> -        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
> +        mem = dbs->sg->map(dbs->sg->opaque, dma_bdrv_cancel, dbs,
> +                           cur_addr, &cur_len, !dbs->is_write);
>          if (!mem)
>              break;
>          qemu_iovec_add(&dbs->iov, mem, cur_len);
> diff --git a/dma.h b/dma.h
> index f3bb275..d48f35c 100644
> --- a/dma.h
> +++ b/dma.h
> @@ -15,6 +15,19 @@
>  #include "hw/hw.h"
>  #include "block.h"
>  
> +typedef void QEMUSGInvalMapFunc(void *opaque);
> +typedef void *QEMUSGMapFunc(void *opaque,
> +                            QEMUSGInvalMapFunc *inval_cb,
> +                            void *inval_opaque,
> +                            target_phys_addr_t addr,
> +                            target_phys_addr_t *len,
> +                            int is_write);
> +typedef void QEMUSGUnmapFunc(void *opaque,
> +                             void *buffer,
> +                             target_phys_addr_t len,
> +                             int is_write,
> +                             target_phys_addr_t access_len);
> +
>  typedef struct {
>      target_phys_addr_t base;
>      target_phys_addr_t len;
> @@ -25,9 +38,15 @@ typedef struct {
>      int nsg;
>      int nalloc;
>      target_phys_addr_t size;
> +
> +    QEMUSGMapFunc *map;
> +    QEMUSGUnmapFunc *unmap;
> +    void *opaque;
>  } QEMUSGList;
>  
> -void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint);
> +void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint,
> +                      QEMUSGMapFunc *map, QEMUSGUnmapFunc *unmap,
> +                      void *opaque);
>  void qemu_sglist_add(QEMUSGList *qsg, target_phys_addr_t base,
>                       target_phys_addr_t len);
>  void qemu_sglist_destroy(QEMUSGList *qsg);
> diff --git a/hw/ide/core.c b/hw/ide/core.c
> index af52c2c..024a125 100644
> --- a/hw/ide/core.c
> +++ b/hw/ide/core.c
> @@ -436,7 +436,8 @@ static int dma_buf_prepare(BMDMAState *bm, int is_write)
>      } prd;
>      int l, len;
>  
> -    qemu_sglist_init(&s->sg, s->nsector / (IDE_PAGE_SIZE / 512) + 1);
> +    qemu_sglist_init(&s->sg, s->nsector / (IDE_PAGE_SIZE / 512) + 1,
> +                     bm->map, bm->unmap, bm->opaque);
>      s->io_buffer_size = 0;
>      for(;;) {
>          if (bm->cur_prd_len == 0) {
> @@ -444,7 +445,7 @@ static int dma_buf_prepare(BMDMAState *bm, int is_write)
>              if (bm->cur_prd_last ||
>                  (bm->cur_addr - bm->addr) >= IDE_PAGE_SIZE)
>                  return s->io_buffer_size != 0;
> -            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
> +            bmdma_memory_read(bm, bm->cur_addr, (uint8_t *)&prd, 8);
>              bm->cur_addr += 8;
>              prd.addr = le32_to_cpu(prd.addr);
>              prd.size = le32_to_cpu(prd.size);
> @@ -527,7 +528,7 @@ static int dma_buf_rw(BMDMAState *bm, int is_write)
>              if (bm->cur_prd_last ||
>                  (bm->cur_addr - bm->addr) >= IDE_PAGE_SIZE)
>                  return 0;
> -            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
> +            bmdma_memory_read(bm, bm->cur_addr, (uint8_t *)&prd, 8);
>              bm->cur_addr += 8;
>              prd.addr = le32_to_cpu(prd.addr);
>              prd.size = le32_to_cpu(prd.size);
> @@ -542,11 +543,11 @@ static int dma_buf_rw(BMDMAState *bm, int is_write)
>              l = bm->cur_prd_len;
>          if (l > 0) {
>              if (is_write) {
> -                cpu_physical_memory_write(bm->cur_prd_addr,
> -                                          s->io_buffer + s->io_buffer_index, l);
> +                bmdma_memory_write(bm, bm->cur_prd_addr,
> +                                   s->io_buffer + s->io_buffer_index, l);
>              } else {
> -                cpu_physical_memory_read(bm->cur_prd_addr,
> -                                          s->io_buffer + s->io_buffer_index, l);
> +                bmdma_memory_read(bm, bm->cur_prd_addr,
> +                                  s->io_buffer + s->io_buffer_index, l);
>              }
>              bm->cur_prd_addr += l;
>              bm->cur_prd_len -= l;
> diff --git a/hw/ide/internal.h b/hw/ide/internal.h
> index 4165543..f686d38 100644
> --- a/hw/ide/internal.h
> +++ b/hw/ide/internal.h
> @@ -477,6 +477,24 @@ struct IDEDeviceInfo {
>  #define BM_CMD_START     0x01
>  #define BM_CMD_READ      0x08
>  
> +typedef void BMDMAInvalMapFunc(void *opaque);
> +typedef void BMDMARWFunc(void *opaque,
> +                         target_phys_addr_t addr,
> +                         uint8_t *buf,
> +                         target_phys_addr_t len,
> +                         int is_write);
> +typedef void *BMDMAMapFunc(void *opaque,
> +                           BMDMAInvalMapFunc *inval_cb,
> +                           void *inval_opaque,
> +                           target_phys_addr_t addr,
> +                           target_phys_addr_t *len,
> +                           int is_write);
> +typedef void BMDMAUnmapFunc(void *opaque,
> +                            void *buffer,
> +                            target_phys_addr_t len,
> +                            int is_write,
> +                            target_phys_addr_t access_len);
> +
>  struct BMDMAState {
>      uint8_t cmd;
>      uint8_t status;
> @@ -496,8 +514,29 @@ struct BMDMAState {
>      int64_t sector_num;
>      uint32_t nsector;
>      QEMUBH *bh;
> +
> +    BMDMARWFunc *rw;
> +    BMDMAMapFunc *map;
> +    BMDMAUnmapFunc *unmap;
> +    void *opaque;
>  };
>  
> +static inline void bmdma_memory_read(BMDMAState *bm,
> +                                     target_phys_addr_t addr,
> +                                     uint8_t *buf,
> +                                     target_phys_addr_t len)
> +{
> +    bm->rw(bm->opaque, addr, buf, len, 0);
> +}
> +
> +static inline void bmdma_memory_write(BMDMAState *bm,
> +                                      target_phys_addr_t addr,
> +                                      uint8_t *buf,
> +                                      target_phys_addr_t len)
> +{
> +    bm->rw(bm->opaque, addr, buf, len, 1);
> +}
> +

Here again, I am concerned about indirection and pointer chasing on the data path.
Can we have an iommu pointer in the device, and do a fast path in case
there is no iommu?
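
A rough sketch of the fast path being suggested here (hypothetical, not part of the
posted patch; the bm->iommu field and iommu_memory_rw() are made-up names used only
for illustration):

static inline void bmdma_memory_rw(BMDMAState *bm, target_phys_addr_t addr,
                                   uint8_t *buf, target_phys_addr_t len,
                                   int is_write)
{
    if (!bm->iommu) {
        /* Common case: no IOMMU configured, access guest memory directly. */
        cpu_physical_memory_rw(addr, buf, len, is_write);
    } else {
        /* Translate and check through the IOMMU only when one is present. */
        iommu_memory_rw(bm->iommu, addr, buf, len, is_write);
    }
}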

>  static inline IDEState *idebus_active_if(IDEBus *bus)
>  {
>      return bus->ifs + bus->unit;
> diff --git a/hw/ide/macio.c b/hw/ide/macio.c
> index bd1c73e..962ae13 100644
> --- a/hw/ide/macio.c
> +++ b/hw/ide/macio.c
> @@ -79,7 +79,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
>  
>      s->io_buffer_size = io->len;
>  
> -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
>      qemu_sglist_add(&s->sg, io->addr, io->len);
>      io->addr += io->len;
>      io->len = 0;
> @@ -141,7 +141,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
>      s->io_buffer_index = 0;
>      s->io_buffer_size = io->len;
>  
> -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
>      qemu_sglist_add(&s->sg, io->addr, io->len);
>      io->addr += io->len;
>      io->len = 0;
> diff --git a/hw/ide/pci.c b/hw/ide/pci.c
> index 4d95cc5..5879044 100644
> --- a/hw/ide/pci.c
> +++ b/hw/ide/pci.c
> @@ -183,4 +183,11 @@ void pci_ide_create_devs(PCIDevice *dev, DriveInfo **hd_table)
>              continue;
>          ide_create_drive(d->bus+bus[i], unit[i], hd_table[i]);
>      }
> +
> +    for (i = 0; i < 2; i++) {
> +        d->bmdma[i].rw = (void *) pci_memory_rw;
> +        d->bmdma[i].map = (void *) pci_memory_map;
> +        d->bmdma[i].unmap = (void *) pci_memory_unmap;
> +        d->bmdma[i].opaque = dev;
> +    }
>  }

These casts show something is wrong with the API, IMO.

> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eduard - Gabriel Munteanu - Sept. 2, 2010, 9:12 a.m.
On Thu, Sep 02, 2010 at 08:19:11AM +0300, Michael S. Tsirkin wrote:
> On Sat, Aug 28, 2010 at 05:54:55PM +0300, Eduard - Gabriel Munteanu wrote:
> > Emulated PCI IDE controllers now use the memory access interface. This
> > also allows an emulated IOMMU to translate and check accesses.
> > 
> > Map invalidation results in cancelling DMA transfers. Since the guest OS
> > can't properly recover the DMA results in case the mapping is changed,
> > this is a fairly good approximation.
> > 
> > Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> > ---

[snip]

> > +static inline void bmdma_memory_read(BMDMAState *bm,
> > +                                     target_phys_addr_t addr,
> > +                                     uint8_t *buf,
> > +                                     target_phys_addr_t len)
> > +{
> > +    bm->rw(bm->opaque, addr, buf, len, 0);
> > +}
> > +
> > +static inline void bmdma_memory_write(BMDMAState *bm,
> > +                                      target_phys_addr_t addr,
> > +                                      uint8_t *buf,
> > +                                      target_phys_addr_t len)
> > +{
> > +    bm->rw(bm->opaque, addr, buf, len, 1);
> > +}
> > +
> 
> Here again, I am concerned about indirection and pointer chasing on the data path.
> Can we have an iommu pointer in the device, and do a fast path in case
> there is no iommu?
> 

See my other reply.

> >  static inline IDEState *idebus_active_if(IDEBus *bus)
> >  {
> >      return bus->ifs + bus->unit;
> > diff --git a/hw/ide/macio.c b/hw/ide/macio.c
> > index bd1c73e..962ae13 100644
> > --- a/hw/ide/macio.c
> > +++ b/hw/ide/macio.c
> > @@ -79,7 +79,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
> >  
> >      s->io_buffer_size = io->len;
> >  
> > -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> > +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
> >      qemu_sglist_add(&s->sg, io->addr, io->len);
> >      io->addr += io->len;
> >      io->len = 0;
> > @@ -141,7 +141,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
> >      s->io_buffer_index = 0;
> >      s->io_buffer_size = io->len;
> >  
> > -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> > +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
> >      qemu_sglist_add(&s->sg, io->addr, io->len);
> >      io->addr += io->len;
> >      io->len = 0;
> > diff --git a/hw/ide/pci.c b/hw/ide/pci.c
> > index 4d95cc5..5879044 100644
> > --- a/hw/ide/pci.c
> > +++ b/hw/ide/pci.c
> > @@ -183,4 +183,11 @@ void pci_ide_create_devs(PCIDevice *dev, DriveInfo **hd_table)
> >              continue;
> >          ide_create_drive(d->bus+bus[i], unit[i], hd_table[i]);
> >      }
> > +
> > +    for (i = 0; i < 2; i++) {
> > +        d->bmdma[i].rw = (void *) pci_memory_rw;
> > +        d->bmdma[i].map = (void *) pci_memory_map;
> > +        d->bmdma[i].unmap = (void *) pci_memory_unmap;
> > +        d->bmdma[i].opaque = dev;
> > +    }
> >  }
> 
> These casts show something is wrong with the API, IMO.
> 

Hm, here's an oversight on my part: I think I should provide explicit
bmdma hooks, since pcibus_t is a uint64_t and target_phys_addr_t is a
uint{32,64}_t depending on the guest machine, so it might be buggy on
32-bit wrt calling conventions. But that introduces yet another
non-inlined function call :-(. That would drop the (void *) cast,
though.
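
For illustration, a minimal sketch of what such an explicit bmdma hook could look
like (hypothetical; the pci_memory_rw() signature, taking a PCIDevice * and pcibus_t
addresses, is assumed from earlier patches in this series):

static void bmdma_rw(void *opaque, target_phys_addr_t addr,
                     uint8_t *buf, target_phys_addr_t len, int is_write)
{
    PCIDevice *dev = opaque;

    /* The widening conversion to pcibus_t is now explicit and visible. */
    pci_memory_rw(dev, addr, buf, len, is_write);
}

The assignment in pci_ide_create_devs() could then be simply
d->bmdma[i].rw = bmdma_rw; with no (void *) cast.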


	Eduard
Michael S. Tsirkin - Sept. 2, 2010, 9:58 a.m.
On Thu, Sep 02, 2010 at 12:12:00PM +0300, Eduard - Gabriel Munteanu wrote:
> On Thu, Sep 02, 2010 at 08:19:11AM +0300, Michael S. Tsirkin wrote:
> > On Sat, Aug 28, 2010 at 05:54:55PM +0300, Eduard - Gabriel Munteanu wrote:
> > > Emulated PCI IDE controllers now use the memory access interface. This
> > > also allows an emulated IOMMU to translate and check accesses.
> > > 
> > > Map invalidation results in cancelling DMA transfers. Since the guest OS
> > > can't properly recover the DMA results in case the mapping is changed,
> > > this is a fairly good approximation.
> > > 
> > > Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> > > ---
> 
> [snip]
> 
> > > +static inline void bmdma_memory_read(BMDMAState *bm,
> > > +                                     target_phys_addr_t addr,
> > > +                                     uint8_t *buf,
> > > +                                     target_phys_addr_t len)
> > > +{
> > > +    bm->rw(bm->opaque, addr, buf, len, 0);
> > > +}
> > > +
> > > +static inline void bmdma_memory_write(BMDMAState *bm,
> > > +                                      target_phys_addr_t addr,
> > > +                                      uint8_t *buf,
> > > +                                      target_phys_addr_t len)
> > > +{
> > > +    bm->rw(bm->opaque, addr, buf, len, 1);
> > > +}
> > > +
> > 
> > Here again, I am concerned about indirection and pointer chasing on the data path.
> > Can we have an iommu pointer in the device, and do a fast path in case
> > there is no iommu?
> > 
> 
> See my other reply.

I don't insist on this solution, but what other way do you propose to
avoid the overhead for everyone not using an iommu?
I'm all for a solution that would help iommu as well,
but one wasn't yet proposed.

> > >  static inline IDEState *idebus_active_if(IDEBus *bus)
> > >  {
> > >      return bus->ifs + bus->unit;
> > > diff --git a/hw/ide/macio.c b/hw/ide/macio.c
> > > index bd1c73e..962ae13 100644
> > > --- a/hw/ide/macio.c
> > > +++ b/hw/ide/macio.c
> > > @@ -79,7 +79,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
> > >  
> > >      s->io_buffer_size = io->len;
> > >  
> > > -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> > > +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
> > >      qemu_sglist_add(&s->sg, io->addr, io->len);
> > >      io->addr += io->len;
> > >      io->len = 0;
> > > @@ -141,7 +141,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
> > >      s->io_buffer_index = 0;
> > >      s->io_buffer_size = io->len;
> > >  
> > > -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> > > +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
> > >      qemu_sglist_add(&s->sg, io->addr, io->len);
> > >      io->addr += io->len;
> > >      io->len = 0;
> > > diff --git a/hw/ide/pci.c b/hw/ide/pci.c
> > > index 4d95cc5..5879044 100644
> > > --- a/hw/ide/pci.c
> > > +++ b/hw/ide/pci.c
> > > @@ -183,4 +183,11 @@ void pci_ide_create_devs(PCIDevice *dev, DriveInfo **hd_table)
> > >              continue;
> > >          ide_create_drive(d->bus+bus[i], unit[i], hd_table[i]);
> > >      }
> > > +
> > > +    for (i = 0; i < 2; i++) {
> > > +        d->bmdma[i].rw = (void *) pci_memory_rw;
> > > +        d->bmdma[i].map = (void *) pci_memory_map;
> > > +        d->bmdma[i].unmap = (void *) pci_memory_unmap;
> > > +        d->bmdma[i].opaque = dev;
> > > +    }
> > >  }
> > 
> > These casts show something is wrong with the API, IMO.
> > 
> 
> Hm, here's an oversight on my part: I think I should provide explicit
> bmdma hooks, since pcibus_t is a uint64_t and target_phys_addr_t is a
> uint{32,64}_t depending on the guest machine, so it might be buggy on
> 32-bit wrt calling conventions. But that introduces yet another
> non-inlined function call :-(. That would drop the (void *) cast,
> though.
> 
> 
> 	Eduard

So we get away without casts, but only because the C compiler
will let us silently convert the types, possibly discarding
data in the process. Or we'll add a check that tries to detect
this, but there's no good way to report a DMA error to the user.
IOW, if our code only works because target_phys_addr_t fits in pcibus_t,
what good is the abstraction and using distinct types?

This is why I think we need a generic DMA API using dma addresses.
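
For reference, the kind of bus-neutral API being argued for might look roughly like
this (a sketch only; dma_addr_t, DMADevice and dma_memory_rw() are illustrative names,
not part of the posted series):

typedef uint64_t dma_addr_t;   /* wide enough for any bus address */

/* Read or write guest memory on behalf of a device, translating through an
 * IOMMU if the device sits behind one, otherwise falling back to a plain
 * physical memory access. */
void dma_memory_rw(DMADevice *dev, dma_addr_t addr,
                   uint8_t *buf, dma_addr_t len, int is_write);
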
Eduard - Gabriel Munteanu - Sept. 2, 2010, 3:01 p.m.
On Thu, Sep 02, 2010 at 12:58:13PM +0300, Michael S. Tsirkin wrote:
> On Thu, Sep 02, 2010 at 12:12:00PM +0300, Eduard - Gabriel Munteanu wrote:
> > On Thu, Sep 02, 2010 at 08:19:11AM +0300, Michael S. Tsirkin wrote:
> > > On Sat, Aug 28, 2010 at 05:54:55PM +0300, Eduard - Gabriel Munteanu wrote:
> 
> I don't insist on this solution, but what other way do you propose to
> avoid the overhead for everyone not using an iommu?
> I'm all for a solution that would help iommu as well,
> but one wasn't yet proposed.
> 

Hm, we could get even better performance by simply making the IOMMU a
compile-time option. It also avoids problems in case some device hasn't
been converted yet, and involves little to no tradeoffs. What do you
think?

AFAICT, there are few uses for the IOMMU besides development and
avantgarde stuff, as you note. So distributions can continue supplying
prebuilt QEMU/KVM packages compiled with the IOMMU turned off for the
time being. The only practical (commercial) use right now would be in
the case of private virtual servers, which could be divided further into
nested guests (though real IOMMU hardware isn't widespread yet).

Blue Swirl, in the light of this, do you agree on making it a
compile-time option?
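
Concretely, the compile-time option could reduce to something like the following at
the converted call sites (sketch only; CONFIG_IOMMU is an assumed config name):

#ifdef CONFIG_IOMMU
    bmdma_memory_read(bm, bm->cur_addr, (uint8_t *)&prd, 8);
#else
    /* IOMMU support compiled out: keep the direct, zero-overhead path. */
    cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
#endif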

> > > >  static inline IDEState *idebus_active_if(IDEBus *bus)
> > > >  {
> > > >      return bus->ifs + bus->unit;
> > > > diff --git a/hw/ide/macio.c b/hw/ide/macio.c
> > > > index bd1c73e..962ae13 100644
> > > > --- a/hw/ide/macio.c
> > > > +++ b/hw/ide/macio.c
> > > > @@ -79,7 +79,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
> > > >  
> > > >      s->io_buffer_size = io->len;
> > > >  
> > > > -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> > > > +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
> > > >      qemu_sglist_add(&s->sg, io->addr, io->len);
> > > >      io->addr += io->len;
> > > >      io->len = 0;
> > > > @@ -141,7 +141,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
> > > >      s->io_buffer_index = 0;
> > > >      s->io_buffer_size = io->len;
> > > >  
> > > > -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> > > > +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
> > > >      qemu_sglist_add(&s->sg, io->addr, io->len);
> > > >      io->addr += io->len;
> > > >      io->len = 0;
> > > > diff --git a/hw/ide/pci.c b/hw/ide/pci.c
> > > > index 4d95cc5..5879044 100644
> > > > --- a/hw/ide/pci.c
> > > > +++ b/hw/ide/pci.c
> > > > @@ -183,4 +183,11 @@ void pci_ide_create_devs(PCIDevice *dev, DriveInfo **hd_table)
> > > >              continue;
> > > >          ide_create_drive(d->bus+bus[i], unit[i], hd_table[i]);
> > > >      }
> > > > +
> > > > +    for (i = 0; i < 2; i++) {
> > > > +        d->bmdma[i].rw = (void *) pci_memory_rw;
> > > > +        d->bmdma[i].map = (void *) pci_memory_map;
> > > > +        d->bmdma[i].unmap = (void *) pci_memory_unmap;
> > > > +        d->bmdma[i].opaque = dev;
> > > > +    }
> > > >  }
> > > 
> > > These casts show something is wrong with the API, IMO.
> > > 
> > 
> > Hm, here's an oversight on my part: I think I should provide explicit
> > bmdma hooks, since pcibus_t is a uint64_t and target_phys_addr_t is a
> > uint{32,64}_t depending on the guest machine, so it might be buggy on
> > 32-bit wrt calling conventions. But that introduces yet another
> > non-inlined function call :-(. That would drop the (void *) cast,
> > though.
> > 
> > 
> > 	Eduard
> 
> So we get away without casts, but only because the C compiler
> will let us silently convert the types, possibly discarding
> data in the process. Or we'll add a check that tries to detect
> this, but there's no good way to report a DMA error to the user.
> IOW, if our code only works because target_phys_addr_t fits in pcibus_t,
> what good is the abstraction and using distinct types?
> 
> This is why I think we need a generic DMA API using dma addresses.

The API was made so that it doesn't report errors. That's because the
PCI bus doesn't provide any possibility of doing so (real devices can't
retry transfers in case an I/O page fault occurs).

In my previous generic IOMMU layer implementation pci_memory_*()
returned non-zero on failure, but I decided to drop it when switching to
a PCI-only (rather a PCI-specific) approach.

In case target_phys_addr_t no longer fits in pcibus_t by a simple
implicit conversion, those explicit bmdma hooks I was going to add will
do the necessary conversions.

The idea of using distinct types is two-fold: let the programmer know
not to rely on them being the same thing, and let the compiler prevent
him from shooting himself in the foot (like I did). Even if there is a
dma_addr_t, some piece of code still needs to provide glue and
conversion between the DMA code and bus-specific code.


	Eduard
Avi Kivity - Sept. 2, 2010, 3:24 p.m.
On 09/02/2010 06:01 PM, Eduard - Gabriel Munteanu wrote:
> On Thu, Sep 02, 2010 at 12:58:13PM +0300, Michael S. Tsirkin wrote:
>> On Thu, Sep 02, 2010 at 12:12:00PM +0300, Eduard - Gabriel Munteanu wrote:
>>> On Thu, Sep 02, 2010 at 08:19:11AM +0300, Michael S. Tsirkin wrote:
>>>> On Sat, Aug 28, 2010 at 05:54:55PM +0300, Eduard - Gabriel Munteanu wrote:
>> I don't insist on this solution, but what other way do you propose to
>> avoid the overhead for everyone not using an iommu?
>> I'm all for a solution that would help iommu as well,
>> but one wasn't yet proposed.
>>
> Hm, we could get even better performance by simply making the IOMMU a
> compile-time option. It also avoids problems in case some device hasn't
> been converted yet, and involves little to no tradeoffs. What do you
> think?
>
> AFAICT, there are few uses for the IOMMU besides development and
> avantgarde stuff, as you note. So distributions can continue supplying
> prebuilt QEMU/KVM packages compiled with the IOMMU turned off for the
> time being. The only practical (commercial) use right now would be in
> the case of private virtual servers, which could be divided further into
> nested guests (though real IOMMU hardware isn't widespread yet).
>
> Blue Swirl, in the light of this, do you agree on making it a
> compile-time option?

That's not a practical long term solution.  Eventually everything gets 
turned on.

I don't really see a problem with the additional indirection.  By the 
time we reach actual hardware to satisfy the request, we'll have gone 
through many such indirections; modern processors deal very well with them.
Michael S. Tsirkin - Sept. 2, 2010, 3:31 p.m.
On Thu, Sep 02, 2010 at 06:01:35PM +0300, Eduard - Gabriel Munteanu wrote:
> On Thu, Sep 02, 2010 at 12:58:13PM +0300, Michael S. Tsirkin wrote:
> > On Thu, Sep 02, 2010 at 12:12:00PM +0300, Eduard - Gabriel Munteanu wrote:
> > > On Thu, Sep 02, 2010 at 08:19:11AM +0300, Michael S. Tsirkin wrote:
> > > > On Sat, Aug 28, 2010 at 05:54:55PM +0300, Eduard - Gabriel Munteanu wrote:
> > 
> > I don't insist on this solution, but what other way do you propose to
> > avoid the overhead for everyone not using an iommu?
> > I'm all for a solution that would help iommu as well,
> > but one wasn't yet proposed.
> > 
> 
> Hm, we could get even better performance by simply making the IOMMU a
> compile-time option. It also avoids problems in case some device hasn't
> been converted yet, and involves little to no tradeoffs. What do you
> think?
> 
> AFAICT, there are few uses for the IOMMU besides development and
> avantgarde stuff, as you note. So distributions can continue supplying
> prebuilt QEMU/KVM packages compiled with the IOMMU turned off for the
> time being. The only practical (commercial) use right now would be in
> the case of private virtual servers, which could be divided further into
> nested guests (though real IOMMU hardware isn't widespread yet).
> 
> Blue Swirl, in the light of this, do you agree on making it a
> compile-time option?
> 
> > > > >  static inline IDEState *idebus_active_if(IDEBus *bus)
> > > > >  {
> > > > >      return bus->ifs + bus->unit;
> > > > > diff --git a/hw/ide/macio.c b/hw/ide/macio.c
> > > > > index bd1c73e..962ae13 100644
> > > > > --- a/hw/ide/macio.c
> > > > > +++ b/hw/ide/macio.c
> > > > > @@ -79,7 +79,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
> > > > >  
> > > > >      s->io_buffer_size = io->len;
> > > > >  
> > > > > -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> > > > > +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
> > > > >      qemu_sglist_add(&s->sg, io->addr, io->len);
> > > > >      io->addr += io->len;
> > > > >      io->len = 0;
> > > > > @@ -141,7 +141,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
> > > > >      s->io_buffer_index = 0;
> > > > >      s->io_buffer_size = io->len;
> > > > >  
> > > > > -    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
> > > > > +    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
> > > > >      qemu_sglist_add(&s->sg, io->addr, io->len);
> > > > >      io->addr += io->len;
> > > > >      io->len = 0;
> > > > > diff --git a/hw/ide/pci.c b/hw/ide/pci.c
> > > > > index 4d95cc5..5879044 100644
> > > > > --- a/hw/ide/pci.c
> > > > > +++ b/hw/ide/pci.c
> > > > > @@ -183,4 +183,11 @@ void pci_ide_create_devs(PCIDevice *dev, DriveInfo **hd_table)
> > > > >              continue;
> > > > >          ide_create_drive(d->bus+bus[i], unit[i], hd_table[i]);
> > > > >      }
> > > > > +
> > > > > +    for (i = 0; i < 2; i++) {
> > > > > +        d->bmdma[i].rw = (void *) pci_memory_rw;
> > > > > +        d->bmdma[i].map = (void *) pci_memory_map;
> > > > > +        d->bmdma[i].unmap = (void *) pci_memory_unmap;
> > > > > +        d->bmdma[i].opaque = dev;
> > > > > +    }
> > > > >  }
> > > > 
> > > > These casts show something is wrong with the API, IMO.
> > > > 
> > > 
> > > Hm, here's an oversight on my part: I think I should provide explicit
> > > bmdma hooks, since pcibus_t is a uint64_t and target_phys_addr_t is a
> > > uint{32,64}_t depending on the guest machine, so it might be buggy on
> > > 32-bit wrt calling conventions. But that introduces yet another
> > > non-inlined function call :-(. That would drop the (void *) cast,
> > > though.
> > > 
> > > 
> > > 	Eduard
> > 
> > So we get away without casts, but only because the C compiler
> > will let us silently convert the types, possibly discarding
> > data in the process. Or we'll add a check that tries to detect
> > this, but there's no good way to report a DMA error to the user.
> > IOW, if our code only works because target_phys_addr_t fits in pcibus_t,
> > what good is the abstraction and using distinct types?
> > 
> > This is why I think we need a generic DMA API using dma addresses.
> 
> The API was made so that it doesn't report errors. That's because the
> PCI bus doesn't provide any possibility of doing so (real devices can't
> retry transfers in case an I/O page fault occurs).

This is what I am saying. We can't deal with errors.

> In my previous generic IOMMU layer implementation pci_memory_*()
> returned non-zero on failure, but I decided to drop it when switching to
> a PCI-only (rather a PCI-specific) approach.
> 
> In case target_phys_addr_t no longer fits in pcibus_t by a simple
> implicit conversion, those explicit bmdma hooks I was going to add will
> do the necessary conversions.
> 
> The idea of using distinct types is two-fold: let the programmer know
> not to rely on them being the same thing, and let the compiler prevent
> him from shooting himself in the foot (like I did). Even if there is a
> dma_addr_t, some piece of code still needs to provide glue and
> conversion between the DMA code and bus-specific code.
> 
> 
> 	Eduard

Nothing I see here is bus-specific, really. Without an iommu, the addresses
that make sense are target addresses; with an iommu, whatever the iommu
supports. So make the iommu work with dma_addr_t and do the conversion.
Michael S. Tsirkin - Sept. 2, 2010, 3:39 p.m.
On Thu, Sep 02, 2010 at 06:24:25PM +0300, Avi Kivity wrote:
> That's not a practical long term solution.  Eventually everything
> gets turned on.

That's why I wanted a simple !iommu check and fallback.
This way, unless it's really used, there's no overhead.

> I don't really see a problem with the additional indirection.  By
> the time we reach actual hardware to satisfy the request,
> we'll have gone through many such indirections; modern processors deal
> very well with them.
> 
> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - Sept. 2, 2010, 4:07 p.m.
On 09/02/2010 06:39 PM, Michael S. Tsirkin wrote:
> On Thu, Sep 02, 2010 at 06:24:25PM +0300, Avi Kivity wrote:
>> That's not a practical long term solution.  Eventually everything
>> gets turned on.
> That's why I wanted a simple !iommu check and fallback.
> This way unless it's really used there's no overhead.
>

It's not very different from an indirect function call.  Modern branch 
predictors store the target function address and supply it ahead of time.

I've never seen a function call instruction in a profile.

Patch

diff --git a/dma-helpers.c b/dma-helpers.c
index 712ed89..a0dcdb8 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -10,12 +10,36 @@ 
 #include "dma.h"
 #include "block_int.h"
 
-void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint)
+static void *qemu_sglist_default_map(void *opaque,
+                                     QEMUSGInvalMapFunc *inval_cb,
+                                     void *inval_opaque,
+                                     target_phys_addr_t addr,
+                                     target_phys_addr_t *len,
+                                     int is_write)
+{
+    return cpu_physical_memory_map(addr, len, is_write);
+}
+
+static void qemu_sglist_default_unmap(void *opaque,
+                                      void *buffer,
+                                      target_phys_addr_t len,
+                                      int is_write,
+                                      target_phys_addr_t access_len)
+{
+    cpu_physical_memory_unmap(buffer, len, is_write, access_len);
+}
+
+void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint,
+                      QEMUSGMapFunc *map, QEMUSGUnmapFunc *unmap, void *opaque)
 {
     qsg->sg = qemu_malloc(alloc_hint * sizeof(ScatterGatherEntry));
     qsg->nsg = 0;
     qsg->nalloc = alloc_hint;
     qsg->size = 0;
+
+    qsg->map = map ? map : qemu_sglist_default_map;
+    qsg->unmap = unmap ? unmap : qemu_sglist_default_unmap;
+    qsg->opaque = opaque;
 }
 
 void qemu_sglist_add(QEMUSGList *qsg, target_phys_addr_t base,
@@ -73,12 +97,23 @@  static void dma_bdrv_unmap(DMAAIOCB *dbs)
     int i;
 
     for (i = 0; i < dbs->iov.niov; ++i) {
-        cpu_physical_memory_unmap(dbs->iov.iov[i].iov_base,
-                                  dbs->iov.iov[i].iov_len, !dbs->is_write,
-                                  dbs->iov.iov[i].iov_len);
+        dbs->sg->unmap(dbs->sg->opaque,
+                       dbs->iov.iov[i].iov_base,
+                       dbs->iov.iov[i].iov_len, !dbs->is_write,
+                       dbs->iov.iov[i].iov_len);
     }
 }
 
+static void dma_bdrv_cancel(void *opaque)
+{
+    DMAAIOCB *dbs = opaque;
+
+    bdrv_aio_cancel(dbs->acb);
+    dma_bdrv_unmap(dbs);
+    qemu_iovec_destroy(&dbs->iov);
+    qemu_aio_release(dbs);
+}
+
 static void dma_bdrv_cb(void *opaque, int ret)
 {
     DMAAIOCB *dbs = (DMAAIOCB *)opaque;
@@ -100,7 +135,8 @@  static void dma_bdrv_cb(void *opaque, int ret)
     while (dbs->sg_cur_index < dbs->sg->nsg) {
         cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
         cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
-        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
+        mem = dbs->sg->map(dbs->sg->opaque, dma_bdrv_cancel, dbs,
+                           cur_addr, &cur_len, !dbs->is_write);
         if (!mem)
             break;
         qemu_iovec_add(&dbs->iov, mem, cur_len);
diff --git a/dma.h b/dma.h
index f3bb275..d48f35c 100644
--- a/dma.h
+++ b/dma.h
@@ -15,6 +15,19 @@ 
 #include "hw/hw.h"
 #include "block.h"
 
+typedef void QEMUSGInvalMapFunc(void *opaque);
+typedef void *QEMUSGMapFunc(void *opaque,
+                            QEMUSGInvalMapFunc *inval_cb,
+                            void *inval_opaque,
+                            target_phys_addr_t addr,
+                            target_phys_addr_t *len,
+                            int is_write);
+typedef void QEMUSGUnmapFunc(void *opaque,
+                             void *buffer,
+                             target_phys_addr_t len,
+                             int is_write,
+                             target_phys_addr_t access_len);
+
 typedef struct {
     target_phys_addr_t base;
     target_phys_addr_t len;
@@ -25,9 +38,15 @@  typedef struct {
     int nsg;
     int nalloc;
     target_phys_addr_t size;
+
+    QEMUSGMapFunc *map;
+    QEMUSGUnmapFunc *unmap;
+    void *opaque;
 } QEMUSGList;
 
-void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint);
+void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint,
+                      QEMUSGMapFunc *map, QEMUSGUnmapFunc *unmap,
+                      void *opaque);
 void qemu_sglist_add(QEMUSGList *qsg, target_phys_addr_t base,
                      target_phys_addr_t len);
 void qemu_sglist_destroy(QEMUSGList *qsg);
diff --git a/hw/ide/core.c b/hw/ide/core.c
index af52c2c..024a125 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -436,7 +436,8 @@  static int dma_buf_prepare(BMDMAState *bm, int is_write)
     } prd;
     int l, len;
 
-    qemu_sglist_init(&s->sg, s->nsector / (IDE_PAGE_SIZE / 512) + 1);
+    qemu_sglist_init(&s->sg, s->nsector / (IDE_PAGE_SIZE / 512) + 1,
+                     bm->map, bm->unmap, bm->opaque);
     s->io_buffer_size = 0;
     for(;;) {
         if (bm->cur_prd_len == 0) {
@@ -444,7 +445,7 @@  static int dma_buf_prepare(BMDMAState *bm, int is_write)
             if (bm->cur_prd_last ||
                 (bm->cur_addr - bm->addr) >= IDE_PAGE_SIZE)
                 return s->io_buffer_size != 0;
-            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+            bmdma_memory_read(bm, bm->cur_addr, (uint8_t *)&prd, 8);
             bm->cur_addr += 8;
             prd.addr = le32_to_cpu(prd.addr);
             prd.size = le32_to_cpu(prd.size);
@@ -527,7 +528,7 @@  static int dma_buf_rw(BMDMAState *bm, int is_write)
             if (bm->cur_prd_last ||
                 (bm->cur_addr - bm->addr) >= IDE_PAGE_SIZE)
                 return 0;
-            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+            bmdma_memory_read(bm, bm->cur_addr, (uint8_t *)&prd, 8);
             bm->cur_addr += 8;
             prd.addr = le32_to_cpu(prd.addr);
             prd.size = le32_to_cpu(prd.size);
@@ -542,11 +543,11 @@  static int dma_buf_rw(BMDMAState *bm, int is_write)
             l = bm->cur_prd_len;
         if (l > 0) {
             if (is_write) {
-                cpu_physical_memory_write(bm->cur_prd_addr,
-                                          s->io_buffer + s->io_buffer_index, l);
+                bmdma_memory_write(bm, bm->cur_prd_addr,
+                                   s->io_buffer + s->io_buffer_index, l);
             } else {
-                cpu_physical_memory_read(bm->cur_prd_addr,
-                                          s->io_buffer + s->io_buffer_index, l);
+                bmdma_memory_read(bm, bm->cur_prd_addr,
+                                  s->io_buffer + s->io_buffer_index, l);
             }
             bm->cur_prd_addr += l;
             bm->cur_prd_len -= l;
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index 4165543..f686d38 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -477,6 +477,24 @@  struct IDEDeviceInfo {
 #define BM_CMD_START     0x01
 #define BM_CMD_READ      0x08
 
+typedef void BMDMAInvalMapFunc(void *opaque);
+typedef void BMDMARWFunc(void *opaque,
+                         target_phys_addr_t addr,
+                         uint8_t *buf,
+                         target_phys_addr_t len,
+                         int is_write);
+typedef void *BMDMAMapFunc(void *opaque,
+                           BMDMAInvalMapFunc *inval_cb,
+                           void *inval_opaque,
+                           target_phys_addr_t addr,
+                           target_phys_addr_t *len,
+                           int is_write);
+typedef void BMDMAUnmapFunc(void *opaque,
+                            void *buffer,
+                            target_phys_addr_t len,
+                            int is_write,
+                            target_phys_addr_t access_len);
+
 struct BMDMAState {
     uint8_t cmd;
     uint8_t status;
@@ -496,8 +514,29 @@  struct BMDMAState {
     int64_t sector_num;
     uint32_t nsector;
     QEMUBH *bh;
+
+    BMDMARWFunc *rw;
+    BMDMAMapFunc *map;
+    BMDMAUnmapFunc *unmap;
+    void *opaque;
 };
 
+static inline void bmdma_memory_read(BMDMAState *bm,
+                                     target_phys_addr_t addr,
+                                     uint8_t *buf,
+                                     target_phys_addr_t len)
+{
+    bm->rw(bm->opaque, addr, buf, len, 0);
+}
+
+static inline void bmdma_memory_write(BMDMAState *bm,
+                                      target_phys_addr_t addr,
+                                      uint8_t *buf,
+                                      target_phys_addr_t len)
+{
+    bm->rw(bm->opaque, addr, buf, len, 1);
+}
+
 static inline IDEState *idebus_active_if(IDEBus *bus)
 {
     return bus->ifs + bus->unit;
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index bd1c73e..962ae13 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -79,7 +79,7 @@  static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
 
     s->io_buffer_size = io->len;
 
-    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
+    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
     qemu_sglist_add(&s->sg, io->addr, io->len);
     io->addr += io->len;
     io->len = 0;
@@ -141,7 +141,7 @@  static void pmac_ide_transfer_cb(void *opaque, int ret)
     s->io_buffer_index = 0;
     s->io_buffer_size = io->len;
 
-    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1);
+    qemu_sglist_init(&s->sg, io->len / MACIO_PAGE_SIZE + 1, NULL, NULL, NULL);
     qemu_sglist_add(&s->sg, io->addr, io->len);
     io->addr += io->len;
     io->len = 0;
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index 4d95cc5..5879044 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -183,4 +183,11 @@  void pci_ide_create_devs(PCIDevice *dev, DriveInfo **hd_table)
             continue;
         ide_create_drive(d->bus+bus[i], unit[i], hd_table[i]);
     }
+
+    for (i = 0; i < 2; i++) {
+        d->bmdma[i].rw = (void *) pci_memory_rw;
+        d->bmdma[i].map = (void *) pci_memory_map;
+        d->bmdma[i].unmap = (void *) pci_memory_unmap;
+        d->bmdma[i].opaque = dev;
+    }
 }