Patchwork [v2] Basic Intel IOMMU DMAR emulation

login
register
mail settings
Submitter Nadav Amit
Date April 14, 2010, 10:21 a.m.
Message ID <6094116E-6CAD-4828-83F5-F4F22AE3C24F@gmail.com>
Download mbox | patch
Permalink /patch/50145/
State New
Headers show

Comments

Nadav Amit - April 14, 2010, 10:21 a.m.
This version addresses the compatibility issues raised by Paul Brooks, and 
loads the DMAR table without additional command line parameters.
Caching-mode is now disabled (it's not required for the current implementation).

---

This patch enables basic Intel IOMMU (VT-d) emulation for DMA remappings.
Register invalidation is supported, as well as partial queued invalidation. In
addition the structure allows other IOMMU architectures to easily connect to
the IOMMU indirection emulation.

In general, the patch emulates, on the one hand, the expected behavior of the
registers and, on the other, performs the translation of DMA accesses according
to the paging structures.

Currently only IDE device is supported (SCSI and E1000 should follow).

The current patch has the following (additional) limitations:
1. Queued invalidation does not support interrupts on wait descriptors
2. DMAR faults are not supported
3. No protection of read-only registers in the IOMMU.
4. No protection according to protection memory ranges

Signed-off-by: Nadav Amit <nadav.amit at gmail.com>
---
 Makefile.target  |    1 +
 cpu-common.h     |   26 ++
 dma-helpers.c    |   20 +-
 dma.h            |    6 +-
 exec.c           |  100 +++++++-
 hw/DMAR.dat      |  Bin 0 -> 28 bytes
 hw/ide/core.c    |   31 ++-
 hw/intel_iommu.c |  705 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/intel_iommu.h |   18 ++
 hw/pc.c          |    2 +
 vl.c             |    8 +
 11 files changed, 884 insertions(+), 33 deletions(-)
 create mode 100644 hw/DMAR.dat
 create mode 100644 hw/intel_iommu.c
 create mode 100644 hw/intel_iommu.h
Blue Swirl - April 14, 2010, 6:28 p.m.
On 4/14/10, Nadav Amit <nadav.amit@gmail.com> wrote:
> This version addresses the compatibility issues raised by Paul Brooks, and
>  loads the DMAR table without addition command line parameters.
>  Caching-mode is now disabled (it's not required for the current implementation).
>
>  ---
>
>  This patch enables basic Intel IOMMU (VT-d) emulation for DMA remappings.
>  Registers invalidation is supported, as well as partial queued invalidation. In
>  addition the structure allows other IOMMU architectures to easily connect to
>  the IOMMU indirection emulation.
>
>  In general the patch emulates on one hand the expected behavior of the
>  registers, and performs the translation of DMA accesses according to the paging
>  structures.
>
>  Currently only IDE device is supported (SCSI and E1000 should follow).
>
>  The current patch has the following (additional) limitations:
>  1. Queued invalidation does not support interrupts on wait descriptors
>  2. DMAR faults are not supported
>  3. No protection of read-only registers in the IOMMU.
>  4. No protection according to protection memory ranges

I think the approach is not generic enough. Every device must be
converted (which may be inevitable) but we are still limited to
translation depth of one. With the same conversion effort, I'd like to
have a stackable design, which means that static states can't be used.

For example, it would be great if the Sparc32 specific memory accesses
and PCI memory accesses could be combined in pcnet.c. On Sparc32, the
translation stack depth is greater than just one.

>
>  Signed-off-by: Nadav Amit <nadav.amit at gmail.com>

Malformed address.

>  ---
>   Makefile.target  |    1 +
>   cpu-common.h     |   26 ++
>   dma-helpers.c    |   20 +-
>   dma.h            |    6 +-
>   exec.c           |  100 +++++++-
>   hw/DMAR.dat      |  Bin 0 -> 28 bytes
>   hw/ide/core.c    |   31 ++-
>   hw/intel_iommu.c |  705 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/intel_iommu.h |   18 ++
>   hw/pc.c          |    2 +
>   vl.c             |    8 +
>   11 files changed, 884 insertions(+), 33 deletions(-)
>   create mode 100644 hw/DMAR.dat
>   create mode 100644 hw/intel_iommu.c
>   create mode 100644 hw/intel_iommu.h
>
>  diff --git a/Makefile.target b/Makefile.target
>  index 4d88543..79c9ff9 100644
>  --- a/Makefile.target
>  +++ b/Makefile.target
>  @@ -233,6 +233,7 @@ obj-i386-y += testdev.o
>
>   obj-i386-$(CONFIG_KVM_PIT) += i8254-kvm.o
>   obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += device-assignment.o
>  +obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += intel_iommu.o

CONFIG_KVM_DEVICE_ASSIGNMENT?

>
>   # Hardware support
>   obj-ia64-y += ide.o pckbd.o vga.o $(SOUND_HW) dma.o $(AUDIODRV)
>  diff --git a/cpu-common.h b/cpu-common.h
>  index 6cae15b..0484b40 100644
>  --- a/cpu-common.h
>  +++ b/cpu-common.h
>  @@ -12,6 +12,8 @@
>
>   #if !defined(CONFIG_USER_ONLY)
>
>  +struct DeviceState;

Is this needed? We have the typedef in qemu-common.h and the
definition in hw/qdev.h.

>  +
>   /* address in the RAM (different from a physical address) */
>   typedef unsigned long ram_addr_t;
>
>  @@ -47,6 +49,9 @@ void cpu_unregister_io_memory(int table_address);
>
>   void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>                              int len, int is_write);
>  +void cpu_physical_memory_rw_io(target_phys_addr_t addr, uint8_t *buf,
>  +                              int len, int is_write,
>  +                               struct DeviceState* qdev, int* err);

struct DeviceState *qdev, int *err

>   static inline void cpu_physical_memory_read(target_phys_addr_t addr,
>                                              uint8_t *buf, int len)
>   {
>  @@ -57,9 +62,30 @@ static inline void cpu_physical_memory_write(target_phys_addr_t addr,
>   {
>      cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1);
>   }
>  +static inline void cpu_physical_memory_read_io(target_phys_addr_t addr,
>  +                                            uint8_t *buf, int len,
>  +                                           struct DeviceState* qdev,
>  +                                           int* err)
>  +{
>  +    cpu_physical_memory_rw_io(addr, buf, len, 0, qdev, err);
>  +}
>  +static inline void cpu_physical_memory_write_io(target_phys_addr_t addr,
>  +                                            uint8_t *buf, int len,
>  +                                           struct DeviceState* qdev,
>  +                                           int* err)
>  +{
>  +    cpu_physical_memory_rw_io(addr, buf, len, 1, qdev, err);
>  +}
>  +
>  +void set_iommu_translation_fn(target_phys_addr_t(*fn)(target_phys_addr_t,
>  +    int, struct DeviceState*, int*));

Please name the parameters. I'd suppose you can typedef the function
pointer and use that here too.

>  +
>   void *cpu_physical_memory_map(target_phys_addr_t addr,
>                                target_phys_addr_t *plen,
>                                int is_write);
>  +void *cpu_physical_memory_map_io(target_phys_addr_t addr,
>  +                              target_phys_addr_t *plen,
>  +                              int is_write, struct DeviceState* qdev, int* err);
>   void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
>                                 int is_write, target_phys_addr_t access_len);
>   void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque));
>  diff --git a/dma-helpers.c b/dma-helpers.c
>  index d4fc077..2b8bcf1 100644
>  --- a/dma-helpers.c
>  +++ b/dma-helpers.c
>  @@ -47,6 +47,7 @@ typedef struct {
>      target_phys_addr_t sg_cur_byte;
>      QEMUIOVector iov;
>      QEMUBH *bh;
>  +    DeviceState* qdev;
>   } DMAAIOCB;
>
>   static void dma_bdrv_cb(void *opaque, int ret);
>  @@ -84,6 +85,7 @@ static void dma_bdrv_cb(void *opaque, int ret)
>      DMAAIOCB *dbs = (DMAAIOCB *)opaque;
>      target_phys_addr_t cur_addr, cur_len;
>      void *mem;
>  +    int err;
>
>      dbs->acb = NULL;
>      dbs->sector_num += dbs->iov.size / 512;
>  @@ -100,8 +102,9 @@ static void dma_bdrv_cb(void *opaque, int ret)
>      while (dbs->sg_cur_index < dbs->sg->nsg) {
>          cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
>          cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
>  -        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
>  -        if (!mem)
>  +        mem = cpu_physical_memory_map_io(cur_addr, &cur_len, !dbs->is_write,
>  +           dbs->qdev, &err);
>  +        if (!mem || err)
>              break;
>          qemu_iovec_add(&dbs->iov, mem, cur_len);
>          dbs->sg_cur_byte += cur_len;
>  @@ -147,7 +150,7 @@ static AIOPool dma_aio_pool = {
>   static BlockDriverAIOCB *dma_bdrv_io(
>      BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num,
>      BlockDriverCompletionFunc *cb, void *opaque,
>  -    int is_write)
>  +    int is_write, DeviceState* qdev)
>   {
>      DMAAIOCB *dbs =  qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
>
>  @@ -159,6 +162,7 @@ static BlockDriverAIOCB *dma_bdrv_io(
>      dbs->sg_cur_byte = 0;
>      dbs->is_write = is_write;
>      dbs->bh = NULL;
>  +    dbs->qdev = qdev;
>      qemu_iovec_init(&dbs->iov, sg->nsg);
>      /*
>       * DMA flushing is handled in dma_bdrv_cb() calling dma_bdrv_unmap()
>  @@ -175,14 +179,16 @@ static BlockDriverAIOCB *dma_bdrv_io(
>
>   BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
>                                  QEMUSGList *sg, uint64_t sector,
>  -                                void (*cb)(void *opaque, int ret), void *opaque)
>  +                                void (*cb)(void *opaque, int ret), void *opaque,
>  +                               DeviceState* qdev)
>   {
>  -    return dma_bdrv_io(bs, sg, sector, cb, opaque, 0);
>  +    return dma_bdrv_io(bs, sg, sector, cb, opaque, 0, qdev);
>   }
>
>   BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
>                                   QEMUSGList *sg, uint64_t sector,
>  -                                 void (*cb)(void *opaque, int ret), void *opaque)
>  +                                 void (*cb)(void *opaque, int ret), void *opaque,
>  +                                DeviceState* qdev)
>   {
>  -    return dma_bdrv_io(bs, sg, sector, cb, opaque, 1);
>  +    return dma_bdrv_io(bs, sg, sector, cb, opaque, 1, qdev);
>   }
>  diff --git a/dma.h b/dma.h
>  index f3bb275..231a2c1 100644
>  --- a/dma.h
>  +++ b/dma.h
>  @@ -34,8 +34,10 @@ void qemu_sglist_destroy(QEMUSGList *qsg);
>
>   BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
>                                  QEMUSGList *sg, uint64_t sector,
>  -                                BlockDriverCompletionFunc *cb, void *opaque);
>  +                                BlockDriverCompletionFunc *cb, void *opaque,
>  +                               DeviceState* qdev);
>   BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
>                                   QEMUSGList *sg, uint64_t sector,
>  -                                 BlockDriverCompletionFunc *cb, void *opaque);
>  +                                 BlockDriverCompletionFunc *cb, void *opaque,
>  +                                DeviceState* qdev);
>   #endif
>  diff --git a/exec.c b/exec.c
>  index de2fbea..b663f58 100644
>  --- a/exec.c
>  +++ b/exec.c
>  @@ -186,6 +186,24 @@ unsigned long qemu_real_host_page_size;
>   unsigned long qemu_host_page_bits;
>   unsigned long qemu_host_page_size;
>   unsigned long qemu_host_page_mask;
>  +target_phys_addr_t (*iommu_phy_addr_translate)(target_phys_addr_t addr,
>  +       int is_write, DeviceState* qdev, int* err);
>  +
>  +static target_phys_addr_t no_iommu_translate(target_phys_addr_t addr,
>  +    int is_write, DeviceState* qdev, int* err)
>  +{
>  +    return addr;
>  +}
>  +
>  +static void init_iommu_translation(void)
>  +{
>  +    iommu_phy_addr_translate = no_iommu_translate;
>  +}
>  +
>  +void set_iommu_translation_fn(target_phys_addr_t(*fn)(target_phys_addr_t, int, DeviceState*, int*))
>  +{
>  +    iommu_phy_addr_translate = fn;
>  +}
>
>   /* This is a multi-level map on the virtual address space.
>     The bottom level has pointers to PageDesc.  */
>  @@ -551,6 +569,7 @@ static void code_gen_alloc(unsigned long tb_size)
>     size. */
>   void cpu_exec_init_all(unsigned long tb_size)
>   {
>  +    init_iommu_translation();
>      cpu_gen_init();
>      code_gen_alloc(tb_size);
>      code_gen_ptr = code_gen_buffer;
>  @@ -3371,8 +3390,8 @@ int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
>   }
>
>   #else
>  -void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>  -                            int len, int is_write)
>  +static void __cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,

Please don't use names starting with underscores.

>  +                            int len, int is_write, DeviceState* qdev, int* err)
>   {
>      int l, io_index;
>      uint8_t *ptr;
>  @@ -3380,10 +3399,26 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>      target_phys_addr_t page;
>      unsigned long pd;
>      PhysPageDesc *p;
>  +    target_phys_addr_t addr0;
>  +    int single_err;
>  +    int temp;
>  +    if (err == NULL)
>  +        err = &temp;
>  +    *err = 0;
>  +
>
>      while (len > 0) {
>  -        page = addr & TARGET_PAGE_MASK;
>  -        l = (page + TARGET_PAGE_SIZE) - addr;
>  +       single_err = 0;
>  +       addr0 = addr;
>  +       if (qdev != NULL) {
>  +           *err = 0;
>  +           addr0 = iommu_phy_addr_translate(addr, is_write, qdev, &single_err);
>  +            *err |= single_err;
>  +            if (single_err)
>  +               continue;
>  +        }
>  +        page = addr0 & TARGET_PAGE_MASK;
>  +        l = (page + TARGET_PAGE_SIZE) - addr0;
>          if (l > len)
>              l = len;
>          p = phys_page_find(page >> TARGET_PAGE_BITS);
>  @@ -3395,10 +3430,10 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>
>          if (is_write) {
>              if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
>  -                target_phys_addr_t addr1 = addr;
>  +                target_phys_addr_t addr1 = addr0;
>                  io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
>                  if (p)
>  -                    addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset;
>  +                    addr1 = (addr0 & ~TARGET_PAGE_MASK) + p->region_offset;
>                  /* XXX: could force cpu_single_env to NULL to avoid
>                     potential bugs */
>                  if (l >= 4 && ((addr1 & 3) == 0)) {
>  @@ -3439,7 +3474,7 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>          } else {
>              if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
>                  !(pd & IO_MEM_ROMD)) {
>  -                target_phys_addr_t addr1 = addr;
>  +                target_phys_addr_t addr1 = addr0;
>                  /* I/O case */
>                  io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
>                  if (p)
>  @@ -3473,6 +3508,19 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>      }
>   }
>
>  +void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>  +                            int len, int is_write)
>  +{
>  +    __cpu_physical_memory_rw(addr, buf, len, is_write, NULL, NULL);
>  +}
>  +
>  +void cpu_physical_memory_rw_io(target_phys_addr_t addr, uint8_t *buf,
>  +                            int len, int is_write, DeviceState* qdev,
>  +                           int* err)
>  +{
>  +    __cpu_physical_memory_rw(addr, buf, len, is_write, qdev, err);
>  +}
>  +
>   /* used for ROM loading : can write in RAM and ROM */
>   void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>                                     const uint8_t *buf, int len)
>  @@ -3565,9 +3613,9 @@ static void cpu_notify_map_clients(void)
>   * Use cpu_register_map_client() to know when retrying the map operation is
>   * likely to succeed.
>   */
>  -void *cpu_physical_memory_map(target_phys_addr_t addr,
>  +static void *__cpu_physical_memory_map(target_phys_addr_t addr,
>                                target_phys_addr_t *plen,
>  -                              int is_write)
>  +                              int is_write, DeviceState* qdev, int* err)
>   {
>      target_phys_addr_t len = *plen;
>      target_phys_addr_t done = 0;
>  @@ -3578,10 +3626,18 @@ void *cpu_physical_memory_map(target_phys_addr_t addr,
>      unsigned long pd;
>      PhysPageDesc *p;
>      unsigned long addr1;
>  +    target_phys_addr_t addr0 = addr;
>  +    int temp;
>  +
>  +    if (err == NULL)
>  +       err = &temp;
>
>      while (len > 0) {
>  -        page = addr & TARGET_PAGE_MASK;
>  -        l = (page + TARGET_PAGE_SIZE) - addr;
>  +       addr0 = addr;
>  +       if (qdev != NULL)
>  +            addr0 = iommu_phy_addr_translate(addr, is_write, qdev, err);
>  +        page = addr0 & TARGET_PAGE_MASK;
>  +        l = (page + TARGET_PAGE_SIZE) - addr0;
>          if (l > len)
>              l = len;
>          p = phys_page_find(page >> TARGET_PAGE_BITS);
>  @@ -3596,14 +3652,14 @@ void *cpu_physical_memory_map(target_phys_addr_t addr,
>                  break;
>              }
>              bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);
>  -            bounce.addr = addr;
>  +            bounce.addr = addr0;
>              bounce.len = l;
>              if (!is_write) {
>  -                cpu_physical_memory_rw(addr, bounce.buffer, l, 0);
>  +                cpu_physical_memory_rw(addr0, bounce.buffer, l, 0);
>              }
>              ptr = bounce.buffer;
>          } else {
>  -            addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
>  +            addr1 = (pd & TARGET_PAGE_MASK) + (addr0 & ~TARGET_PAGE_MASK);
>              ptr = qemu_get_ram_ptr(addr1);
>          }
>          if (!done) {
>  @@ -3620,6 +3676,22 @@ void *cpu_physical_memory_map(target_phys_addr_t addr,
>      return ret;
>   }
>
>  +void *cpu_physical_memory_map_io(target_phys_addr_t addr,
>  +                              target_phys_addr_t *plen,
>  +                              int is_write, DeviceState* qdev, int* err)
>  +{
>  +     return __cpu_physical_memory_map(addr,
>  +            plen, is_write, qdev, err);
>  +}
>  +
>  +
>  +void *cpu_physical_memory_map(target_phys_addr_t addr,
>  +                              target_phys_addr_t *plen,
>  +                              int is_write)
>  +{
>  +    return __cpu_physical_memory_map(addr, plen, is_write, NULL, NULL);
>  +}
>  +
>   /* Unmaps a memory region previously mapped by cpu_physical_memory_map().
>   * Will also mark the memory as dirty if is_write == 1.  access_len gives
>   * the amount of memory that was actually read or written by the caller.
>  diff --git a/hw/DMAR.dat b/hw/DMAR.dat
>  new file mode 100644
>  index 0000000000000000000000000000000000000000..db5010ce79ab960b3cbd69a12c67863246d26538
>  GIT binary patch
>  literal 28
>  TcmY#pKn4N~j9~7?e;^6~4!8mY
>
>  literal 0
>  HcmV?d00001
>
>  diff --git a/hw/ide/core.c b/hw/ide/core.c
>  index 67480bb..4df7cd0 100644
>  --- a/hw/ide/core.c
>  +++ b/hw/ide/core.c
>  @@ -34,6 +34,7 @@
>   #include "dma.h"
>
>   #include <hw/ide/internal.h>
>  +#include <hw/ide/pci.h>
>
>   static int smart_attributes[][5] = {
>      /* id,  flags, val, wrst, thrsh */
>  @@ -432,7 +433,7 @@ static int dma_buf_prepare(BMDMAState *bm, int is_write)
>          uint32_t addr;
>          uint32_t size;
>      } prd;
>  -    int l, len;
>  +    int l, len, err;
>
>      qemu_sglist_init(&s->sg, s->nsector / (TARGET_PAGE_SIZE/512) + 1);
>      s->io_buffer_size = 0;
>  @@ -442,7 +443,10 @@ static int dma_buf_prepare(BMDMAState *bm, int is_write)
>              if (bm->cur_prd_last ||
>                  (bm->cur_addr - bm->addr) >= 4096)
>                  return s->io_buffer_size != 0;
>  -            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
>  +            cpu_physical_memory_read_io(bm->cur_addr, (uint8_t *)&prd, 8,
>  +               &bm->pci_dev->dev.qdev, &err);
>  +           if (err)
>  +               return 0;
>              bm->cur_addr += 8;
>              prd.addr = le32_to_cpu(prd.addr);
>              prd.size = le32_to_cpu(prd.size);
>  @@ -514,7 +518,7 @@ static int dma_buf_rw(BMDMAState *bm, int is_write)
>          uint32_t addr;
>          uint32_t size;
>      } prd;
>  -    int l, len;
>  +    int l, len, err;
>
>      for(;;) {
>          l = s->io_buffer_size - s->io_buffer_index;
>  @@ -525,7 +529,10 @@ static int dma_buf_rw(BMDMAState *bm, int is_write)
>              if (bm->cur_prd_last ||
>                  (bm->cur_addr - bm->addr) >= 4096)
>                  return 0;
>  -            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
>  +            cpu_physical_memory_read_io(bm->cur_addr, (uint8_t *)&prd, 8,
>  +               &bm->pci_dev->dev.qdev, &err);
>  +           if (err)
>  +               return 0;

New code should always use curly braces, see CODING_STYLE.

>              bm->cur_addr += 8;
>              prd.addr = le32_to_cpu(prd.addr);
>              prd.size = le32_to_cpu(prd.size);
>  @@ -540,11 +547,13 @@ static int dma_buf_rw(BMDMAState *bm, int is_write)
>              l = bm->cur_prd_len;
>          if (l > 0) {
>              if (is_write) {
>  -                cpu_physical_memory_write(bm->cur_prd_addr,
>  -                                          s->io_buffer + s->io_buffer_index, l);
>  +                cpu_physical_memory_write_io(bm->cur_prd_addr,
>  +                                          s->io_buffer + s->io_buffer_index, l,
>  +                                         &bm->pci_dev->dev.qdev, &err);
>              } else {
>  -                cpu_physical_memory_read(bm->cur_prd_addr,
>  -                                          s->io_buffer + s->io_buffer_index, l);
>  +                cpu_physical_memory_read_io(bm->cur_prd_addr,
>  +                                          s->io_buffer + s->io_buffer_index, l,
>  +                                         &bm->pci_dev->dev.qdev, &err);
>              }
>              bm->cur_prd_addr += l;
>              bm->cur_prd_len -= l;
>  @@ -600,7 +609,8 @@ static void ide_read_dma_cb(void *opaque, int ret)
>   #ifdef DEBUG_AIO
>      printf("aio_read: sector_num=%" PRId64 " n=%d\n", sector_num, n);
>   #endif
>  -    bm->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_read_dma_cb, bm);
>  +    bm->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_read_dma_cb, bm,
>  +                             &bm->pci_dev->dev.qdev);
>      ide_dma_submit_check(s, ide_read_dma_cb, bm);
>   }
>
>  @@ -746,7 +756,8 @@ static void ide_write_dma_cb(void *opaque, int ret)
>   #ifdef DEBUG_AIO
>      printf("aio_write: sector_num=%" PRId64 " n=%d\n", sector_num, n);
>   #endif
>  -    bm->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num, ide_write_dma_cb, bm);
>  +    bm->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num, ide_write_dma_cb, bm,
>  +                              &bm->pci_dev->dev.qdev);
>      ide_dma_submit_check(s, ide_write_dma_cb, bm);
>   }
>
>  diff --git a/hw/intel_iommu.c b/hw/intel_iommu.c
>  new file mode 100644
>  index 0000000..891597d
>  --- /dev/null
>  +++ b/hw/intel_iommu.c
>  @@ -0,0 +1,705 @@
>  +#include <stddef.h>
>  +#include <assert.h>
>  +#include <stdint.h>
>  +
>  +#include "intel_iommu.h"
>  +#include "../qemu-option.h"
>  +#include "pci.h"
>  +#include "pc.h"
>  +
>  +#define DMAR_DEVICES_NUM 1
>  +#define DMAR_REG_BASE 0xFED10000
>  +
>  +#define CIRG_GLOBAL_INVALIDATION               1
>  +#define CIRG_DOMAIN_SELECTIVE_INVALIDATION     2
>  +#define CIRG_DEVICE_SELECTIVE_INVALIDATION     3
>  +
>  +#define IIRG_GLOBAL_INVALIDATION               1
>  +#define IIRG_DOMAIN_SELECTIVE_INVALIDATION     2
>  +#define IIRG_DOMAIN_PAGE_SELECTIVE_INVALIDATION        3
>  +
>  +#define pr_debug(fmt, ...)                                             \
>  +       do {                                                            \
>  +               fprintf( stderr, "%s : " fmt "\n",                      \
>  +                       __func__, ##__VA_ARGS__);                       \
>  +               fflush (stderr);                                        \
>  +       } while (0)
>  +
>  +
>  +/* Registers definitions taken from Intel's IOMMU driver */
>  +#define        DMAR_VER_REG    0x0     /* Arch version supported by this IOMMU */
>  +#define        DMAR_CAP_REG    0x8     /* Hardware supported capabilities */
>  +#define        DMAR_ECAP_REG   0x10    /* Extended capabilities supported */
>  +#define        DMAR_GCMD_REG   0x18    /* Global command register */
>  +#define        DMAR_GSTS_REG   0x1c    /* Global status register */
>  +#define        DMAR_RTADDR_REG 0x20    /* Root entry table */
>  +#define        DMAR_CCMD_REG   0x28    /* Context command reg */
>  +#define        DMAR_FSTS_REG   0x34    /* Fault Status register */
>  +#define        DMAR_FECTL_REG  0x38    /* Fault control register */
>  +#define        DMAR_FEDATA_REG 0x3c    /* Fault event interrupt data register */
>  +#define        DMAR_FEADDR_REG 0x40    /* Fault event interrupt addr register */
>  +#define        DMAR_FEUADDR_REG 0x44   /* Upper address register */
>  +#define        DMAR_AFLOG_REG  0x58    /* Advanced Fault control */
>  +#define        DMAR_PMEN_REG   0x64    /* Enable Protected Memory Region */
>  +#define        DMAR_PLMBASE_REG 0x68   /* PMRR Low addr */
>  +#define        DMAR_PLMLIMIT_REG 0x6c  /* PMRR low limit */
>  +#define        DMAR_PHMBASE_REG 0x70   /* pmrr high base addr */
>  +#define        DMAR_PHMLIMIT_REG 0x78  /* pmrr high limit */
>  +#define DMAR_IQH_REG   0x80    /* Invalidation queue head register */
>  +#define DMAR_IQT_REG   0x88    /* Invalidation queue tail register */
>  +#define DMAR_IQ_SHIFT  4       /* Invalidation queue head/tail shift */
>  +#define DMAR_IQA_REG   0x90    /* Invalidation queue addr register */
>  +#define DMAR_ICS_REG   0x98    /* Invalidation complete status register */
>  +#define DMAR_IRTA_REG  0xb8    /* Interrupt remapping table addr register */
>  +#define DMAR_IVA_REG   0xc0
>  +#define DMAR_IOTLB_REG 0xc8
>  +#define DMAR_FRR_REG   0xd0
>  +
>  +#define field_mask(s,e)                ((1UL << (s))*((1UL << ((e)-(s)+1)) - 1))
>  +#define clear_field(r,s,e)     *(r) = ((*r) & ~field_mask(s,e))
>  +
>  +static inline uint8_t get_devfn(uint16_t sid)
>  +{
>  +       return (sid & 0xFF);
>  +}
>  +
>  +static inline uint8_t get_busnr(uint16_t sid)
>  +{
>  +       return (sid>>8)&0xFF;

Too few spaces.

>  +}
>  +
>  +static inline void read_reg32(uint8_t* dmar_regs, uint32_t reg,
>  +                             uint64_t* val)
>  +{
>  +       *val = le32_to_cpu(*((uint32_t*)(dmar_regs+reg)));
>  +}
>  +
>  +static inline void read_reg64(uint8_t* dmar_regs, uint32_t reg,
>  +                             uint64_t* val)
>  +{
>  +       *val = le64_to_cpu(*((uint64_t*)(dmar_regs+reg)));
>  +}
>  +
>  +static inline void write_reg32(uint8_t* dmar_regs, uint32_t reg,
>  +                              uint32_t val)
>  +{
>  +       *(uint32_t*)(dmar_regs+reg) = cpu_to_le32(val);
>  +}
>  +
>  +static inline void write_reg64(uint8_t* dmar_regs, uint32_t reg,
>  +                              uint64_t val)
>  +{
>  +       *(uint64_t*)(dmar_regs+reg) = cpu_to_le64(val);
>  +}

You should store the registers in native CPU format, there is no need
to convert endianness here. The "blob" approach should be replaced
with normal array access.

>  +
>  +#define IOMMU_SUB_FIELD(r,t,f,s,e)     \
>  +static inline void encode_ ## r ## _ ## f (uint64_t *l, uint64_t v) \
>  +{ \
>  +       uint64_t l_ = *((uint64_t*)l+((s)/64)); \
>  +       clear_field(&(l_), (s)%64, (e)%64); \
>  +       l_ |= (v<<((s)%64)) & field_mask((s)%64, (e)%64); \
>  +       *(((uint64_t*)l)+(s)/64) = l_; \

I'd use longer names and avoid also trailing underscores. Too few spaces.

>  +} \
>  +\
>  +static inline uint64_t decode_ ## r ## _ ## f (uint64_t *l) \
>  +{ \
>  +       return ((*(l+(s)/64)) & (field_mask((s)%64, (e)%64))) \
>  +               >> ((s)%64); \
>  +}
>  +
>  +
>  +#define IOMMU_REG_FIELD(r,f,s,e)       IOMMU_SUB_FIELD(r,reg,f,s,e)
>  +#define IOMMU_IQ_FIELD(r,f,s,e)                IOMMU_SUB_FIELD(r,iq,f,s,e)
>  +#define IOMMU_PTE_FIELD(r,f,s,e)       IOMMU_SUB_FIELD(r,pt,f,s,e)
>  +#define IOMMU_RE_FIELD(r,f,s,e)                IOMMU_SUB_FIELD(r,pt,f,s,e)
>  +#define IOMMU_CE_FIELD(r,f,s,e)                IOMMU_SUB_FIELD(r,pt,f,s,e)
>  +
>  +IOMMU_REG_FIELD(iqa, qs, 0, 2)

Interesting approach to field names. I'm afraid it's very different
from other QEMU code.

>  +IOMMU_REG_FIELD(iqa, iqa, 12, 63)
>  +IOMMU_REG_FIELD(iqh, qh, 4, 18)
>  +IOMMU_REG_FIELD(iqt, qt, 4, 18)
>  +IOMMU_REG_FIELD(ver, min, 0, 3)
>  +IOMMU_REG_FIELD(cap, cm, 7, 7)
>  +IOMMU_REG_FIELD(cap, sagaw, 8, 12)
>  +IOMMU_REG_FIELD(cap, mgaw, 16, 21)
>  +IOMMU_REG_FIELD(cap, psi, 39, 39)
>  +IOMMU_REG_FIELD(cap, nfr, 40, 47)
>  +IOMMU_REG_FIELD(cap, mamv, 48, 53)
>  +IOMMU_REG_FIELD(cap, fro, 24, 33)
>  +IOMMU_REG_FIELD(ecap, c, 0, 0)
>  +IOMMU_REG_FIELD(ecap, qi, 1, 1)
>  +IOMMU_REG_FIELD(ecap, ch, 5, 5)
>  +IOMMU_REG_FIELD(ecap, pt, 6, 6)
>  +IOMMU_REG_FIELD(ecap, iro, 8, 17)
>  +IOMMU_REG_FIELD(ccmd, caig, 59, 60)
>  +IOMMU_REG_FIELD(ccmd, cirg, 61, 62)
>  +IOMMU_REG_FIELD(ccmd, icc, 63, 63)
>  +IOMMU_REG_FIELD(pmen, prs, 0, 0)
>  +IOMMU_REG_FIELD(pmen, epm, 31, 31)
>  +IOMMU_REG_FIELD(gcmd, cfi, 23, 23)
>  +IOMMU_REG_FIELD(gcmd, sirtp, 24, 24)
>  +IOMMU_REG_FIELD(gcmd, ire, 25, 25)
>  +IOMMU_REG_FIELD(gcmd, qie, 26, 26)
>  +IOMMU_REG_FIELD(gcmd, wbf, 27, 27)
>  +IOMMU_REG_FIELD(gcmd, eafl, 28, 28)
>  +IOMMU_REG_FIELD(gcmd, sfl, 29, 29)
>  +IOMMU_REG_FIELD(gcmd, srtp, 30, 30)
>  +IOMMU_REG_FIELD(gcmd, te, 31, 31)
>  +IOMMU_REG_FIELD(gsts, cfis, 23, 23)
>  +IOMMU_REG_FIELD(gsts, irtps, 24, 24)
>  +IOMMU_REG_FIELD(gsts, ires, 25, 25)
>  +IOMMU_REG_FIELD(gsts, qies, 26, 26)
>  +IOMMU_REG_FIELD(gsts, wbfs, 27, 27)
>  +IOMMU_REG_FIELD(gsts, afls, 28, 28)
>  +IOMMU_REG_FIELD(gsts, fls, 29, 29)
>  +IOMMU_REG_FIELD(gsts, rtps, 30, 30)
>  +IOMMU_REG_FIELD(gsts, tes, 31, 31)
>  +IOMMU_REG_FIELD(iotlb, iaig, 57, 58)
>  +IOMMU_REG_FIELD(iotlb, iirg, 60, 61)
>  +IOMMU_REG_FIELD(iotlb, ivt, 63, 63)
>  +IOMMU_REG_FIELD(iva, am, 0, 5)
>  +IOMMU_REG_FIELD(iva, ih, 6, 6)
>  +IOMMU_REG_FIELD(iva, addr, 12, 63)
>  +IOMMU_REG_FIELD(rtaddr, rta, 12, 63)
>  +
>  +enum { MGAW = 48};
>  +enum { FRCD_REG_NUM = 4 };
>  +enum { DMAR_REGS_PAGE = 0x1000, DMAR_REGS_PAGE_SHIFT = 12, LEVEL_STRIDE=9,
>  +       VTD_PAGE_SHIFT=12} ;
>  +enum { DEVFN_ENTRIES_NUM = 256 };
>  +enum { IOMMU_WRITE = 2, IOMMU_READ = 1};
>  +
>  +#define VTD_PAGE_SIZE (1<<VTD_PAGE_SHIFT)
>  +
>  +/* Mapping related structures */
>  +
>  +enum { TRASNLATE_UNTRANSLATED = 0, TRANSLATE_ALL = 1, TRANSLATE_PASS_THROUGH = 2, TRANSLATE_RESERVED = 3};

TRASNLATE is a typo.

>  +
>  +static inline void convert_le128_to_cpu(uint64_t u128[2])
>  +{
>  +       *u128 = le64_to_cpu(*u128);
>  +       *(u128+1) = le64_to_cpu(*(u128+1));
>  +}

This can't be correct on big endian hosts.

>  +
>  +IOMMU_PTE_FIELD(pte, r, 0, 0)
>  +IOMMU_PTE_FIELD(pte, w, 1, 1)
>  +IOMMU_PTE_FIELD(pte, sp, 7, 7)
>  +IOMMU_PTE_FIELD(pte, snp, 11, 11)
>  +IOMMU_PTE_FIELD(pte, addr, 12, 51)
>  +IOMMU_PTE_FIELD(pte, tm, 62, 62)
>  +
>  +IOMMU_CE_FIELD(ce, p, 0, 0)
>  +IOMMU_CE_FIELD(ce, t, 2, 3)
>  +IOMMU_CE_FIELD(ce, asr, 12, 63)
>  +IOMMU_CE_FIELD(ce, aw, 64, 66)
>  +
>  +IOMMU_RE_FIELD(re, p, 0, 0)
>  +IOMMU_RE_FIELD(re, ctp, 12, 63)
>  +
>  +IOMMU_IQ_FIELD(iq_desc, id, 0, 3)
>  +IOMMU_IQ_FIELD(iq_desc, iflag, 4, 4)
>  +IOMMU_IQ_FIELD(iq_desc, sw, 5, 5)
>  +IOMMU_IQ_FIELD(iq_desc, wait_stat_addr, 66, 127)
>  +IOMMU_IQ_FIELD(iq_desc, wait_stat_data, 32, 63)
>  +
>  +enum {SPS_21_BIT = 1, SPS_30_BIT = 2, SPS_39_BIT = 4, SPS_48_BIT = 8}; /* sagaw values */
>  +enum {AGAW_30_BIT = 1, AGAW_39_BIT = 2, AGAW_48_BIT = 4, AGAW_57_BIT = 8, AGAW_64_BIT = 16}; /* agaw values */
>  +
>  +/* Invalidation queue descriptors */
>  +enum {         IQ_CONTEXT_INVD_DESC_ID = 1,
>  +       IQ_IOTLB_INVD_DESC_ID = 2,
>  +       IQ_DEV_IOTLB_INVD_DESC_ID = 3,
>  +       IQ_INT_CACHE_INV_DESC_ID = 4,
>  +       IQ_INVD_WAIT_DESC_ID = 5 };
>  +
>  +enum {DMAR_REGS_SIZE = 0x100};
>  +
>  +struct dmar_status
>  +{
>  +       uint8_t dmar_regs[DMAR_REGS_SIZE];
>  +};
>  +
>  +struct iommu_state {
>  +       int mmio_index;
>  +       int invalidation_queue;
>  +       int enabled;
>  +       struct dmar_status dmar_status[DMAR_DEVICES_NUM];
>  +};
>  +
>  +static struct iommu_state iommu_state;                         /* Static state */

Static states should be avoided.

>  +
>  +static
>  +uint64_t __iommu_phy_addr_translate(target_phys_addr_t addr,
>  +                            uint8_t* access_perm, DeviceState* qdev,
>  +                           uint64_t* size);

Leading underscores.

>  +
>  +static
>  +int intel_iommu_process_iq(struct dmar_status* dmar_status);
>  +
>  +static
>  +void iommu_reset_regs(uint8_t* dmar) {
>  +       uint64_t ecap=0, cap=0,  ver=0;
>  +       memset(dmar, 0, sizeof(*dmar));
>  +       encode_ver_min(&ver, 1);
>  +       write_reg32(dmar, DMAR_VER_REG, ver);
>  +       encode_cap_mgaw(&cap, MGAW-1);
>  +       encode_cap_sagaw(&cap, AGAW_30_BIT | AGAW_39_BIT | AGAW_48_BIT);
>  +       encode_cap_nfr(&cap, FRCD_REG_NUM);
>  +       encode_cap_psi(&cap, 1); /* enabling page selective invalidation */
>  +       encode_cap_mamv(&cap, 63); /* setting mamv to its maximal value */
>  +       encode_cap_cm(&cap, 0); /* caching mode is disabled to support Linux */
>  +       encode_cap_fro(&cap, DMAR_FRR_REG / 16);
>  +       write_reg64(dmar, DMAR_CAP_REG, cap);
>  +       encode_ecap_pt(&ecap, 1);
>  +       encode_ecap_ch(&ecap, 1);
>  +       encode_ecap_c(&ecap, 1);
>  +       encode_ecap_iro(&ecap, DMAR_IVA_REG / 16);
>  +       encode_ecap_qi(&ecap, iommu_state.invalidation_queue);
>  +       write_reg64(dmar, DMAR_ECAP_REG, ecap);
>  +}
>  +
>  +static void
>  +intel_iommu_write_status_update(struct dmar_status* dmar_status,
>  +                               target_phys_addr_t addr)
>  +{
>  +       int offset = addr & 0xFFF & ~3;
>  +       uint8_t* dmar_regs = dmar_status->dmar_regs;
>  +
>  +       switch (offset)
>  +       {

Braces must go to previous line.

>  +       case DMAR_GCMD_REG:     {

I'd put the brace to next line, CODING_STYLE does not cover this.

>  +               uint64_t gsts;
>  +               uint64_t gcmd;
>  +               read_reg32(dmar_regs, DMAR_GCMD_REG, &gcmd);
>  +               read_reg32(dmar_regs, DMAR_GSTS_REG, &gsts);
>  +               encode_gsts_fls(&gsts, decode_gcmd_sfl(&gcmd));
>  +               if (decode_gcmd_srtp(&gcmd))
>  +                       encode_gsts_rtps(&gsts, 1);

Missing braces.

>  +               encode_gsts_wbfs(&gsts, decode_gcmd_wbf(&gcmd));
>  +               encode_gsts_tes(&gsts, decode_gcmd_te(&gcmd));
>  +               encode_gcmd_srtp(&gcmd, 0);
>  +               if (iommu_state.invalidation_queue && decode_gsts_qies(&gsts) == 0 &&
>  +                       decode_gcmd_qie(&gcmd) == 1) {
>  +                       encode_gsts_qies(&gsts, 1);
>  +               }
>  +               write_reg32(dmar_regs, DMAR_GSTS_REG, gsts);
>  +               write_reg32(dmar_regs, DMAR_GCMD_REG, gcmd);
>  +               break;
>  +               }

The indentation seems to be off, braces can't be on the same level as
preceding code block.

>  +       case DMAR_CCMD_REG:
>  +       case DMAR_CCMD_REG+4: {
>  +               uint64_t ccmd;
>  +               read_reg64(dmar_regs, DMAR_CCMD_REG, &ccmd);
>  +               encode_ccmd_caig(&ccmd, decode_ccmd_cirg(&ccmd));
>  +               encode_ccmd_icc(&ccmd, 0);
>  +               write_reg64(dmar_regs, DMAR_CCMD_REG, ccmd);
>  +               break;
>  +               }
>  +       case DMAR_PMEN_REG: {
>  +               uint64_t pmen;
>  +               read_reg32(dmar_regs, DMAR_PMEN_REG, &pmen);
>  +               encode_pmen_prs(&pmen, decode_pmen_epm(&pmen));
>  +               write_reg32(dmar_regs, DMAR_PMEN_REG, pmen);
>  +               break;
>  +               }
>  +       case DMAR_IOTLB_REG:
>  +       case DMAR_IOTLB_REG+4:
>  +       case DMAR_IVA_REG:
>  +       case DMAR_IVA_REG+4: {
>  +               uint64_t iotlb;
>  +               uint64_t iva;
>  +               read_reg64(dmar_regs, DMAR_IOTLB_REG, &iotlb);
>  +               read_reg64(dmar_regs, DMAR_IVA_REG, &iva);
>  +               if (decode_iotlb_ivt(&iotlb) == 1) {
>  +                       encode_iva_addr(&iva, 0);
>  +                       encode_iva_am(&iva, 0);
>  +                       encode_iva_ih(&iva, 0);
>  +                       encode_iotlb_iaig(&iotlb, decode_iotlb_iirg(&iotlb));
>  +                       encode_iotlb_ivt(&iotlb, 0);
>  +                       write_reg64(dmar_regs, DMAR_IOTLB_REG, iotlb);
>  +                       write_reg64(dmar_regs, DMAR_IVA_REG, iva);
>  +               }
>  +               break;
>  +               }
>  +       case DMAR_IQT_REG:
>  +       case DMAR_IQT_REG+4: {
>  +               uint64_t gsts;
>  +               read_reg32(dmar_regs, DMAR_GSTS_REG, &gsts);
>  +               if (iommu_state.invalidation_queue &&
>  +                       decode_gsts_qies(&gsts) == 1)
>  +                       intel_iommu_process_iq(dmar_status);
>  +               break;
>  +               }
>  +       default:
>  +               ;
>  +       }
>  +
>  +}
>  +
>  +
>  +/*
>  +Software is expected to access 32-bit registers as aligned doublewords. For example, to modify a
>  +field (e.g., bit or byte) in a 32-bit register, the entire doubleword is read, the appropriate field(s)
>  +are modified, and the entire doubleword is written back.
>  +*/
>  +
>  +static void
>  +intel_iommu_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>  +{
>  +       struct iommu_state* d = opaque;
>  +       unsigned int offset = (addr & 0xFFF);
>  +       unsigned int dmar_idx = (addr & ~(DMAR_REGS_PAGE-1)) >> DMAR_REGS_PAGE_SHIFT;
>  +       uint8_t* dmar_regs;
>  +
>  +       if (dmar_idx >= DMAR_DEVICES_NUM)
>  +               return;
>  +
>  +       dmar_regs = d->dmar_status[dmar_idx].dmar_regs;
>  +       if (offset > DMAR_REGS_SIZE-4)
>  +               return;                 /* outside the registers area - reserved */
>  +       *(uint32_t*)(((void*)dmar_regs)+offset ) = val;

Ugly, please use normal array access.

>  +       intel_iommu_write_status_update(&(d->dmar_status[dmar_idx]), addr);
>  +}
>  +
>  +static void
>  +intel_iommu_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>  +{
>  +    // emulate hw without byte enables: no RMW

CODING_STYLE does not allow C99 comments.

>  +    intel_iommu_writel(opaque, addr & ~3,
>  +                      (val & 0xffff) << (8*(addr & 3)));
>  +}
>  +
>  +static void
>  +intel_iommu_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>  +{
>  +    // emulate hw without byte enables: no RMW
>  +    intel_iommu_writel(opaque, addr & ~3,
>  +                      (val & 0xff) << (8*(addr & 3)));
>  +}
>  +
>  +
>  +static uint32_t
>  +intel_iommu_readl(void *opaque, target_phys_addr_t addr)
>  +{
>  +    struct iommu_state *d = opaque;
>  +    unsigned int offset = (addr & 0xfff);
>  +       unsigned int dmar_idx = (addr & ~(DMAR_REGS_PAGE-1)) >> DMAR_REGS_PAGE_SHIFT;

The indentation goes wild here, compare to previous line.

>  +       if (offset > DMAR_REGS_SIZE-4) {
>  +               return 0; /* outside the boundary of the registers */
>  +       }
>  +       return *(uint32_t*)(d->dmar_status[dmar_idx].dmar_regs+offset);
>  +}
>  +
>  +static uint32_t
>  +intel_iommu_readb(void *opaque, target_phys_addr_t addr)
>  +{
>  +    return ((intel_iommu_readl(opaque, addr & ~3)) >>
>  +            (8 * (addr & 3))) & 0xff;
>  +}
>  +
>  +static uint32_t
>  +intel_iommu_readw(void *opaque, target_phys_addr_t addr)
>  +{
>  +    return ((intel_iommu_readl(opaque, addr & ~3)) >>
>  +            (8 * (addr & 3))) & 0xffff;
>  +}
>  +
>  +static
>  +void iommu_save(QEMUFile *f, void *opaque)
>  +{
>  +       const unsigned char *d = opaque;
>  +       int i;
>  +       for (i=0; i<sizeof(struct iommu_state); i++) {
>  +               qemu_put_8s(f, &d[i]);
>  +       }
>  +}
>  +
>  +static
>  +int iommu_load(QEMUFile *f, void *opaque, int version_id)
>  +{
>  +       uint8_t *d = opaque;
>  +       int i;
>  +       for (i=0; i<sizeof(struct iommu_state); i++) {
>  +               qemu_get_8s(f, &d[i]);

Does not work: this restores the raw bytes of struct iommu_state, which also
overwrites run-time fields such as mmio_index (the io-memory handle assigned
at init time), so the loaded state is invalid. Save and load the register
contents field by field (or use the VMState infrastructure) instead.

>  +       }
>  +       return 0;
>  +}
>  +
>  +
>  +static
>  +void iommu_reset(void *opaque)
>  +{
>  +       int i;
>  +       struct iommu_state *iommu_state = opaque;
>  +       for (i=0; i<DMAR_DEVICES_NUM; i++)
>  +       {
>  +               iommu_reset_regs(iommu_state->dmar_status[i].dmar_regs);
>  +       }
>  +
>  +}
>  +
>  +static CPUWriteMemoryFunc *intel_iommu_write[] = {

Missing const.

>  +    intel_iommu_writeb,        intel_iommu_writew, intel_iommu_writel

Please put each function to a separate line.

>  +};
>  +
>  +static CPUReadMemoryFunc *intel_iommu_read[] = {
>  +    intel_iommu_readb, intel_iommu_readw, intel_iommu_readl
>  +};
>  +
>  +static
>  +void iommu_mmio_map(struct iommu_state* iommu_state) {
>  +    target_phys_addr_t addr = DMAR_REG_BASE;
>  +    ram_addr_t regs_size = DMAR_REGS_PAGE * DMAR_DEVICES_NUM;
>  +    iommu_state->mmio_index = cpu_register_io_memory(intel_iommu_read,
>  +          intel_iommu_write, iommu_state);
>  +    cpu_register_physical_memory(addr, regs_size, iommu_state->mmio_index);
>  +
>  +}
>  +
>  +int intel_iommu_configure(const char *opt) {

No — please don't add ad-hoc option-string parsing (strcmp on "on"/"queue")
here, and don't hard-code a host-side file path via
acpi_table_add("sig=DMAR,data=hw/DMAR.dat"). Use QemuOpts / qdev properties
for configuration instead.

>  +    struct iommu_state *is = &iommu_state;
>  +    if (!opt)
>  +       return -1;
>  +    if (!strcmp(opt, "on"))
>  +       is->enabled = 1;
>  +    else if (!strcmp(opt, "queue")) {
>  +       is->enabled = 1;
>  +       is->invalidation_queue = 1;
>  +    }
>  +    else {

} else {

>  +       return -1;
>  +    }
>  +
>  +    if (is->enabled)
>  +       acpi_table_add("sig=DMAR,data=hw/DMAR.dat");
>  +
>  +    return 0;
>  +}
>  +
>  +
>  +/* Translation related functions */
>  +static inline int aw_to_levels(int aw) {
>  +       return aw+2;
>  +}
>  +
>  +
>  +static inline int level_shift(int level) {
>  +       return VTD_PAGE_SHIFT+(level*LEVEL_STRIDE);
>  +}
>  +
>  +static inline uint64_t level_size(int level) {
>  +       return (1ULL<<level_shift(level));
>  +}
>  +
>  +static inline uint64_t level_offset(uint64_t addr, int level) {
>  +       return (addr >> level_shift(level))&((1ULL<<LEVEL_STRIDE)-1);
>  +}
>  +
>  +static inline uint64_t level_mask(int level) {
>  +       return (((uint64_t)1)<<level_shift(level))-1;
>  +}
>  +
>  +/* Main translation function */
>  +static
>  +target_phys_addr_t __iommu_phy_addr_translate(target_phys_addr_t addr,
>  +                            uint8_t* access_perm, DeviceState* qdev, uint64_t* size)
>  +{
>  +       uint64_t pte;
>  +       int level;
>  +       uint8_t* dmar_regs = (iommu_state.dmar_status[0].dmar_regs);
>  +       uint64_t temp, addr_offset, gsts, rtaddr, pte_addr;
>  +       PCIDevice* dev = DO_UPCAST(PCIDevice, qdev, qdev);
>  +       uint8_t devfn = get_devfn(dev->devfn);
>  +       uint8_t busnr = get_busnr(dev->devfn);
>  +       uint64_t re[2], ce[2];
>  +       read_reg32(dmar_regs, DMAR_GSTS_REG, &gsts);
>  +       if (!size)
>  +               size = &temp;
>  +       *access_perm = IOMMU_READ | IOMMU_WRITE;
>  +       /* Reading the root entry */
>  +       read_reg64(dmar_regs, DMAR_RTADDR_REG, &rtaddr);
>  +
>  +       cpu_physical_memory_rw(rtaddr+(busnr*sizeof(re)), (uint8_t *)re, sizeof(re), 0);
>  +       convert_le128_to_cpu(re);
>  +
>  +       if (!decode_gsts_tes(&gsts)/*dmar_regs->gsts.tes*/) {

Strange comments.

>  +               return addr;                            /* Translation is disabled */
>  +       }
>  +
>  +       if (!decode_gsts_rtps(&gsts)/*dmar_regs->gsts.rtps*/) {
>  +               *access_perm = 0;
>  +               return -1;
>  +       }
>  +       if (!decode_re_p(re)) {
>  +               *access_perm = 0;
>  +               return -1;
>  +       }
>  +
>  +       /* Reading the relevant context entry */
>  +       cpu_physical_memory_rw(((decode_re_ctp(re)<<VTD_PAGE_SHIFT)+devfn*sizeof(ce)),
>  +                       (uint8_t *)ce, sizeof(ce), 0);
>  +       convert_le128_to_cpu(ce);
>  +
>  +       if (!decode_ce_p(ce)) {
>  +               *access_perm = 0;
>  +               return -1;
>  +       }
>  +
>  +       if (decode_ce_t(ce) == TRANSLATE_RESERVED) {
>  +               *access_perm = 0;
>  +               return -1;
>  +       }
>  +       if (decode_ce_t(ce) == TRANSLATE_PASS_THROUGH) {
>  +               return addr;
>  +       }
>  +
>  +       /* Analyzing the levels number from the context */
>  +       level = aw_to_levels(decode_ce_aw(ce));
>  +
>  +       pte_addr = decode_ce_asr(ce) << DMAR_REGS_PAGE_SHIFT;
>  +
>  +       while (level>0 && *access_perm != 0) {
>  +               pte_addr += level_offset(addr, level-1)*sizeof(pte);
>  +               cpu_physical_memory_rw(pte_addr,
>  +                               (uint8_t *)&pte, sizeof(pte), 0);
>  +               pte = le64_to_cpu(pte);
>  +               pte_addr = decode_pte_addr(&pte);
>  +               pte_addr <<= VTD_PAGE_SHIFT;
>  +               if (!decode_pte_r(&pte))
>  +                       (*access_perm) &= (~IOMMU_READ);
>  +               if (!decode_pte_w(&pte))
>  +                       (*access_perm) &= (~IOMMU_WRITE);
>  +               if (decode_pte_sp(&pte))
>  +                       break;          /* Super page */
>  +               level--;
>  +       }
>  +       addr_offset = addr&level_mask(level);
>  +       *size = level_size(level) - addr_offset;
>  +       /* shift left also if super-page */
>  +       return (pte_addr<<(LEVEL_STRIDE*level))+(addr&level_mask(level));
>  +}
>  +
>  +static uint64_t intel_iommu_phy_addr_translate(target_phys_addr_t addr,
>  +                            int is_write, DeviceState* qdev, int* err)
>  +{
>  +       uint8_t access_perm;
>  +       uint16_t req_access_perm = is_write ? IOMMU_WRITE : IOMMU_READ;
>  +       uint64_t phy_addr;
>  +
>  +       phy_addr = __iommu_phy_addr_translate(addr, &access_perm, qdev, NULL);
>  +       if (err && (req_access_perm & access_perm)==0)
>  +               *err = -1;
>  +
>  +       return phy_addr;
>  +}
>  +
>  +
>  +static inline
>  +uint8_t* get_dmar_regs(struct dmar_status* dmar_status,
>  +       uint16_t dmar_unit)
>  +{
>  +       return dmar_status->dmar_regs;
>  +}
>  +
>  +static
>  +void intel_iommu_iq_wait(struct dmar_status* dmar_status,
>  +       uint64_t desc[2])
>  +{
>  +       if (decode_iq_desc_iflag(desc)) {
>  +               pr_debug("interrupts are not supported");
>  +               return;
>  +       }
>  +       if (decode_iq_desc_sw((desc))) {
>  +               uint32_t data = decode_iq_desc_wait_stat_data(desc);
>  +               uint64_t addr =
>  +                       decode_iq_desc_wait_stat_addr(desc);
>  +               cpu_physical_memory_rw(addr << 2,
>  +                       (uint8_t *)&data, sizeof(data), 1);
>  +       }
>  +}
>  +
>  +static
>  +int iommu_process_iq_desc (uint64_t desc[2],
>  +       struct dmar_status* dmar_status)
>  +{
>  +       int r = 0;
>  +       uint32_t id = decode_iq_desc_id(desc);
>  +       switch (id)
>  +       {
>  +       case IQ_CONTEXT_INVD_DESC_ID: {
>  +               break;
>  +               }
>  +       case IQ_IOTLB_INVD_DESC_ID: {
>  +               break;
>  +               }
>  +       case IQ_INVD_WAIT_DESC_ID: {
>  +               intel_iommu_iq_wait(dmar_status, desc);
>  +               break;
>  +               }
>  +       case IQ_DEV_IOTLB_INVD_DESC_ID: {
>  +               pr_debug("IQ_DEV_IOTLB_INVD_DESC_ID is not implemented");
>  +               break;
>  +               }
>  +       case IQ_INT_CACHE_INV_DESC_ID: {
>  +               pr_debug("IQ_INT_CACHE_INV_DESC_ID is not implemented");
>  +               break;
>  +               }
>  +       default:
>  +               r = -1;
>  +               pr_debug("invalid descriptor id %x", id);
>  +       }
>  +       return r;
>  +}
>  +
>  +static
>  +int intel_iommu_process_iq(struct dmar_status* dmar_status)
>  +{
>  +       uint8_t* dmar_regs =
>  +               dmar_status->dmar_regs;
>  +       uint64_t iqa, iqt, iqh;
>  +       uint64_t qt, qh;
>  +       size_t len, desc_size;
>  +       target_phys_addr_t map_len, qaddr;
>  +       uint64 desc[2];
>  +
>  +       read_reg64(dmar_regs, DMAR_IQA_REG, &iqa);
>  +       read_reg64(dmar_regs, DMAR_IQT_REG, &iqt);
>  +       read_reg64(dmar_regs, DMAR_IQH_REG, &iqh);
>  +       len = 1 << (8 + decode_iqa_qs(&iqa));
>  +       desc_size = sizeof(desc);
>  +       map_len = len * desc_size;
>  +       qaddr = decode_iqa_iqa(&iqa) << VTD_PAGE_SHIFT;
>  +       qh = decode_iqh_qh(&iqh);
>  +       qt = decode_iqt_qt(&iqt);
>  +
>  +       if (map_len < len * desc_size) {
>  +               pr_debug("could not map entire invalidation queue");
>  +               exit(1);
>  +       }
>  +
>  +       while (qh != qt) {
>  +               cpu_physical_memory_rw(qaddr + qh*desc_size,
>  +                       (uint8_t *)&desc, sizeof(desc), 0);
>  +               convert_le128_to_cpu(desc);
>  +               iommu_process_iq_desc(desc, dmar_status);
>  +               qh = (qh+1)%len;
>  +       }
>  +       encode_iqh_qh(&iqh, qh);
>  +       write_reg64(dmar_regs, DMAR_IQH_REG, iqh);
>  +       return 0;
>  +}
>  +
>  +void intel_iommu_init(void) {
>  +    struct iommu_state *is = &iommu_state;
>  +    static const char info_str[] = "intel-iommu";
>  +    if (!is->enabled)
>  +       return;
>  +
>  +    set_iommu_translation_fn(intel_iommu_phy_addr_translate);
>  +    iommu_mmio_map(is);
>  +    register_savevm(info_str, -1, 1, iommu_save, iommu_load, is);
>  +    qemu_register_reset(iommu_reset, is);
>  +    iommu_reset(is);
>  +}
>  +
>  diff --git a/hw/intel_iommu.h b/hw/intel_iommu.h
>  new file mode 100644
>  index 0000000..e8235c7
>  --- /dev/null
>  +++ b/hw/intel_iommu.h
>  @@ -0,0 +1,18 @@
>  +#ifndef _INTEL_IOMMU_H_
>  +#define _INTEL_IOMMU_H_

Please use QEMU_HW_INTEL_IOMMU_H.

>  +
>  +#include "hw.h"
>  +
>  +struct QemuOpts;
>  +
>  +/* Init the IOMMU emulation */
>  +void intel_iommu_init(void);
>  +
>  +/* Configure the iommu */
>  +int intel_iommu_configure(const char* opt);
>  +
>  +/* Translate the physical address */
>  +/*uint64_t iommu_phy_addr_translate(target_phys_addr_t addr,
>  +                            int is_write, int devfn, int* err);
>  +*/
>  +#endif
>  diff --git a/hw/pc.c b/hw/pc.c
>  index 0aebae9..2652ab5 100644
>  --- a/hw/pc.c
>  +++ b/hw/pc.c
>  @@ -46,6 +46,7 @@
>   #include "elf.h"
>   #include "multiboot.h"
>   #include "device-assignment.h"
>  +#include "intel_iommu.h"
>
>   #include "kvm.h"
>
>  @@ -1028,6 +1029,7 @@ static void pc_init1(ram_addr_t ram_size,
>
>      isa_dev = isa_create_simple("i8042");
>      DMA_init(0);
>  +    intel_iommu_init();
>   #ifdef HAS_AUDIO
>      audio_init(pci_enabled ? pci_bus : NULL, isa_irq);
>   #endif
>  diff --git a/vl.c b/vl.c
>  index d959fdb..613a58e 100644
>  --- a/vl.c
>  +++ b/vl.c
>  @@ -136,6 +136,7 @@ int main(int argc, char **argv)
>   #include "hw/xen.h"
>   #include "hw/qdev.h"
>   #include "hw/loader.h"
>  +#include "hw/intel_iommu.h"
>   #include "bt-host.h"
>   #include "net.h"
>   #include "net/slirp.h"
>  @@ -5750,6 +5751,13 @@ int main(int argc, char **argv, char **envp)
>                      fclose(fp);
>                      break;
>                  }
>  +           case QEMU_OPTION_intel_iommu:
>  +               {
>  +                   if (intel_iommu_configure(optarg) < 0) {
>  +                       fprintf(stderr, "Wrong intel_iommu configuration\n");
>  +                       exit(1);
>  +                   }
>  +               }
>              }
>          }
>      }
>
> --
>  1.6.3.3
>
>
>
>
>

Patch

diff --git a/Makefile.target b/Makefile.target
index 4d88543..79c9ff9 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -233,6 +233,7 @@  obj-i386-y += testdev.o
 
 obj-i386-$(CONFIG_KVM_PIT) += i8254-kvm.o
 obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += device-assignment.o
+obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += intel_iommu.o
 
 # Hardware support
 obj-ia64-y += ide.o pckbd.o vga.o $(SOUND_HW) dma.o $(AUDIODRV)
diff --git a/cpu-common.h b/cpu-common.h
index 6cae15b..0484b40 100644
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -12,6 +12,8 @@ 
 
 #if !defined(CONFIG_USER_ONLY)
 
+struct DeviceState;
+
 /* address in the RAM (different from a physical address) */
 typedef unsigned long ram_addr_t;
 
@@ -47,6 +49,9 @@  void cpu_unregister_io_memory(int table_address);
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                             int len, int is_write);
+void cpu_physical_memory_rw_io(target_phys_addr_t addr, uint8_t *buf,
+			       int len, int is_write, 
+                               struct DeviceState* qdev, int* err); 
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,
                                             uint8_t *buf, int len)
 {
@@ -57,9 +62,30 @@  static inline void cpu_physical_memory_write(target_phys_addr_t addr,
 {
     cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1);
 }
+static inline void cpu_physical_memory_read_io(target_phys_addr_t addr,
+                                            uint8_t *buf, int len, 
+					    struct DeviceState* qdev, 
+					    int* err)
+{
+    cpu_physical_memory_rw_io(addr, buf, len, 0, qdev, err);
+}
+static inline void cpu_physical_memory_write_io(target_phys_addr_t addr,
+                                            uint8_t *buf, int len,
+					    struct DeviceState* qdev, 
+					    int* err)
+{
+    cpu_physical_memory_rw_io(addr, buf, len, 1, qdev, err);
+}
+
+void set_iommu_translation_fn(target_phys_addr_t(*fn)(target_phys_addr_t, 
+    int, struct DeviceState*, int*));
+
 void *cpu_physical_memory_map(target_phys_addr_t addr,
                               target_phys_addr_t *plen,
                               int is_write);
+void *cpu_physical_memory_map_io(target_phys_addr_t addr,
+                              target_phys_addr_t *plen,
+                              int is_write, struct DeviceState* qdev, int* err);
 void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
                                int is_write, target_phys_addr_t access_len);
 void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque));
diff --git a/dma-helpers.c b/dma-helpers.c
index d4fc077..2b8bcf1 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -47,6 +47,7 @@  typedef struct {
     target_phys_addr_t sg_cur_byte;
     QEMUIOVector iov;
     QEMUBH *bh;
+    DeviceState* qdev;
 } DMAAIOCB;
 
 static void dma_bdrv_cb(void *opaque, int ret);
@@ -84,6 +85,7 @@  static void dma_bdrv_cb(void *opaque, int ret)
     DMAAIOCB *dbs = (DMAAIOCB *)opaque;
     target_phys_addr_t cur_addr, cur_len;
     void *mem;
+    int err;
 
     dbs->acb = NULL;
     dbs->sector_num += dbs->iov.size / 512;
@@ -100,8 +102,9 @@  static void dma_bdrv_cb(void *opaque, int ret)
     while (dbs->sg_cur_index < dbs->sg->nsg) {
         cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
         cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
-        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
-        if (!mem)
+        mem = cpu_physical_memory_map_io(cur_addr, &cur_len, !dbs->is_write,
+	    dbs->qdev, &err);
+        if (!mem || err)
             break;
         qemu_iovec_add(&dbs->iov, mem, cur_len);
         dbs->sg_cur_byte += cur_len;
@@ -147,7 +150,7 @@  static AIOPool dma_aio_pool = {
 static BlockDriverAIOCB *dma_bdrv_io(
     BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num,
     BlockDriverCompletionFunc *cb, void *opaque,
-    int is_write)
+    int is_write, DeviceState* qdev)
 {
     DMAAIOCB *dbs =  qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
 
@@ -159,6 +162,7 @@  static BlockDriverAIOCB *dma_bdrv_io(
     dbs->sg_cur_byte = 0;
     dbs->is_write = is_write;
     dbs->bh = NULL;
+    dbs->qdev = qdev;
     qemu_iovec_init(&dbs->iov, sg->nsg);
     /*
      * DMA flushing is handled in dma_bdrv_cb() calling dma_bdrv_unmap()
@@ -175,14 +179,16 @@  static BlockDriverAIOCB *dma_bdrv_io(
 
 BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
-                                void (*cb)(void *opaque, int ret), void *opaque)
+                                void (*cb)(void *opaque, int ret), void *opaque,
+				DeviceState* qdev)
 {
-    return dma_bdrv_io(bs, sg, sector, cb, opaque, 0);
+    return dma_bdrv_io(bs, sg, sector, cb, opaque, 0, qdev);
 }
 
 BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
                                  QEMUSGList *sg, uint64_t sector,
-                                 void (*cb)(void *opaque, int ret), void *opaque)
+                                 void (*cb)(void *opaque, int ret), void *opaque,
+				 DeviceState* qdev)
 {
-    return dma_bdrv_io(bs, sg, sector, cb, opaque, 1);
+    return dma_bdrv_io(bs, sg, sector, cb, opaque, 1, qdev);
 }
diff --git a/dma.h b/dma.h
index f3bb275..231a2c1 100644
--- a/dma.h
+++ b/dma.h
@@ -34,8 +34,10 @@  void qemu_sglist_destroy(QEMUSGList *qsg);
 
 BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
-                                BlockDriverCompletionFunc *cb, void *opaque);
+                                BlockDriverCompletionFunc *cb, void *opaque,
+				DeviceState* qdev);
 BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
                                  QEMUSGList *sg, uint64_t sector,
-                                 BlockDriverCompletionFunc *cb, void *opaque);
+                                 BlockDriverCompletionFunc *cb, void *opaque,
+				 DeviceState* qdev);
 #endif
diff --git a/exec.c b/exec.c
index de2fbea..b663f58 100644
--- a/exec.c
+++ b/exec.c
@@ -186,6 +186,24 @@  unsigned long qemu_real_host_page_size;
 unsigned long qemu_host_page_bits;
 unsigned long qemu_host_page_size;
 unsigned long qemu_host_page_mask;
+target_phys_addr_t (*iommu_phy_addr_translate)(target_phys_addr_t addr,
+	int is_write, DeviceState* qdev, int* err);
+
+static target_phys_addr_t no_iommu_translate(target_phys_addr_t addr, 
+    int is_write, DeviceState* qdev, int* err)
+{
+    return addr;
+}
+
+static void init_iommu_translation(void)
+{
+    iommu_phy_addr_translate = no_iommu_translate;
+}
+
+void set_iommu_translation_fn(target_phys_addr_t(*fn)(target_phys_addr_t, int, DeviceState*, int*))
+{
+    iommu_phy_addr_translate = fn;
+}
 
 /* This is a multi-level map on the virtual address space.
    The bottom level has pointers to PageDesc.  */
@@ -551,6 +569,7 @@  static void code_gen_alloc(unsigned long tb_size)
    size. */
 void cpu_exec_init_all(unsigned long tb_size)
 {
+    init_iommu_translation();
     cpu_gen_init();
     code_gen_alloc(tb_size);
     code_gen_ptr = code_gen_buffer;
@@ -3371,8 +3390,8 @@  int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
 }
 
 #else
-void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+static void __cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
+                            int len, int is_write, DeviceState* qdev, int* err)
 {
     int l, io_index;
     uint8_t *ptr;
@@ -3380,10 +3399,26 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
     target_phys_addr_t page;
     unsigned long pd;
     PhysPageDesc *p;
+    target_phys_addr_t addr0;
+    int single_err;
+    int temp;
+    if (err == NULL)
+        err = &temp;
+    *err = 0;
+
 
     while (len > 0) {
-        page = addr & TARGET_PAGE_MASK;
-        l = (page + TARGET_PAGE_SIZE) - addr;
+	single_err = 0;
+	addr0 = addr;
+	if (qdev != NULL) {
+	    *err = 0;
+	    addr0 = iommu_phy_addr_translate(addr, is_write, qdev, &single_err);
+            *err |= single_err;
+            if (single_err)
+        	continue;
+        }
+        page = addr0 & TARGET_PAGE_MASK;
+        l = (page + TARGET_PAGE_SIZE) - addr0;
         if (l > len)
             l = len;
         p = phys_page_find(page >> TARGET_PAGE_BITS);
@@ -3395,10 +3430,10 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
 
         if (is_write) {
             if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
-                target_phys_addr_t addr1 = addr;
+                target_phys_addr_t addr1 = addr0;
                 io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
                 if (p)
-                    addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset;
+                    addr1 = (addr0 & ~TARGET_PAGE_MASK) + p->region_offset;
                 /* XXX: could force cpu_single_env to NULL to avoid
                    potential bugs */
                 if (l >= 4 && ((addr1 & 3) == 0)) {
@@ -3439,7 +3474,7 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
         } else {
             if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
                 !(pd & IO_MEM_ROMD)) {
-                target_phys_addr_t addr1 = addr;
+                target_phys_addr_t addr1 = addr0;
                 /* I/O case */
                 io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
                 if (p)
@@ -3473,6 +3508,19 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
     }
 }
 
+/* CPU-initiated access: no originating device, so no IOMMU translation. */
+void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
+                            int len, int is_write)
+{
+    __cpu_physical_memory_rw(addr, buf, len, is_write, NULL, NULL);
+}
+
+/* Device-initiated (DMA) access: 'qdev' selects the IOMMU context; a
+ * translation fault is reported through *err (may be NULL). */
+void cpu_physical_memory_rw_io(target_phys_addr_t addr, uint8_t *buf,
+                            int len, int is_write, DeviceState* qdev, 
+			    int* err)
+{
+    __cpu_physical_memory_rw(addr, buf, len, is_write, qdev, err);
+}
+
 /* used for ROM loading : can write in RAM and ROM */
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len)
@@ -3565,9 +3613,9 @@  static void cpu_notify_map_clients(void)
  * Use cpu_register_map_client() to know when retrying the map operation is
  * likely to succeed.
  */
-void *cpu_physical_memory_map(target_phys_addr_t addr,
+/* Worker for the map entry points; translates each page through the IOMMU
+ * when 'qdev' is non-NULL.
+ * NOTE(review): when qdev == NULL a caller-supplied *err is never written,
+ * so such a caller would read an indeterminate value — confirm all callers
+ * pass err == NULL in that case, or zero *err up front. */
+static void *__cpu_physical_memory_map(target_phys_addr_t addr,
                               target_phys_addr_t *plen,
-                              int is_write)
+                              int is_write, DeviceState* qdev, int* err)
 {
     target_phys_addr_t len = *plen;
     target_phys_addr_t done = 0;
@@ -3578,10 +3626,18 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
     unsigned long pd;
     PhysPageDesc *p;
     unsigned long addr1;
+    target_phys_addr_t addr0 = addr;
+    int temp;
+
+    if (err == NULL)
+	err = &temp;
 
     while (len > 0) {
-        page = addr & TARGET_PAGE_MASK;
-        l = (page + TARGET_PAGE_SIZE) - addr;
+	addr0 = addr;
+	if (qdev != NULL)
+	     addr0 = iommu_phy_addr_translate(addr, is_write, qdev, err);
+        page = addr0 & TARGET_PAGE_MASK;
+        l = (page + TARGET_PAGE_SIZE) - addr0;
         if (l > len)
             l = len;
         p = phys_page_find(page >> TARGET_PAGE_BITS);
@@ -3596,14 +3652,14 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
                 break;
             }
             bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);
-            bounce.addr = addr;
+            bounce.addr = addr0;
             bounce.len = l;
             if (!is_write) {
-                cpu_physical_memory_rw(addr, bounce.buffer, l, 0);
+                cpu_physical_memory_rw(addr0, bounce.buffer, l, 0);
             }
             ptr = bounce.buffer;
         } else {
-            addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
+            addr1 = (pd & TARGET_PAGE_MASK) + (addr0 & ~TARGET_PAGE_MASK);
             ptr = qemu_get_ram_ptr(addr1);
         }
         if (!done) {
@@ -3620,6 +3676,22 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
     return ret;
 }
 
+/* As cpu_physical_memory_map(), but translating 'addr' through the IOMMU
+ * on behalf of device 'qdev'; a translation fault is reported via *err. */
+void *cpu_physical_memory_map_io(target_phys_addr_t addr,
+                              target_phys_addr_t *plen,
+                              int is_write, DeviceState* qdev, int* err)
+{
+     return __cpu_physical_memory_map(addr,
+            plen, is_write, qdev, err);
+}
+
+
+/* CPU-initiated mapping: bypasses IOMMU translation. */
+void *cpu_physical_memory_map(target_phys_addr_t addr,
+                              target_phys_addr_t *plen,
+                              int is_write)
+{
+    return __cpu_physical_memory_map(addr, plen, is_write, NULL, NULL);
+}
+
 /* Unmaps a memory region previously mapped by cpu_physical_memory_map().
  * Will also mark the memory as dirty if is_write == 1.  access_len gives
  * the amount of memory that was actually read or written by the caller.
diff --git a/hw/DMAR.dat b/hw/DMAR.dat
new file mode 100644
index 0000000000000000000000000000000000000000..db5010ce79ab960b3cbd69a12c67863246d26538
GIT binary patch
literal 28
TcmY#pKn4N~j9~7?e;^6~4!8mY

literal 0
HcmV?d00001

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 67480bb..4df7cd0 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -34,6 +34,7 @@ 
 #include "dma.h"
 
 #include <hw/ide/internal.h>
+#include <hw/ide/pci.h>
 
 static int smart_attributes[][5] = {
     /* id,  flags, val, wrst, thrsh */
@@ -432,7 +433,7 @@  static int dma_buf_prepare(BMDMAState *bm, int is_write)
         uint32_t addr;
         uint32_t size;
     } prd;
-    int l, len;
+    int l, len, err;
 
     qemu_sglist_init(&s->sg, s->nsector / (TARGET_PAGE_SIZE/512) + 1);
     s->io_buffer_size = 0;
@@ -442,7 +443,10 @@  static int dma_buf_prepare(BMDMAState *bm, int is_write)
             if (bm->cur_prd_last ||
                 (bm->cur_addr - bm->addr) >= 4096)
                 return s->io_buffer_size != 0;
-            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+            cpu_physical_memory_read_io(bm->cur_addr, (uint8_t *)&prd, 8,
+		&bm->pci_dev->dev.qdev, &err);
+	    if (err)
+		return 0;
             bm->cur_addr += 8;
             prd.addr = le32_to_cpu(prd.addr);
             prd.size = le32_to_cpu(prd.size);
@@ -514,7 +518,7 @@  static int dma_buf_rw(BMDMAState *bm, int is_write)
         uint32_t addr;
         uint32_t size;
     } prd;
-    int l, len;
+    int l, len, err;
 
     for(;;) {
         l = s->io_buffer_size - s->io_buffer_index;
@@ -525,7 +529,10 @@  static int dma_buf_rw(BMDMAState *bm, int is_write)
             if (bm->cur_prd_last ||
                 (bm->cur_addr - bm->addr) >= 4096)
                 return 0;
-            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+            cpu_physical_memory_read_io(bm->cur_addr, (uint8_t *)&prd, 8,
+		&bm->pci_dev->dev.qdev, &err);
+	    if (err)
+		return 0;
             bm->cur_addr += 8;
             prd.addr = le32_to_cpu(prd.addr);
             prd.size = le32_to_cpu(prd.size);
@@ -540,11 +547,13 @@  static int dma_buf_rw(BMDMAState *bm, int is_write)
             l = bm->cur_prd_len;
         if (l > 0) {
             if (is_write) {
-                cpu_physical_memory_write(bm->cur_prd_addr,
-                                          s->io_buffer + s->io_buffer_index, l);
+                cpu_physical_memory_write_io(bm->cur_prd_addr,
+                                          s->io_buffer + s->io_buffer_index, l,
+					  &bm->pci_dev->dev.qdev, &err);
             } else {
-                cpu_physical_memory_read(bm->cur_prd_addr,
-                                          s->io_buffer + s->io_buffer_index, l);
+                cpu_physical_memory_read_io(bm->cur_prd_addr,
+                                          s->io_buffer + s->io_buffer_index, l,
+					  &bm->pci_dev->dev.qdev, &err);
             }
             bm->cur_prd_addr += l;
             bm->cur_prd_len -= l;
@@ -600,7 +609,8 @@  static void ide_read_dma_cb(void *opaque, int ret)
 #ifdef DEBUG_AIO
     printf("aio_read: sector_num=%" PRId64 " n=%d\n", sector_num, n);
 #endif
-    bm->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_read_dma_cb, bm);
+    bm->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_read_dma_cb, bm,
+			      &bm->pci_dev->dev.qdev);
     ide_dma_submit_check(s, ide_read_dma_cb, bm);
 }
 
@@ -746,7 +756,8 @@  static void ide_write_dma_cb(void *opaque, int ret)
 #ifdef DEBUG_AIO
     printf("aio_write: sector_num=%" PRId64 " n=%d\n", sector_num, n);
 #endif
-    bm->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num, ide_write_dma_cb, bm);
+    bm->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num, ide_write_dma_cb, bm,
+			       &bm->pci_dev->dev.qdev);
     ide_dma_submit_check(s, ide_write_dma_cb, bm);
 }
 
diff --git a/hw/intel_iommu.c b/hw/intel_iommu.c
new file mode 100644
index 0000000..891597d
--- /dev/null
+++ b/hw/intel_iommu.c
@@ -0,0 +1,705 @@ 
+#include <stddef.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "intel_iommu.h"
+#include "../qemu-option.h" 
+#include "pci.h"
+#include "pc.h"
+
+#define DMAR_DEVICES_NUM 1
+#define DMAR_REG_BASE 0xFED10000
+
+#define CIRG_GLOBAL_INVALIDATION 		1
+#define CIRG_DOMAIN_SELECTIVE_INVALIDATION	2
+#define CIRG_DEVICE_SELECTIVE_INVALIDATION	3
+
+#define IIRG_GLOBAL_INVALIDATION		1
+#define IIRG_DOMAIN_SELECTIVE_INVALIDATION	2
+#define IIRG_DOMAIN_PAGE_SELECTIVE_INVALIDATION	3
+
+#define pr_debug(fmt, ...) 						\
+	do {								\
+        	fprintf( stderr, "%s : " fmt "\n",			\
+			__func__, ##__VA_ARGS__);			\
+		fflush (stderr); 					\
+	} while (0)
+
+
+/* Registers definitions taken from Intel's IOMMU driver */
+#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
+#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
+#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
+#define	DMAR_GCMD_REG	0x18	/* Global command register */
+#define	DMAR_GSTS_REG	0x1c	/* Global status register */
+#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
+#define	DMAR_CCMD_REG	0x28	/* Context command reg */
+#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
+#define	DMAR_FECTL_REG	0x38	/* Fault control register */
+#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
+#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
+#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
+#define	DMAR_AFLOG_REG	0x58	/* Advanced Fault control */
+#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
+#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
+#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
+#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
+#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
+#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
+#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
+#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
+#define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
+#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
+#define DMAR_IVA_REG	0xc0
+#define DMAR_IOTLB_REG	0xc8
+#define DMAR_FRR_REG	0xd0
+
+#define field_mask(s,e)		((1UL << (s))*((1UL << ((e)-(s)+1)) - 1))
+#define clear_field(r,s,e)	*(r) = ((*r) & ~field_mask(s,e))
+
+/* Low 8 bits of a VT-d source-id: the PCI device/function number. */
+static inline uint8_t get_devfn(uint16_t sid) 
+{
+	return (sid & 0xFF); 
+}
+
+/* Bits 15:8 of a VT-d source-id: the PCI bus number.
+ * NOTE(review): the only caller passes dev->devfn (8 bits wide), so this
+ * always yields bus 0 — the source-id should carry the real bus number. */
+static inline uint8_t get_busnr(uint16_t sid) 
+{
+	return (sid>>8)&0xFF;
+}
+
+/* The register file is stored in guest (little-endian) byte order; these
+ * helpers convert on every access.  Reads return through a uint64_t
+ * out-parameter even for 32-bit registers so callers can share code. */
+static inline void read_reg32(uint8_t* dmar_regs, uint32_t reg,
+			      uint64_t* val)
+{
+	*val = le32_to_cpu(*((uint32_t*)(dmar_regs+reg)));
+} 
+
+static inline void read_reg64(uint8_t* dmar_regs, uint32_t reg,
+			      uint64_t* val)
+{
+	*val = le64_to_cpu(*((uint64_t*)(dmar_regs+reg)));
+}
+
+static inline void write_reg32(uint8_t* dmar_regs, uint32_t reg,
+			       uint32_t val)
+{
+	*(uint32_t*)(dmar_regs+reg) = cpu_to_le32(val);
+}
+
+static inline void write_reg64(uint8_t* dmar_regs, uint32_t reg,
+			       uint64_t val)
+{
+	*(uint64_t*)(dmar_regs+reg) = cpu_to_le64(val);
+}
+
+/* Generate encode_<r>_<f>()/decode_<r>_<f>() accessors for a bit-field at
+ * absolute bit positions s..e of a little-endian value laid out as an array
+ * of uint64_t words (registers, page-table entries, 128-bit root/context
+ * entries and queue descriptors).  The word is selected by s/64, so a field
+ * must not straddle a 64-bit word boundary. */
+#define IOMMU_SUB_FIELD(r,t,f,s,e)	\
+static inline void encode_ ## r ## _ ## f (uint64_t *l, uint64_t v) \
+{ \
+	uint64_t l_ = *((uint64_t*)l+((s)/64)); \
+	clear_field(&(l_), (s)%64, (e)%64); \
+	l_ |= (v<<((s)%64)) & field_mask((s)%64, (e)%64); \
+	*(((uint64_t*)l)+(s)/64) = l_; \
+} \
+\
+static inline uint64_t decode_ ## r ## _ ## f (uint64_t *l) \
+{ \
+	return ((*(l+(s)/64)) & (field_mask((s)%64, (e)%64))) \
+		>> ((s)%64); \
+}
+
+
+#define IOMMU_REG_FIELD(r,f,s,e)	IOMMU_SUB_FIELD(r,reg,f,s,e)
+#define IOMMU_IQ_FIELD(r,f,s,e)		IOMMU_SUB_FIELD(r,iq,f,s,e)
+#define IOMMU_PTE_FIELD(r,f,s,e)	IOMMU_SUB_FIELD(r,pt,f,s,e)
+#define IOMMU_RE_FIELD(r,f,s,e)		IOMMU_SUB_FIELD(r,pt,f,s,e)
+#define IOMMU_CE_FIELD(r,f,s,e)		IOMMU_SUB_FIELD(r,pt,f,s,e)
+
+IOMMU_REG_FIELD(iqa, qs, 0, 2)
+IOMMU_REG_FIELD(iqa, iqa, 12, 63)
+IOMMU_REG_FIELD(iqh, qh, 4, 18)
+IOMMU_REG_FIELD(iqt, qt, 4, 18)
+IOMMU_REG_FIELD(ver, min, 0, 3)
+IOMMU_REG_FIELD(cap, cm, 7, 7)
+IOMMU_REG_FIELD(cap, sagaw, 8, 12)
+IOMMU_REG_FIELD(cap, mgaw, 16, 21)
+IOMMU_REG_FIELD(cap, psi, 39, 39)
+IOMMU_REG_FIELD(cap, nfr, 40, 47)
+IOMMU_REG_FIELD(cap, mamv, 48, 53)
+IOMMU_REG_FIELD(cap, fro, 24, 33)
+IOMMU_REG_FIELD(ecap, c, 0, 0)
+IOMMU_REG_FIELD(ecap, qi, 1, 1)
+IOMMU_REG_FIELD(ecap, ch, 5, 5)
+IOMMU_REG_FIELD(ecap, pt, 6, 6)
+IOMMU_REG_FIELD(ecap, iro, 8, 17)
+IOMMU_REG_FIELD(ccmd, caig, 59, 60)
+IOMMU_REG_FIELD(ccmd, cirg, 61, 62)
+IOMMU_REG_FIELD(ccmd, icc, 63, 63)
+IOMMU_REG_FIELD(pmen, prs, 0, 0)
+IOMMU_REG_FIELD(pmen, epm, 31, 31)
+IOMMU_REG_FIELD(gcmd, cfi, 23, 23)
+IOMMU_REG_FIELD(gcmd, sirtp, 24, 24)
+IOMMU_REG_FIELD(gcmd, ire, 25, 25)
+IOMMU_REG_FIELD(gcmd, qie, 26, 26)
+IOMMU_REG_FIELD(gcmd, wbf, 27, 27)
+IOMMU_REG_FIELD(gcmd, eafl, 28, 28)
+IOMMU_REG_FIELD(gcmd, sfl, 29, 29)
+IOMMU_REG_FIELD(gcmd, srtp, 30, 30)
+IOMMU_REG_FIELD(gcmd, te, 31, 31)
+IOMMU_REG_FIELD(gsts, cfis, 23, 23)
+IOMMU_REG_FIELD(gsts, irtps, 24, 24)
+IOMMU_REG_FIELD(gsts, ires, 25, 25)
+IOMMU_REG_FIELD(gsts, qies, 26, 26)
+IOMMU_REG_FIELD(gsts, wbfs, 27, 27)
+IOMMU_REG_FIELD(gsts, afls, 28, 28)
+IOMMU_REG_FIELD(gsts, fls, 29, 29)
+IOMMU_REG_FIELD(gsts, rtps, 30, 30)
+IOMMU_REG_FIELD(gsts, tes, 31, 31)
+IOMMU_REG_FIELD(iotlb, iaig, 57, 58)
+IOMMU_REG_FIELD(iotlb, iirg, 60, 61)
+IOMMU_REG_FIELD(iotlb, ivt, 63, 63)
+IOMMU_REG_FIELD(iva, am, 0, 5)
+IOMMU_REG_FIELD(iva, ih, 6, 6)
+IOMMU_REG_FIELD(iva, addr, 12, 63)
+IOMMU_REG_FIELD(rtaddr, rta, 12, 63)
+
+enum { MGAW = 48};
+enum { FRCD_REG_NUM = 4 };
+enum { DMAR_REGS_PAGE = 0x1000, DMAR_REGS_PAGE_SHIFT = 12, LEVEL_STRIDE=9, 
+	VTD_PAGE_SHIFT=12} ;
+enum { DEVFN_ENTRIES_NUM = 256 };
+enum { IOMMU_WRITE = 2, IOMMU_READ = 1};
+
+#define VTD_PAGE_SIZE (1<<VTD_PAGE_SHIFT)
+
+/* Mapping related structures */
+
+/* Context-entry translation types (CE.T field).
+ * BUGFIX: fixed the 'TRASNLATE' typo in the first constant (it is not yet
+ * referenced anywhere, so the rename is safe). */
+enum { TRANSLATE_UNTRANSLATED = 0, TRANSLATE_ALL = 1, TRANSLATE_PASS_THROUGH = 2, TRANSLATE_RESERVED = 3};
+
+/* Byte-swap a 128-bit little-endian value (two uint64_t words) in place. */
+static inline void convert_le128_to_cpu(uint64_t u128[2]) 
+{
+	*u128 = le64_to_cpu(*u128);
+	*(u128+1) = le64_to_cpu(*(u128+1));
+}
+
+IOMMU_PTE_FIELD(pte, r, 0, 0)
+IOMMU_PTE_FIELD(pte, w, 1, 1)
+IOMMU_PTE_FIELD(pte, sp, 7, 7)
+IOMMU_PTE_FIELD(pte, snp, 11, 11)
+IOMMU_PTE_FIELD(pte, addr, 12, 51)
+IOMMU_PTE_FIELD(pte, tm, 62, 62)
+
+IOMMU_CE_FIELD(ce, p, 0, 0)
+IOMMU_CE_FIELD(ce, t, 2, 3)
+IOMMU_CE_FIELD(ce, asr, 12, 63)
+IOMMU_CE_FIELD(ce, aw, 64, 66)
+
+IOMMU_RE_FIELD(re, p, 0, 0)
+IOMMU_RE_FIELD(re, ctp, 12, 63)
+
+IOMMU_IQ_FIELD(iq_desc, id, 0, 3)
+IOMMU_IQ_FIELD(iq_desc, iflag, 4, 4)
+IOMMU_IQ_FIELD(iq_desc, sw, 5, 5)
+IOMMU_IQ_FIELD(iq_desc, wait_stat_addr, 66, 127)
+IOMMU_IQ_FIELD(iq_desc, wait_stat_data, 32, 63)
+
+enum {SPS_21_BIT = 1, SPS_30_BIT = 2, SPS_39_BIT = 4, SPS_48_BIT = 8}; /* sagaw values */
+enum {AGAW_30_BIT = 1, AGAW_39_BIT = 2, AGAW_48_BIT = 4, AGAW_57_BIT = 8, AGAW_64_BIT = 16}; /* agaw values */
+
+/* Invalidation queue descriptors */
+enum { 	IQ_CONTEXT_INVD_DESC_ID = 1, 
+	IQ_IOTLB_INVD_DESC_ID = 2, 
+	IQ_DEV_IOTLB_INVD_DESC_ID = 3,
+	IQ_INT_CACHE_INV_DESC_ID = 4,
+	IQ_INVD_WAIT_DESC_ID = 5 };
+
+enum {DMAR_REGS_SIZE = 0x100};
+
+/* Per-DMAR-unit state: just the raw little-endian register file. */
+struct dmar_status
+{
+	uint8_t dmar_regs[DMAR_REGS_SIZE];
+};
+
+/* Global emulation state covering all DMAR units. */
+struct iommu_state {
+	int mmio_index;		/* handle from cpu_register_io_memory() */
+	int invalidation_queue;	/* non-zero when queued invalidation enabled */
+	int enabled;		/* set by intel_iommu_configure() */
+	struct dmar_status dmar_status[DMAR_DEVICES_NUM];
+};
+
+static struct iommu_state iommu_state; 			/* Static state */
+
+static
+uint64_t __iommu_phy_addr_translate(target_phys_addr_t addr,
+                            uint8_t* access_perm, DeviceState* qdev, 
+			    uint64_t* size);
+
+static 
+int intel_iommu_process_iq(struct dmar_status* dmar_status);
+
+/* Program a DMAR unit's register file to its reset state: zero everything,
+ * then fill in the read-only VER/CAP/ECAP capability registers. */
+static 
+void iommu_reset_regs(uint8_t* dmar) {
+	uint64_t ecap=0, cap=0,  ver=0;
+	/* BUGFIX: 'dmar' is a uint8_t pointer, so sizeof(*dmar) is 1 and only
+	 * the first byte was cleared; zero the whole register block. */
+	memset(dmar, 0, DMAR_REGS_SIZE);
+	encode_ver_min(&ver, 1);
+	write_reg32(dmar, DMAR_VER_REG, ver);
+	encode_cap_mgaw(&cap, MGAW-1);
+	encode_cap_sagaw(&cap, AGAW_30_BIT | AGAW_39_BIT | AGAW_48_BIT);
+	encode_cap_nfr(&cap, FRCD_REG_NUM);
+	encode_cap_psi(&cap, 1); /* enabling page selective invalidation */
+	encode_cap_mamv(&cap, 63); /* setting mamv to its maximal value */
+	encode_cap_cm(&cap, 0); /* caching mode is disabled to support Linux */
+	encode_cap_fro(&cap, DMAR_FRR_REG / 16);
+	write_reg64(dmar, DMAR_CAP_REG, cap);
+	encode_ecap_pt(&ecap, 1);
+	encode_ecap_ch(&ecap, 1);
+	encode_ecap_c(&ecap, 1);
+	encode_ecap_iro(&ecap, DMAR_IVA_REG / 16);
+	encode_ecap_qi(&ecap, iommu_state.invalidation_queue);
+	write_reg64(dmar, DMAR_ECAP_REG, ecap);
+}
+
+/* Emulate the side effects of a guest write at 'addr' (the raw value has
+ * already been stored into the register file by the MMIO handler):
+ * propagate GCMD command bits into GSTS, complete CCMD/IOTLB/PMEN
+ * operations immediately, and drain the invalidation queue when the tail
+ * register moves. */
+static void
+intel_iommu_write_status_update(struct dmar_status* dmar_status, 
+				target_phys_addr_t addr)
+{
+	int offset = addr & 0xFFF & ~3;	/* doubleword-aligned register offset */
+	uint8_t* dmar_regs = dmar_status->dmar_regs;
+
+	switch (offset)
+	{
+	case DMAR_GCMD_REG:	{
+		uint64_t gsts;
+		uint64_t gcmd;
+		read_reg32(dmar_regs, DMAR_GCMD_REG, &gcmd);
+		read_reg32(dmar_regs, DMAR_GSTS_REG, &gsts);
+		/* Commands complete instantly in this model: mirror each
+		 * command bit into its status bit. */
+		encode_gsts_fls(&gsts, decode_gcmd_sfl(&gcmd));
+		if (decode_gcmd_srtp(&gcmd))
+			encode_gsts_rtps(&gsts, 1);
+		encode_gsts_wbfs(&gsts, decode_gcmd_wbf(&gcmd));
+		encode_gsts_tes(&gsts, decode_gcmd_te(&gcmd));
+		encode_gcmd_srtp(&gcmd, 0);	/* SRTP self-clears */
+		if (iommu_state.invalidation_queue && decode_gsts_qies(&gsts) == 0 &&
+			decode_gcmd_qie(&gcmd) == 1) {
+			encode_gsts_qies(&gsts, 1);
+		}
+		write_reg32(dmar_regs, DMAR_GSTS_REG, gsts);
+		write_reg32(dmar_regs, DMAR_GCMD_REG, gcmd);
+		break;
+		}
+	case DMAR_CCMD_REG:
+	case DMAR_CCMD_REG+4: {
+		/* Context-cache invalidation: this model keeps no cache, so
+		 * just ack the granularity and clear the ICC busy bit. */
+		uint64_t ccmd;
+		read_reg64(dmar_regs, DMAR_CCMD_REG, &ccmd);
+		encode_ccmd_caig(&ccmd, decode_ccmd_cirg(&ccmd));
+		encode_ccmd_icc(&ccmd, 0);
+		write_reg64(dmar_regs, DMAR_CCMD_REG, ccmd);
+		break;
+		}
+	case DMAR_PMEN_REG: {
+		/* Reflect the requested protected-memory enable back as the
+		 * current status (no PMR enforcement is implemented). */
+		uint64_t pmen;
+		read_reg32(dmar_regs, DMAR_PMEN_REG, &pmen);
+		encode_pmen_prs(&pmen, decode_pmen_epm(&pmen));
+		write_reg32(dmar_regs, DMAR_PMEN_REG, pmen);
+		break;
+		}
+	case DMAR_IOTLB_REG:
+	case DMAR_IOTLB_REG+4:
+	case DMAR_IVA_REG:
+	case DMAR_IVA_REG+4: {
+		/* IOTLB invalidation: ack immediately, clear the IVT busy
+		 * bit and the IVA address fields. */
+		uint64_t iotlb;
+		uint64_t iva;
+		read_reg64(dmar_regs, DMAR_IOTLB_REG, &iotlb);
+		read_reg64(dmar_regs, DMAR_IVA_REG, &iva);
+		if (decode_iotlb_ivt(&iotlb) == 1) {
+			encode_iva_addr(&iva, 0);
+			encode_iva_am(&iva, 0);
+			encode_iva_ih(&iva, 0);
+			encode_iotlb_iaig(&iotlb, decode_iotlb_iirg(&iotlb));
+			encode_iotlb_ivt(&iotlb, 0);
+			write_reg64(dmar_regs, DMAR_IOTLB_REG, iotlb);
+			write_reg64(dmar_regs, DMAR_IVA_REG, iva);
+		}
+		break;
+		}
+	case DMAR_IQT_REG:
+	case DMAR_IQT_REG+4: {
+		/* Guest published a new tail: drain the queue now. */
+		uint64_t gsts;
+		read_reg32(dmar_regs, DMAR_GSTS_REG, &gsts);
+		if (iommu_state.invalidation_queue && 
+			decode_gsts_qies(&gsts) == 1) 
+			intel_iommu_process_iq(dmar_status);
+		break;
+		}
+	default:
+		;
+	}
+	
+}
+
+
+/*
+Software is expected to access 32-bit registers as aligned doublewords. For example, to modify a
+field (e.g., bit or byte) in a 32-bit register, the entire doubleword is read, the appropriate field(s)
+are modified, and the entire doubleword is written back.
+*/
+
+/* 32-bit MMIO write handler: store the value into the selected DMAR unit's
+ * register file, then emulate the register's side effects.
+ * NOTE(review): read-only registers are not protected here — a guest write
+ * lands in CAP/ECAP etc. too (known limitation #3 of this patch). */
+static void
+intel_iommu_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+	struct iommu_state* d = opaque;
+	unsigned int offset = (addr & 0xFFF);
+	unsigned int dmar_idx = (addr & ~(DMAR_REGS_PAGE-1)) >> DMAR_REGS_PAGE_SHIFT;
+	uint8_t* dmar_regs;
+	
+	if (dmar_idx >= DMAR_DEVICES_NUM)
+		return;
+	
+	dmar_regs = d->dmar_status[dmar_idx].dmar_regs;
+	if (offset > DMAR_REGS_SIZE-4)
+		return;			/* outside the registers area - reserved */
+	*(uint32_t*)(((void*)dmar_regs)+offset ) = val;
+	intel_iommu_write_status_update(&(d->dmar_status[dmar_idx]), addr);
+}
+
+/* Sub-doubleword writes are widened to an aligned 32-bit write.  The model
+ * has no byte enables and does no read-modify-write, so the bytes of the
+ * doubleword that the guest did not write are stored as zero. */
+static void
+intel_iommu_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    // emulate hw without byte enables: no RMW
+    intel_iommu_writel(opaque, addr & ~3,
+                      (val & 0xffff) << (8*(addr & 3)));
+}
+
+static void
+intel_iommu_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    // emulate hw without byte enables: no RMW
+    intel_iommu_writel(opaque, addr & ~3,
+                      (val & 0xff) << (8*(addr & 3)));
+}
+
+
+/* 32-bit MMIO read handler: return the raw register value of the addressed
+ * DMAR unit (registers are kept in guest byte order, so no swap here). */
+static uint32_t
+intel_iommu_readl(void *opaque, target_phys_addr_t addr)
+{
+    struct iommu_state *d = opaque;
+    unsigned int offset = (addr & 0xfff);
+	unsigned int dmar_idx = (addr & ~(DMAR_REGS_PAGE-1)) >> DMAR_REGS_PAGE_SHIFT;
+	/* BUGFIX: bound-check the unit index as the write path already does;
+	 * without it an access past the last unit reads beyond the array. */
+	if (dmar_idx >= DMAR_DEVICES_NUM)
+		return 0;
+	if (offset > DMAR_REGS_SIZE-4) {
+		return 0; /* outside the boundary of the registers */
+	}
+	return *(uint32_t*)(d->dmar_status[dmar_idx].dmar_regs+offset);
+}
+
+/* 8-bit MMIO read: fetch the aligned doubleword and extract one byte. */
+static uint32_t
+intel_iommu_readb(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t dword = intel_iommu_readl(opaque, addr & ~3);
+    unsigned shift = 8 * (addr & 3);
+    return (dword >> shift) & 0xff;
+}
+
+/* 16-bit MMIO read: fetch the aligned doubleword and extract the halfword. */
+static uint32_t
+intel_iommu_readw(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t dword = intel_iommu_readl(opaque, addr & ~3);
+    unsigned shift = 8 * (addr & 3);
+    return (dword >> shift) & 0xffff;
+}
+
+/* savevm/loadvm handlers: serialize the whole iommu_state byte by byte.
+ * NOTE(review): this also saves mmio_index (a host-side handle) and depends
+ * on the host's struct layout — likely not migration-safe across builds;
+ * consider saving only the per-unit register files. */
+static
+void iommu_save(QEMUFile *f, void *opaque)
+{
+	const unsigned char *d = opaque;
+	int i;
+	for (i=0; i<sizeof(struct iommu_state); i++) {
+		qemu_put_8s(f, &d[i]);
+	}
+}
+
+static
+int iommu_load(QEMUFile *f, void *opaque, int version_id)
+{
+	uint8_t *d = opaque;
+	int i;
+	for (i=0; i<sizeof(struct iommu_state); i++) {
+		qemu_get_8s(f, &d[i]);
+	}
+	return 0;
+}
+
+
+/* System-reset handler: restore every DMAR unit's register file to its
+ * power-on state. */
+static 
+void iommu_reset(void *opaque)
+{
+	int i;
+	struct iommu_state *iommu_state = opaque;
+	for (i=0; i<DMAR_DEVICES_NUM; i++)
+	{
+		iommu_reset_regs(iommu_state->dmar_status[i].dmar_regs);
+	}
+
+}
+
+static CPUWriteMemoryFunc *intel_iommu_write[] = {
+    intel_iommu_writeb,	intel_iommu_writew, intel_iommu_writel
+};
+
+static CPUReadMemoryFunc *intel_iommu_read[] = {
+    intel_iommu_readb,	intel_iommu_readw, intel_iommu_readl
+};
+
+/* Register the MMIO handlers and map one register page per DMAR unit at
+ * the fixed base DMAR_REG_BASE (0xFED10000).
+ * NOTE(review): this base presumably must match the register base address
+ * advertised by the bundled hw/DMAR.dat ACPI table — confirm. */
+static
+void iommu_mmio_map(struct iommu_state* iommu_state) {
+    target_phys_addr_t addr = DMAR_REG_BASE;
+    ram_addr_t regs_size = DMAR_REGS_PAGE * DMAR_DEVICES_NUM;
+    iommu_state->mmio_index = cpu_register_io_memory(intel_iommu_read,
+          intel_iommu_write, iommu_state);
+    cpu_register_physical_memory(addr, regs_size, iommu_state->mmio_index);
+	
+}
+
+/* Parse the -intel-iommu command line argument: "on" enables translation,
+ * "queue" additionally enables queued invalidation.  When enabled, the
+ * bundled DMAR ACPI table is registered so the guest discovers the unit.
+ * Returns 0 on success, -1 on an unrecognized option. */
+int intel_iommu_configure(const char *opt) {
+    struct iommu_state *is = &iommu_state;
+    if (!opt)
+	return -1;
+    if (!strcmp(opt, "on"))
+	is->enabled = 1;
+    else if (!strcmp(opt, "queue")) {
+	is->enabled = 1;
+	is->invalidation_queue = 1; 
+    }
+    else {
+	return -1;
+    }
+
+    if (is->enabled)
+	acpi_table_add("sig=DMAR,data=hw/DMAR.dat");
+
+    return 0;
+}
+
+
+/* Translation related functions */
+/* Context-entry AW field -> number of page-table levels (AW 0 -> 2 levels). */
+static inline int aw_to_levels(int aw) {
+	return aw+2;
+}
+
+
+/* Bit position of the lowest address bit translated at 'level'. */
+static inline int level_shift(int level) {
+	return VTD_PAGE_SHIFT+(level*LEVEL_STRIDE);
+}
+
+/* Bytes covered by one table entry at 'level'. */
+static inline uint64_t level_size(int level) {
+	return (1ULL<<level_shift(level));
+}
+
+/* 9-bit table index selected by 'addr' at 'level'. */
+static inline uint64_t level_offset(uint64_t addr, int level) {
+	return (addr >> level_shift(level))&((1ULL<<LEVEL_STRIDE)-1);
+}
+
+/* Mask covering the address bits below 'level'. */
+static inline uint64_t level_mask(int level) {
+	return (((uint64_t)1)<<level_shift(level))-1;
+}
+
+/* Main translation function */
+/* Walk the VT-d structures (root entry -> context entry -> page tables)
+ * for the device 'qdev'.  Returns the translated address and intersects
+ * the R/W bits of every entry on the path into *access_perm; on a fault
+ * *access_perm is cleared and -1 is returned.  *size (optional) receives
+ * the bytes remaining in the translated region. */
+static
+target_phys_addr_t __iommu_phy_addr_translate(target_phys_addr_t addr,
+                            uint8_t* access_perm, DeviceState* qdev, uint64_t* size)
+{
+	uint64_t pte;
+	int level;
+	uint8_t* dmar_regs = (iommu_state.dmar_status[0].dmar_regs);
+	uint64_t temp, addr_offset, gsts, rtaddr, pte_addr;
+	PCIDevice* dev = DO_UPCAST(PCIDevice, qdev, qdev);
+	uint8_t devfn = get_devfn(dev->devfn);
+	/* NOTE(review): dev->devfn is only 8 bits, so the bus number decoded
+	 * here is always 0; the real bus number should come from the
+	 * device's bus. */
+	uint8_t busnr = get_busnr(dev->devfn);
+	uint64_t re[2], ce[2];
+	read_reg32(dmar_regs, DMAR_GSTS_REG, &gsts);
+	if (!size)
+		size = &temp;
+	*access_perm = IOMMU_READ | IOMMU_WRITE; 
+	/* Reading the root entry */
+	read_reg64(dmar_regs, DMAR_RTADDR_REG, &rtaddr);
+
+	/* NOTE(review): the root entry is fetched from guest memory even
+	 * when translation turns out to be disabled below — harmless, but
+	 * wasted work on every DMA access. */
+	cpu_physical_memory_rw(rtaddr+(busnr*sizeof(re)), (uint8_t *)re, sizeof(re), 0);
+	convert_le128_to_cpu(re);
+
+	if (!decode_gsts_tes(&gsts)/*dmar_regs->gsts.tes*/) {
+		return addr;				/* Translation is disabled */
+	}
+
+	if (!decode_gsts_rtps(&gsts)/*dmar_regs->gsts.rtps*/) {
+		*access_perm = 0;
+		return -1; 
+	}
+	if (!decode_re_p(re)) {
+		*access_perm = 0;
+		return -1;
+	}
+
+	/* Reading the relevant context entry */
+	cpu_physical_memory_rw(((decode_re_ctp(re)<<VTD_PAGE_SHIFT)+devfn*sizeof(ce)),
+			(uint8_t *)ce, sizeof(ce), 0);
+	convert_le128_to_cpu(ce);
+
+	if (!decode_ce_p(ce)) {
+		*access_perm = 0;
+		return -1;
+	}
+
+	if (decode_ce_t(ce) == TRANSLATE_RESERVED) {
+		*access_perm = 0;
+		return -1;
+	}
+	if (decode_ce_t(ce) == TRANSLATE_PASS_THROUGH) {
+		return addr;
+	}
+
+	/* Analyzing the levels number from the context */
+	level = aw_to_levels(decode_ce_aw(ce));	
+
+	pte_addr = decode_ce_asr(ce) << DMAR_REGS_PAGE_SHIFT;
+
+	/* Page-table walk: descend one level per iteration, ANDing the R/W
+	 * permissions of every entry; stops early on a super page. */
+	while (level>0 && *access_perm != 0) {
+		pte_addr += level_offset(addr, level-1)*sizeof(pte);
+		cpu_physical_memory_rw(pte_addr,
+				(uint8_t *)&pte, sizeof(pte), 0);
+		pte = le64_to_cpu(pte);
+		pte_addr = decode_pte_addr(&pte);
+		pte_addr <<= VTD_PAGE_SHIFT;
+		if (!decode_pte_r(&pte)) 
+			(*access_perm) &= (~IOMMU_READ);
+		if (!decode_pte_w(&pte)) 
+			(*access_perm) &= (~IOMMU_WRITE);
+		if (decode_pte_sp(&pte))
+			break;		/* Super page */
+		level--;
+	}
+	addr_offset = addr&level_mask(level);
+	*size = level_size(level) - addr_offset;
+	/* shift left also if super-page */
+	/* NOTE(review): on the super-page path (level > 0) pte_addr was
+	 * already shifted by VTD_PAGE_SHIFT above, so the extra shift by
+	 * LEVEL_STRIDE*level looks wrong — confirm against the VT-d spec. */
+	return (pte_addr<<(LEVEL_STRIDE*level))+(addr&level_mask(level));	
+}
+
+/* Adapter matching the exec.c translation-hook signature: run the full walk
+ * and report a missing R or W permission through *err.  *err is written
+ * only on failure — callers are expected to pre-initialize it. */
+static uint64_t intel_iommu_phy_addr_translate(target_phys_addr_t addr,
+                            int is_write, DeviceState* qdev, int* err)
+{
+	uint8_t access_perm;
+	uint16_t req_access_perm = is_write ? IOMMU_WRITE : IOMMU_READ;
+	uint64_t phy_addr;
+
+	phy_addr = __iommu_phy_addr_translate(addr, &access_perm, qdev, NULL);
+	if (err && (req_access_perm & access_perm)==0)
+		*err = -1;
+
+	return phy_addr;	
+}
+
+
+/* Return a DMAR unit's register file.
+ * NOTE(review): 'dmar_unit' is currently unused — only one unit exists. */
+static inline
+uint8_t* get_dmar_regs(struct dmar_status* dmar_status, 
+	uint16_t dmar_unit)
+{
+	return dmar_status->dmar_regs;
+}
+
+/* Complete an invalidation-wait descriptor.  Interrupt-on-completion (IF)
+ * is not implemented (limitation #1).  With Status-Write (SW) set, the
+ * 32-bit status data is stored to the status address; the descriptor field
+ * holds address bits 63:2, hence the << 2. */
+static
+void intel_iommu_iq_wait(struct dmar_status* dmar_status,
+	uint64_t desc[2])
+{
+	if (decode_iq_desc_iflag(desc)) {
+		pr_debug("interrupts are not supported");
+		return;
+	}
+	if (decode_iq_desc_sw((desc))) {
+		uint32_t data = decode_iq_desc_wait_stat_data(desc);
+		uint64_t addr = 
+			decode_iq_desc_wait_stat_addr(desc);
+		cpu_physical_memory_rw(addr << 2,
+			(uint8_t *)&data, sizeof(data), 1);
+	}
+}
+
+/* Dispatch one 128-bit invalidation-queue descriptor by its type id.
+ * Context and IOTLB invalidations need no action here: the emulation keeps
+ * no translation cache and re-walks the tables on every access.  Wait
+ * descriptors are completed via intel_iommu_iq_wait().
+ * Returns 0, or -1 for an unrecognized descriptor id. */
+static
+int iommu_process_iq_desc (uint64_t desc[2], 
+	struct dmar_status* dmar_status) 
+{
+	int r = 0;
+	uint32_t id = decode_iq_desc_id(desc);
+	switch (id)
+	{
+	case IQ_CONTEXT_INVD_DESC_ID: {
+		break;
+		}
+	case IQ_IOTLB_INVD_DESC_ID: {
+		break;	
+		}
+	case IQ_INVD_WAIT_DESC_ID: {
+		intel_iommu_iq_wait(dmar_status, desc);
+		break;
+		}
+	case IQ_DEV_IOTLB_INVD_DESC_ID: {
+		pr_debug("IQ_DEV_IOTLB_INVD_DESC_ID is not implemented");
+		break;
+		}
+	case IQ_INT_CACHE_INV_DESC_ID: {
+		pr_debug("IQ_INT_CACHE_INV_DESC_ID is not implemented");
+		break;
+		}
+	default:
+		r = -1;
+		pr_debug("invalid descriptor id %x", id);
+	}
+	return r;
+}
+
+/* Drain the invalidation queue: walk descriptors from the head register up
+ * to the tail published by the guest, process each, then store the new
+ * head back so the guest sees completion. */
+static 
+int intel_iommu_process_iq(struct dmar_status* dmar_status)
+{
+	uint8_t* dmar_regs = 
+		dmar_status->dmar_regs;
+	uint64_t iqa, iqt, iqh;
+	uint64_t qt, qh;
+	size_t len, desc_size;
+	target_phys_addr_t qaddr;
+	uint64_t desc[2];	/* BUGFIX: was the non-standard 'uint64' type */
+
+	read_reg64(dmar_regs, DMAR_IQA_REG, &iqa);
+	read_reg64(dmar_regs, DMAR_IQT_REG, &iqt);
+	read_reg64(dmar_regs, DMAR_IQH_REG, &iqh);
+	len = 1 << (8 + decode_iqa_qs(&iqa));	/* queue length, descriptors */
+	desc_size = sizeof(desc);
+	qaddr = decode_iqa_iqa(&iqa) << VTD_PAGE_SHIFT;
+	qh = decode_iqh_qh(&iqh);
+	qt = decode_iqt_qt(&iqt);	
+
+	/* BUGFIX: the old 'map_len < len * desc_size' check compared a value
+	 * against itself and could never fire; it has been removed. */
+
+	while (qh != qt) {
+		cpu_physical_memory_rw(qaddr + qh*desc_size, 
+			(uint8_t *)&desc, sizeof(desc), 0);
+		convert_le128_to_cpu(desc);
+		iommu_process_iq_desc(desc, dmar_status);	
+		qh = (qh+1)%len;
+	}
+	encode_iqh_qh(&iqh, qh);
+	write_reg64(dmar_regs, DMAR_IQH_REG, iqh);
+	return 0;
+}
+
+/* Hook the IOMMU into the machine: install the DMA translation callback,
+ * map the MMIO register pages, and register savevm/reset handlers.
+ * No-op unless -intel-iommu was given on the command line. */
+void intel_iommu_init(void) {
+    struct iommu_state *is = &iommu_state;
+    static const char info_str[] = "intel-iommu";
+    if (!is->enabled)
+	return;
+    
+    set_iommu_translation_fn(intel_iommu_phy_addr_translate);
+    iommu_mmio_map(is);
+    register_savevm(info_str, -1, 1, iommu_save, iommu_load, is);
+    qemu_register_reset(iommu_reset, is);
+    iommu_reset(is);
+}
+
diff --git a/hw/intel_iommu.h b/hw/intel_iommu.h
new file mode 100644
index 0000000..e8235c7
--- /dev/null
+++ b/hw/intel_iommu.h
@@ -0,0 +1,18 @@ 
+/* BUGFIX: the previous guard '_INTEL_IOMMU_H_' (leading underscore +
+ * uppercase) is a reserved identifier in C; also dropped the commented-out
+ * dead prototype. */
+#ifndef INTEL_IOMMU_H
+#define INTEL_IOMMU_H
+
+#include "hw.h"
+
+struct QemuOpts;
+
+/* Hook the IOMMU emulation into the machine (no-op unless enabled). */
+void intel_iommu_init(void);
+
+/* Parse the -intel-iommu option ("on" or "queue"); returns 0 or -1. */
+int intel_iommu_configure(const char* opt);
+
+#endif
diff --git a/hw/pc.c b/hw/pc.c
index 0aebae9..2652ab5 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -46,6 +46,7 @@ 
 #include "elf.h"
 #include "multiboot.h"
 #include "device-assignment.h"
+#include "intel_iommu.h"
 
 #include "kvm.h"
 
@@ -1028,6 +1029,7 @@  static void pc_init1(ram_addr_t ram_size,
 
     isa_dev = isa_create_simple("i8042");
     DMA_init(0);
+    intel_iommu_init();
 #ifdef HAS_AUDIO
     audio_init(pci_enabled ? pci_bus : NULL, isa_irq);
 #endif
diff --git a/vl.c b/vl.c
index d959fdb..613a58e 100644
--- a/vl.c
+++ b/vl.c
@@ -136,6 +136,7 @@  int main(int argc, char **argv)
 #include "hw/xen.h"
 #include "hw/qdev.h"
 #include "hw/loader.h"
+#include "hw/intel_iommu.h"
 #include "bt-host.h"
 #include "net.h"
 #include "net/slirp.h"
@@ -5750,6 +5751,13 @@  int main(int argc, char **argv, char **envp)
                     fclose(fp);
                     break;
                 }
+	    case QEMU_OPTION_intel_iommu:
+		{
+		    if (intel_iommu_configure(optarg) < 0) {
+			fprintf(stderr, "Wrong intel_iommu configuration\n");
+			exit(1);
+		    }
+		}
             }
         }
     }