Patchwork Basic Intel IOMMU DMAR emulation

login
register
mail settings
Submitter Nadav Amit
Date April 8, 2010, 12:58 a.m.
Message ID <8DC5AC30-AF5C-461F-BA46-69F6C583C9EA@gmail.com>
Download mbox | patch
Permalink /patch/49680/
State New
Headers show

Comments

Nadav Amit - April 8, 2010, 12:58 a.m.
This patch enables basic Intel IOMMU (VT-d) emulation for DMA remapping. Register-based invalidation is supported, as well as partial queued invalidation. In addition, the structure allows other IOMMU architectures to easily connect to the IOMMU indirection emulation.

In general, the patch on the one hand emulates the expected behavior of the registers, and on the other hand performs the translation of DMA accesses according to the paging structures.

Currently only the IDE device is supported (SCSI and E1000 should follow).

The emulation is done using Intel's proposed caching mode. The guest OS must use the caching mode correctly. Unfortunately, this is currently not the case with Linux. Patches were sent to the Intel IOMMU owner in order to fix the pending issues.

The current patch has the following (additional) limitations:
1. DMAR tables must be loaded from the command line ( -acpitable sig=DMAR,data=hw/DMAR.dat )
2. Queued invalidation does not support interrupts on wait descriptors
3. DMAR faults are not supported
4. No protection of read-only registers in the IOMMU.

To enable register-based invalidation, pass the following on the command line:
-intel-iommu on -acpitable sig=DMAR,data=hw/DMAR.dat

To enable queued invalidation, pass the following on the command line:
-intel-iommu queue -acpitable sig=DMAR,data=hw/DMAR.dat

Signed-off-by: Nadav Amit <nadav.amit at gmail.com>
---
 Makefile.target  |    1 +
 cpu-common.h     |   22 ++
 dma-helpers.c    |   20 +-
 dma.h            |    6 +-
 exec.c           |  100 ++++++-
 hw/DMAR.dat      |  Bin 0 -> 28 bytes
 hw/ide/core.c    |   31 ++-
 hw/intel_iommu.c |  888 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/intel_iommu.h |   18 ++
 hw/pc.c          |    2 +
 vl.c             |    8 +
 11 files changed, 1063 insertions(+), 33 deletions(-)
 create mode 100644 hw/DMAR.dat
 create mode 100644 hw/intel_iommu.c
 create mode 100644 hw/intel_iommu.h
Paul Brook - April 8, 2010, 11:43 a.m.
>1. DMAR tables must be loaded by command line ( -acpitable
> sig=DMAR,data=hw/DMAR.dat )

Why?

> +void cpu_physical_memory_rw_io(target_phys_addr_t addr, uint8_t *buf,
> +                              int len, int is_write, uint16_t devfn,
> +                              int* err); 

This appears to be PCI specific, which is wrong.
You should be using the qdev infrastructure.

>+/* Mapping related structures */
>+struct IommuRootEntry {
>+       uint32_t p              : 1 ;
>+       uint32_t res1           : 11 ;
>+       uint64_t ctp            : 52 ;
>+       uint64_t res2           : 64 ;
>+};

This is almost certainly wrong on some hosts. You can not assume anything 
about bitfield layout. Do not use these if an exact binary representation is 
required.

Paul

Patch

diff --git a/Makefile.target b/Makefile.target
index 4d88543..79c9ff9 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -233,6 +233,7 @@  obj-i386-y += testdev.o
 
 obj-i386-$(CONFIG_KVM_PIT) += i8254-kvm.o
 obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += device-assignment.o
+obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += intel_iommu.o
 
 # Hardware support
 obj-ia64-y += ide.o pckbd.o vga.o $(SOUND_HW) dma.o $(AUDIODRV)
diff --git a/cpu-common.h b/cpu-common.h
index 6cae15b..007a82d 100644
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -47,6 +47,9 @@  void cpu_unregister_io_memory(int table_address);
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                             int len, int is_write);
+void cpu_physical_memory_rw_io(target_phys_addr_t addr, uint8_t *buf,
+			       int len, int is_write, uint16_t devfn,
+			       int* err); 
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,
                                             uint8_t *buf, int len)
 {
@@ -57,9 +60,28 @@  static inline void cpu_physical_memory_write(target_phys_addr_t addr,
 {
     cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1);
 }
+static inline void cpu_physical_memory_read_io(target_phys_addr_t addr,
+                                            uint8_t *buf, int len, 
+					    uint16_t devfn, int* err)
+{
+    cpu_physical_memory_rw_io(addr, buf, len, 0, devfn, err);
+}
+static inline void cpu_physical_memory_write_io(target_phys_addr_t addr,
+                                            uint8_t *buf, int len,
+					    uint16_t devfn, int* err)
+{
+    cpu_physical_memory_rw_io(addr, buf, len, 1, devfn, err);
+}
+
+void set_iommu_translation_fn(target_phys_addr_t(*fn)(target_phys_addr_t, 
+    int, int, int*));
+
 void *cpu_physical_memory_map(target_phys_addr_t addr,
                               target_phys_addr_t *plen,
                               int is_write);
+void *cpu_physical_memory_map_io(target_phys_addr_t addr,
+                              target_phys_addr_t *plen,
+                              int is_write, uint16_t devfn, int* err);
 void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
                                int is_write, target_phys_addr_t access_len);
 void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque));
diff --git a/dma-helpers.c b/dma-helpers.c
index d4fc077..9ead209 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -47,6 +47,7 @@  typedef struct {
     target_phys_addr_t sg_cur_byte;
     QEMUIOVector iov;
     QEMUBH *bh;
+    uint16_t busdevfn;
 } DMAAIOCB;
 
 static void dma_bdrv_cb(void *opaque, int ret);
@@ -84,6 +85,7 @@  static void dma_bdrv_cb(void *opaque, int ret)
     DMAAIOCB *dbs = (DMAAIOCB *)opaque;
     target_phys_addr_t cur_addr, cur_len;
     void *mem;
+    int err;
 
     dbs->acb = NULL;
     dbs->sector_num += dbs->iov.size / 512;
@@ -100,8 +102,9 @@  static void dma_bdrv_cb(void *opaque, int ret)
     while (dbs->sg_cur_index < dbs->sg->nsg) {
         cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
         cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
-        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
-        if (!mem)
+        mem = cpu_physical_memory_map_io(cur_addr, &cur_len, !dbs->is_write,
+	    dbs->busdevfn, &err);
+        if (!mem || err)
             break;
         qemu_iovec_add(&dbs->iov, mem, cur_len);
         dbs->sg_cur_byte += cur_len;
@@ -147,7 +150,7 @@  static AIOPool dma_aio_pool = {
 static BlockDriverAIOCB *dma_bdrv_io(
     BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num,
     BlockDriverCompletionFunc *cb, void *opaque,
-    int is_write)
+    int is_write, int busdevfn)
 {
     DMAAIOCB *dbs =  qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
 
@@ -159,6 +162,7 @@  static BlockDriverAIOCB *dma_bdrv_io(
     dbs->sg_cur_byte = 0;
     dbs->is_write = is_write;
     dbs->bh = NULL;
+    dbs->busdevfn = busdevfn;
     qemu_iovec_init(&dbs->iov, sg->nsg);
     /*
      * DMA flushing is handled in dma_bdrv_cb() calling dma_bdrv_unmap()
@@ -175,14 +179,16 @@  static BlockDriverAIOCB *dma_bdrv_io(
 
 BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
-                                void (*cb)(void *opaque, int ret), void *opaque)
+                                void (*cb)(void *opaque, int ret), void *opaque,
+				uint16_t busdevfn)
 {
-    return dma_bdrv_io(bs, sg, sector, cb, opaque, 0);
+    return dma_bdrv_io(bs, sg, sector, cb, opaque, 0, busdevfn);
 }
 
 BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
                                  QEMUSGList *sg, uint64_t sector,
-                                 void (*cb)(void *opaque, int ret), void *opaque)
+                                 void (*cb)(void *opaque, int ret), void *opaque,
+				 uint16_t busdevfn)
 {
-    return dma_bdrv_io(bs, sg, sector, cb, opaque, 1);
+    return dma_bdrv_io(bs, sg, sector, cb, opaque, 1, busdevfn);
 }
diff --git a/dma.h b/dma.h
index f3bb275..12f8c4d 100644
--- a/dma.h
+++ b/dma.h
@@ -34,8 +34,10 @@  void qemu_sglist_destroy(QEMUSGList *qsg);
 
 BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
-                                BlockDriverCompletionFunc *cb, void *opaque);
+                                BlockDriverCompletionFunc *cb, void *opaque,
+				uint16_t busdevfn);
 BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
                                  QEMUSGList *sg, uint64_t sector,
-                                 BlockDriverCompletionFunc *cb, void *opaque);
+                                 BlockDriverCompletionFunc *cb, void *opaque,
+				 uint16_t busdevfn);
 #endif
diff --git a/exec.c b/exec.c
index de2fbea..6527307 100644
--- a/exec.c
+++ b/exec.c
@@ -186,6 +186,24 @@  unsigned long qemu_real_host_page_size;
 unsigned long qemu_host_page_bits;
 unsigned long qemu_host_page_size;
 unsigned long qemu_host_page_mask;
+target_phys_addr_t (*iommu_phy_addr_translate)(target_phys_addr_t addr,
+	int is_write, int devfn, int* err);
+
+static target_phys_addr_t no_iommu_translate(target_phys_addr_t addr, 
+    int is_write, int devfn, int* err)
+{
+    return addr;
+}
+
+static void init_iommu_translation(void)
+{
+    iommu_phy_addr_translate = no_iommu_translate;
+}
+
+void set_iommu_translation_fn(target_phys_addr_t(*fn)(target_phys_addr_t, int, int, int*))
+{
+    iommu_phy_addr_translate = fn;
+}
 
 /* This is a multi-level map on the virtual address space.
    The bottom level has pointers to PageDesc.  */
@@ -551,6 +569,7 @@  static void code_gen_alloc(unsigned long tb_size)
    size. */
 void cpu_exec_init_all(unsigned long tb_size)
 {
+    init_iommu_translation();
     cpu_gen_init();
     code_gen_alloc(tb_size);
     code_gen_ptr = code_gen_buffer;
@@ -3371,8 +3390,8 @@  int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
 }
 
 #else
-void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-                            int len, int is_write)
+static void __cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
+                            int len, int is_write, int devfn, int* err)
 {
     int l, io_index;
     uint8_t *ptr;
@@ -3380,10 +3399,26 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
     target_phys_addr_t page;
     unsigned long pd;
     PhysPageDesc *p;
+    target_phys_addr_t addr0;
+    int single_err;
+    int temp;
+    if (err == NULL)
+        err = &temp;
+    *err = 0;
+
 
     while (len > 0) {
-        page = addr & TARGET_PAGE_MASK;
-        l = (page + TARGET_PAGE_SIZE) - addr;
+	single_err = 0;
+	addr0 = addr;
+	if (devfn != -1) {
+	    *err = 0;
+	    addr0 = iommu_phy_addr_translate(addr, is_write, devfn, &single_err);
+            *err |= single_err;
+            if (single_err)
+        	continue;
+        }
+        page = addr0 & TARGET_PAGE_MASK;
+        l = (page + TARGET_PAGE_SIZE) - addr0;
         if (l > len)
             l = len;
         p = phys_page_find(page >> TARGET_PAGE_BITS);
@@ -3395,10 +3430,10 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
 
         if (is_write) {
             if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
-                target_phys_addr_t addr1 = addr;
+                target_phys_addr_t addr1 = addr0;
                 io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
                 if (p)
-                    addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset;
+                    addr1 = (addr0 & ~TARGET_PAGE_MASK) + p->region_offset;
                 /* XXX: could force cpu_single_env to NULL to avoid
                    potential bugs */
                 if (l >= 4 && ((addr1 & 3) == 0)) {
@@ -3439,7 +3474,7 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
         } else {
             if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
                 !(pd & IO_MEM_ROMD)) {
-                target_phys_addr_t addr1 = addr;
+                target_phys_addr_t addr1 = addr0;
                 /* I/O case */
                 io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
                 if (p)
@@ -3473,6 +3508,19 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
     }
 }
 
+void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
+                            int len, int is_write)
+{
+    __cpu_physical_memory_rw(addr, buf, len, is_write, -1, NULL);
+}
+
+void cpu_physical_memory_rw_io(target_phys_addr_t addr, uint8_t *buf,
+                            int len, int is_write, uint16_t devfn, 
+			    int* err)
+{
+    __cpu_physical_memory_rw(addr, buf, len, is_write, devfn, err);
+}
+
 /* used for ROM loading : can write in RAM and ROM */
 void cpu_physical_memory_write_rom(target_phys_addr_t addr,
                                    const uint8_t *buf, int len)
@@ -3565,9 +3613,9 @@  static void cpu_notify_map_clients(void)
  * Use cpu_register_map_client() to know when retrying the map operation is
  * likely to succeed.
  */
-void *cpu_physical_memory_map(target_phys_addr_t addr,
+static void *__cpu_physical_memory_map(target_phys_addr_t addr,
                               target_phys_addr_t *plen,
-                              int is_write)
+                              int is_write, uint16_t devfn, int* err)
 {
     target_phys_addr_t len = *plen;
     target_phys_addr_t done = 0;
@@ -3578,10 +3626,18 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
     unsigned long pd;
     PhysPageDesc *p;
     unsigned long addr1;
+    target_phys_addr_t addr0 = addr;
+    int temp;
+
+    if (err == NULL)
+	err = &temp;
 
     while (len > 0) {
-        page = addr & TARGET_PAGE_MASK;
-        l = (page + TARGET_PAGE_SIZE) - addr;
+	addr0 = addr;
+	if (devfn != -1)
+	     addr0 = iommu_phy_addr_translate(addr, is_write, devfn, err);
+        page = addr0 & TARGET_PAGE_MASK;
+        l = (page + TARGET_PAGE_SIZE) - addr0;
         if (l > len)
             l = len;
         p = phys_page_find(page >> TARGET_PAGE_BITS);
@@ -3596,14 +3652,14 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
                 break;
             }
             bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);
-            bounce.addr = addr;
+            bounce.addr = addr0;
             bounce.len = l;
             if (!is_write) {
-                cpu_physical_memory_rw(addr, bounce.buffer, l, 0);
+                cpu_physical_memory_rw(addr0, bounce.buffer, l, 0);
             }
             ptr = bounce.buffer;
         } else {
-            addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
+            addr1 = (pd & TARGET_PAGE_MASK) + (addr0 & ~TARGET_PAGE_MASK);
             ptr = qemu_get_ram_ptr(addr1);
         }
         if (!done) {
@@ -3620,6 +3676,22 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
     return ret;
 }
 
+void *cpu_physical_memory_map_io(target_phys_addr_t addr,
+                              target_phys_addr_t *plen,
+                              int is_write, uint16_t devfn, int* err)
+{
+     return __cpu_physical_memory_map(addr,
+            plen, is_write, devfn, err);
+}
+
+
+void *cpu_physical_memory_map(target_phys_addr_t addr,
+                              target_phys_addr_t *plen,
+                              int is_write)
+{
+    return __cpu_physical_memory_map(addr, plen, is_write, -1, NULL);
+}
+
 /* Unmaps a memory region previously mapped by cpu_physical_memory_map().
  * Will also mark the memory as dirty if is_write == 1.  access_len gives
  * the amount of memory that was actually read or written by the caller.
diff --git a/hw/DMAR.dat b/hw/DMAR.dat
new file mode 100644
index 0000000000000000000000000000000000000000..db5010ce79ab960b3cbd69a12c67863246d26538
GIT binary patch
literal 28
TcmY#pKn4N~j9~7?e;^6~4!8mY

literal 0
HcmV?d00001

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 67480bb..807c035 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -34,6 +34,7 @@ 
 #include "dma.h"
 
 #include <hw/ide/internal.h>
+#include <hw/ide/pci.h>
 
 static int smart_attributes[][5] = {
     /* id,  flags, val, wrst, thrsh */
@@ -432,7 +433,7 @@  static int dma_buf_prepare(BMDMAState *bm, int is_write)
         uint32_t addr;
         uint32_t size;
     } prd;
-    int l, len;
+    int l, len, err;
 
     qemu_sglist_init(&s->sg, s->nsector / (TARGET_PAGE_SIZE/512) + 1);
     s->io_buffer_size = 0;
@@ -442,7 +443,10 @@  static int dma_buf_prepare(BMDMAState *bm, int is_write)
             if (bm->cur_prd_last ||
                 (bm->cur_addr - bm->addr) >= 4096)
                 return s->io_buffer_size != 0;
-            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+            cpu_physical_memory_read_io(bm->cur_addr, (uint8_t *)&prd, 8,
+		bm->pci_dev->dev.devfn, &err);
+	    if (err)
+		return 0;
             bm->cur_addr += 8;
             prd.addr = le32_to_cpu(prd.addr);
             prd.size = le32_to_cpu(prd.size);
@@ -514,7 +518,7 @@  static int dma_buf_rw(BMDMAState *bm, int is_write)
         uint32_t addr;
         uint32_t size;
     } prd;
-    int l, len;
+    int l, len, err;
 
     for(;;) {
         l = s->io_buffer_size - s->io_buffer_index;
@@ -525,7 +529,10 @@  static int dma_buf_rw(BMDMAState *bm, int is_write)
             if (bm->cur_prd_last ||
                 (bm->cur_addr - bm->addr) >= 4096)
                 return 0;
-            cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+            cpu_physical_memory_read_io(bm->cur_addr, (uint8_t *)&prd, 8,
+		bm->pci_dev->dev.devfn, &err);
+	    if (err)
+		return 0;
             bm->cur_addr += 8;
             prd.addr = le32_to_cpu(prd.addr);
             prd.size = le32_to_cpu(prd.size);
@@ -540,11 +547,13 @@  static int dma_buf_rw(BMDMAState *bm, int is_write)
             l = bm->cur_prd_len;
         if (l > 0) {
             if (is_write) {
-                cpu_physical_memory_write(bm->cur_prd_addr,
-                                          s->io_buffer + s->io_buffer_index, l);
+                cpu_physical_memory_write_io(bm->cur_prd_addr,
+                                          s->io_buffer + s->io_buffer_index, l,
+					  bm->pci_dev->dev.devfn, &err);
             } else {
-                cpu_physical_memory_read(bm->cur_prd_addr,
-                                          s->io_buffer + s->io_buffer_index, l);
+                cpu_physical_memory_read_io(bm->cur_prd_addr,
+                                          s->io_buffer + s->io_buffer_index, l,
+					  bm->pci_dev->dev.devfn, &err);
             }
             bm->cur_prd_addr += l;
             bm->cur_prd_len -= l;
@@ -600,7 +609,8 @@  static void ide_read_dma_cb(void *opaque, int ret)
 #ifdef DEBUG_AIO
     printf("aio_read: sector_num=%" PRId64 " n=%d\n", sector_num, n);
 #endif
-    bm->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_read_dma_cb, bm);
+    bm->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_read_dma_cb, bm,
+			      bm->pci_dev->dev.devfn);
     ide_dma_submit_check(s, ide_read_dma_cb, bm);
 }
 
@@ -746,7 +756,8 @@  static void ide_write_dma_cb(void *opaque, int ret)
 #ifdef DEBUG_AIO
     printf("aio_write: sector_num=%" PRId64 " n=%d\n", sector_num, n);
 #endif
-    bm->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num, ide_write_dma_cb, bm);
+    bm->aiocb = dma_bdrv_write(s->bs, &s->sg, sector_num, ide_write_dma_cb, bm,
+			       bm->pci_dev->dev.devfn);
     ide_dma_submit_check(s, ide_write_dma_cb, bm);
 }
 
diff --git a/hw/intel_iommu.c b/hw/intel_iommu.c
new file mode 100644
index 0000000..4b958b2
--- /dev/null
+++ b/hw/intel_iommu.c
@@ -0,0 +1,888 @@ 
+#include <stddef.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "intel_iommu.h"
+/*#include "device-assignment.h"
+*/
+#include "../qemu-option.h" 
+/*#include "../qemu-kvm.h"
+#include "../qemu-queue.h"
+*/
+#define DMAR_DEVICES_NUM 1
+#define DMAR_REG_BASE 0xFED10000
+
+#define CIRG_GLOBAL_INVALIDATION 		1
+#define CIRG_DOMAIN_SELECTIVE_INVALIDATION	2
+#define CIRG_DEVICE_SELECTIVE_INVALIDATION	3
+
+#define IIRG_GLOBAL_INVALIDATION		1
+#define IIRG_DOMAIN_SELECTIVE_INVALIDATION	2
+#define IIRG_DOMAIN_PAGE_SELECTIVE_INVALIDATION	3
+
+#define pr_debug(fmt, ...) 						\
+	do {								\
+        	fprintf( stderr, "%s : " fmt "\n",			\
+			__func__, ##__VA_ARGS__);			\
+		fflush (stderr); 					\
+	} while (0)
+
+
+#define print_err( msg, pr_err, err ) 					\
+	do {								\
+		*(err) = 1; 						\
+			if (pr_err) { 					\
+			fprintf(stderr, "intel_iommu.c: %s \n", (msg)); \
+			fflush(stderr); 				\
+		} 							\
+	} while (0)
+	
+#define mb() 	asm volatile("mfence":::"memory")
+
+enum { MGAW = 48};
+enum { FRCD_REG_NUM = 4 };
+enum { DMAR_REGS_PAGE = 0x1000, DMAR_REGS_PAGE_SHIFT = 12, LEVEL_STRIDE=9, VTD_PAGE_SHIFT=12} ;
+enum { DEVFN_ENTRIES_NUM = 256 };
+typedef enum { IOMMU_WRITE = 2, IOMMU_READ = 1/*, IOMMU_ANY*/ } IommuAccessType;
+
+#define VTD_PAGE_SIZE (1<<VTD_PAGE_SHIFT)
+
+/* Mapping related structures */
+struct IommuRootEntry {
+	uint32_t p 		: 1 ;
+	uint32_t res1 		: 11 ;
+	uint64_t ctp 		: 52 ;
+	uint64_t res2		: 64 ;
+};
+
+struct IommuContextEntry {
+	uint32_t p		: 1 ;
+	uint32_t fpd		: 1 ;
+	uint32_t t		: 2 ;
+	uint32_t eh		: 1 ;
+	uint32_t alh 		: 1 ;
+	uint32_t res1		: 6 ;
+	uint64_t asr		: 52 ;
+	uint32_t aw		: 3 ;
+	uint32_t avail		: 4 ;
+	uint32_t res2		: 1;
+	uint32_t did		: 16;
+	uint64_t res3		: 40;
+};
+
+struct bus_root {
+	uint16_t busnr;
+	struct IommuRootEntry root;
+	QLIST_ENTRY(bus_root) node;
+};
+
+struct devfn_context {
+	uint8_t busnr;
+	uint8_t devfn;
+	uint8_t h_busnr;
+	uint8_t h_devfn;
+	struct IommuContextEntry context;
+	QLIST_ENTRY(devfn_context) node;
+};
+
+
+enum { TRASNLATE_UNTRANSLATED = 0, TRANSLATE_ALL = 1, TRANSLATE_PASS_THROUGH = 2, TRANSLATE_RESERVED = 3};
+
+struct IommuPageTableEntry {
+	uint32_t r		: 1 ;
+	uint32_t w		: 1 ;
+	uint32_t avail1		: 5 ;
+	uint32_t sp		: 1 ;
+	uint32_t avail2		: 3 ;
+	uint32_t snp		: 1 ;
+	uint64_t addr		: 40 ;
+	uint32_t avail3		: 10 ;
+	uint32_t tm		: 1 ;
+	uint32_t avail4 	: 1 ;
+};
+
+
+/* IOMMU Registers */
+struct gcmd_reg
+{
+	uint32_t res		: 23;
+	uint32_t cfi		: 1;
+	uint32_t sirtp		: 1;
+	uint32_t ire		: 1;
+	uint32_t qie		: 1;
+	uint32_t wbf		: 1;
+	uint32_t eafl		: 1;
+	uint32_t sfl		: 1;
+	uint32_t srtp		: 1;
+	uint32_t te		: 1;
+}	__attribute__((__packed__));
+	
+struct gsts_reg
+{
+	uint32_t res		: 23;
+	uint32_t cfis		: 1;
+	uint32_t irtps		: 1;
+	uint32_t ires		: 1;
+	uint32_t qies		: 1;
+	uint32_t wbfs		: 1;
+	uint32_t afls		: 1;
+	uint32_t fls		: 1;
+	uint32_t rtps		: 1;
+	uint32_t tes		: 1;
+}	__attribute__((__packed__));
+
+struct ecap_reg 
+{
+	uint32_t c		: 1;
+	uint32_t qi		: 1;
+	uint32_t di		: 1;
+	uint32_t ir		: 1;
+	uint32_t eim		: 1;
+	uint32_t ch		: 1;
+	uint32_t pt		: 1;
+	uint32_t sc		: 1;
+	uint32_t iro		: 10;
+	uint32_t res1		: 2;
+	uint32_t mhmv		: 4;
+	uint64_t res2		: 40;	
+}	__attribute__((__packed__));
+
+struct ver_reg
+{
+	uint32_t min		: 4;
+	uint32_t max		: 4;
+	uint32_t res		: 24;
+}	__attribute__((__packed__));
+
+struct cap_reg
+{
+	uint32_t nd		: 3;
+	uint32_t afl		: 1;
+	uint32_t rwbf		: 1;
+	uint32_t plmr		: 1;
+	uint32_t phmr		: 1;
+	uint32_t cm		: 1;
+	uint32_t sagaw		: 5;
+	uint32_t res1		: 3;
+	uint32_t mgaw		: 6;
+	uint32_t zlr		: 1;
+	uint32_t isoch		: 1;
+	uint32_t fro		: 10;
+	uint32_t sps		: 4;
+	uint32_t res2		: 1;
+	uint32_t psi		: 1;
+	uint32_t nfr		: 8;
+	uint32_t mamv		: 6;
+	uint32_t dwd		: 1;
+	uint32_t drd		: 1;
+	uint32_t res3		: 8;
+}	__attribute__((__packed__));
+
+enum {SPS_21_BIT = 1, SPS_30_BIT = 2, SPS_39_BIT = 4, SPS_48_BIT = 8}; /* sagaw values */
+enum {AGAW_30_BIT = 1, AGAW_39_BIT = 2, AGAW_48_BIT = 4, AGAW_57_BIT = 8, AGAW_64_BIT = 16}; /* agaw values */
+
+
+struct ccmd_reg {
+	uint32_t did		: 16;
+	uint32_t sid		: 16;
+	uint32_t fm		: 2;
+	uint32_t res		: 25;
+	uint32_t caig		: 2;
+	uint32_t cirg		: 2;
+	uint32_t icc		: 1;
+}	__attribute__((__packed__));
+	
+struct fsts_reg {
+	uint32_t pfo		: 1;
+	uint32_t ppf		: 1;
+	uint32_t afo		: 1;
+	uint32_t apf		: 1;
+	uint32_t iqe		: 1;
+	uint32_t ice		: 1;
+	uint32_t ite		: 1;
+	uint32_t res1		: 1;
+	uint32_t fri		: 8;
+	uint32_t res2		: 16;
+}	__attribute__((__packed__));
+	
+
+struct fectl_reg {
+	uint32_t res		: 30;
+	uint32_t ip		: 1;
+	uint32_t im		: 1;
+}	__attribute__((__packed__));
+
+struct fedata_reg {
+	uint32_t imd		: 16;
+	uint32_t eimd		: 16;
+}	__attribute__((__packed__));
+
+struct feaddr_reg {
+	uint32_t res		: 2;
+	uint32_t ma		: 30;
+}	__attribute__((__packed__)); 
+
+struct aflog_reg {
+	uint32_t res		: 9;
+	uint32_t fls		: 3;
+	uint64_t fla		: 52;
+}	__attribute__((__packed__)); 
+
+struct pmen_reg {
+	uint32_t prs		: 1;
+	uint32_t res		: 30;
+	uint32_t epm		: 1;
+}	__attribute__((__packed__)); 
+
+
+struct iqh_reg {
+	uint32_t res1		: 4;
+	uint32_t qh		: 15;
+	uint64_t res2		: 45;
+}	__attribute__((__packed__)); 
+
+struct iqt_reg {
+	uint32_t res1		: 4;
+	uint32_t qt		: 15;
+	uint64_t res2		: 45;
+}	__attribute__((__packed__)); 
+
+struct iqa_reg {
+	uint32_t qs		: 3;
+	uint32_t res		: 9;
+	uint64_t iqa		: 52;
+}	__attribute__((__packed__)); 
+
+struct ics_reg {
+	uint32_t iwc		: 1;
+	uint32_t res		: 31;
+}	__attribute__((__packed__)); 
+
+struct iectl_reg {
+	uint32_t res		: 30;
+	uint32_t ip		: 1;
+	uint32_t im		: 1;
+}	__attribute__((__packed__)); 
+
+struct iedata_reg {
+	uint32_t imd 		: 16;
+	uint32_t eimd		: 16;
+}	__attribute__((__packed__)); 
+
+struct ieaddr_reg {
+	uint32_t res 		: 2;
+	uint32_t ma		: 30;
+}	__attribute__((__packed__)); 
+
+/* IOTLB Registers */
+struct iotlb_reg {
+	uint32_t res1 		: 32;
+	uint32_t did 		: 16;
+	uint32_t dw		: 1;
+	uint32_t dr		: 1;
+	uint32_t res2		: 7;
+	uint32_t iaig		: 2;
+	uint32_t res3		: 1;
+	uint32_t iirg		: 2;
+	uint32_t res4		: 1;
+	uint32_t ivt		: 1;
+}	__attribute__((__packed__)); 
+
+struct iva_reg {
+	uint32_t am		: 6;
+	uint32_t ih		: 1;
+	uint32_t res		: 5;
+	uint64_t addr		: 52;
+}	__attribute__((__packed__));
+
+/* Faults reporting register */
+struct frcd_reg {
+	uint32_t res1		: 12;
+	uint64_t fi		: 52;
+	uint32_t sid		: 16;
+	uint32_t res2		: 16;
+	uint32_t fr		: 8;
+	uint32_t res3		: 20;
+	uint32_t at		: 2;
+	uint32_t t		: 1;
+	uint32_t f		: 1;
+}	__attribute__((__packed__));
+	
+struct dmar_regs_region {
+	struct ver_reg ver;	/* Arch version supported by this IOMMU */
+	uint32_t res1;		/* Reserved */
+	struct cap_reg cap;	/* Hardware supported capabilities */
+	struct ecap_reg ecap;	/* Extended capabilities supported */
+	struct gcmd_reg gcmd;	/* Global command register */
+	struct gsts_reg gsts;	/* Global status register */
+	uint64_t readdr;	/* Root entry table */
+	struct ccmd_reg ccmd;	/* Context command reg */
+	uint32_t res2;		/* Reserved */
+	uint32_t fsts;		/* Fault Status register */
+	uint32_t fectl;		/* Fault control register */
+	uint32_t fedata;	/* Fault event interrupt data register */
+	uint32_t feaddr;	/* Fault event interrupt addr register */
+	uint32_t feuaddr; 	/* Upper address register */
+	uint64_t res3[2];	/* Reserved */
+	uint64_t aflog;		/* Advanced Fault control */
+	uint32_t res4; 		/* Reserved */
+	struct pmen_reg pmen;	/* Enable Protected Memory Region */
+	uint32_t plmbase;	/* PMRR Low addr */
+	uint32_t plmlimit;	/* PMRR low limit */
+	uint64_t phmbase;	/* pmrr high base addr */
+	uint64_t phmlimit;	/* pmrr high limit */
+	struct iqh_reg iqh;	/* Invalidation queue head register */
+	struct iqt_reg iqt;		/* Invalidation queue tail register */
+	struct iqa_reg iqa;		/* Invalidation queue addr register */
+	uint32_t res5;		/* Reserved */
+	uint32_t ics;		/* Invalidation complete status register */
+	uint32_t icec;		/* Invalidation Completion Event Control Register */
+	uint32_t iced; 		/* Invalidation Queue Event message data register */
+	uint32_t icea;		/* Invalidation Completion Event Address Register */
+	uint32_t iceua; 	/* Invalidation Completion Event Upper Address Register */
+	uint64_t res6;		/* Reserved */
+	uint64_t irtar;		/* Interrupt Remapping Table Address Register */
+	struct iva_reg iva;	/* Invalidate Address Register */
+	struct iotlb_reg iotlb; /* IOTLB Invalidate Register */
+	struct frcd_reg frr[FRCD_REG_NUM];	/* Fault recording register */
+} __attribute__((__packed__));
+
+/* Invalidation queue descriptors */
+enum { 	IQ_CONTEXT_INVD_DESC_ID = 1, 
+	IQ_IOTLB_INVD_DESC_ID = 2, 
+	IQ_DEV_IOTLB_INVD_DESC_ID = 3,
+	IQ_INT_CACHE_INV_DESC_ID = 4,
+	IQ_INVD_WAIT_DESC_ID = 5 };
+
+struct iommu_generic_desc {
+	uint32_t id		: 4;
+	uint64_t res1		: 60;
+	uint64_t res2		: 64; 
+} __attribute__((__packed__));
+
+struct iotlb_invd_desc {
+	uint32_t id		: 4;
+	uint32_t g		: 2;
+	uint32_t dw		: 1;
+	uint32_t dr		: 1;
+	uint32_t res1		: 8;
+	uint32_t did		: 16;
+	uint32_t res2		: 32;
+	uint32_t am		: 6;
+	uint32_t ih		: 1;
+	uint32_t res3		: 5;
+	uint64_t addr		: 52;
+	struct iotlb_reg iotlb;
+} __attribute__((__packed__));
+
+struct context_cache_invd_desc {
+	uint32_t id		: 4;
+	uint32_t g		: 2;
+	uint32_t res1		: 10;
+	uint32_t did		: 16;
+	uint32_t sid		: 16;
+	uint32_t fm		: 2;
+	uint32_t res2		: 14;
+	uint64_t res3		: 64;	
+} __attribute__((__packed__));
+
+struct inv_wait_desc {
+	uint32_t id		: 4;
+	uint32_t iflag		: 1;
+	uint32_t sw		: 1;
+	uint32_t fn		: 1;
+	uint32_t res1		: 25;
+	uint32_t stat_data	: 32;
+	uint32_t res2		: 2;
+	uint64_t stat_addr	: 62;
+} __attribute__((__packed__));
+
+struct int_entry_cache_invd_desc {
+	uint32_t id 		: 4;
+	uint32_t g		: 1;
+	uint32_t res1		: 22;
+	uint32_t im		: 5;
+	uint32_t iidx		: 16;
+	uint32_t res2		: 16;
+	uint64_t res3		: 64;
+} __attribute__((__packed__));
+
+struct dev_iotlb_invd_desc {
+	uint32_t id		: 4;
+	uint32_t res1		: 12;
+	uint32_t max_invs_pend 	: 5;
+	uint32_t res2		: 11;
+	uint32_t sid		: 16;
+	uint32_t res3		: 16;
+	uint32_t s		: 1;
+	uint32_t res4		: 11;
+	uint64_t addr		: 52;
+} __attribute__((__packed__));
+
+struct dmar_status
+{
+	struct dmar_regs_region dmar_regs;
+};
+
+struct iommu_state {
+	int mmio_index;
+	int invalidation_queue;
+	int enabled;
+	struct dmar_status dmar_status[DMAR_DEVICES_NUM];
+};
+
+static struct iommu_state iommu_state; 			/* Static state */
+
+/* Forward declaration */
+static
+uint64_t __iommu_phy_addr_translate(target_phys_addr_t addr,
+                            uint8_t* access_perm, uint8_t busnr, 
+			    uint8_t devfn, uint64_t* size);
+
+static 
+int intel_iommu_process_iq(struct dmar_status* dmar_status);
+
+static 
+void iommu_reset_regs(struct dmar_regs_region* dmar) {
+	memset(dmar, 0, sizeof(*dmar));
+	dmar->ver.min=1;
+	dmar->cap.sps=0; /* Super-pages disabled */
+	dmar->cap.mgaw = MGAW - 1; 
+	dmar->cap.sagaw = AGAW_30_BIT | AGAW_39_BIT | AGAW_48_BIT;
+	dmar->cap.nfr = FRCD_REG_NUM;
+	dmar->cap.psi=1;	/* enabling page selective invalidation */
+	dmar->cap.mamv = 63; /* setting mamv to its maximal value */
+	dmar->cap.cm=1;	/* Caching mode is set to track non-present->present */
+	dmar->ecap.pt=1;
+	dmar->ecap.ch=1;
+	dmar->ecap.c=1;	
+	/* Assertions to make sure the offsets are a multiple of 16 */
+	assert((offsetof(struct dmar_regs_region, frr)) % 16 == 0);
+	assert((offsetof(struct dmar_regs_region, iva)) % 16 == 0);
+	
+	dmar->cap.fro = (offsetof(struct dmar_regs_region, frr)) / 16;
+	dmar->ecap.iro = (offsetof(struct dmar_regs_region, iva)) / 16;
+	dmar->ecap.qi = iommu_state.invalidation_queue;
+}
+
+/*
+ * Emulate the side effects of a register write.
+ *
+ * Called by intel_iommu_writel() after the written value has already been
+ * stored in the register file. addr identifies the register: its low 12 bits,
+ * rounded down to a dword boundary, are compared against the field offsets of
+ * struct dmar_regs_region. Offsets without a case below have no side effect.
+ */
+static void
+intel_iommu_write_status_update(struct dmar_status* dmar_status, target_phys_addr_t addr)
+{
+	int offset = addr & 0xFFF & ~3;
+	struct dmar_regs_region* dmar_regs = &(dmar_status->dmar_regs);
+
+	switch (offset)
+	{
+	case offsetof(struct dmar_regs_region, gcmd):	{
+		/* gcmd already holds the new value; mirror it into the status register */
+		dmar_regs->gsts.fls = dmar_regs->gcmd.sfl;		/* set fault log */
+		dmar_regs->gsts.rtps |= dmar_regs->gcmd.srtp;	/* set root table pointer (OR-ed: never cleared here) */
+		dmar_regs->gsts.wbfs = dmar_regs->gcmd.wbf;		/* write buffer flush */
+		dmar_regs->gsts.tes = dmar_regs->gcmd.te;		/* translation enable */
+		dmar_regs->gcmd.srtp = 0; /* to ease detection of setting of the bit */
+		/* qies is only ever switched on, and only when queued invalidation was configured */
+		if (iommu_state.invalidation_queue && dmar_regs->gsts.qies == 0 &&
+			dmar_regs->gcmd.qie == 1) {
+			dmar_regs->gsts.qies = 1;
+
+		}
+		break;
+		}
+	/* 64-bit registers are matched on either of their two dwords */
+	case offsetof(struct dmar_regs_region, ccmd):
+	case offsetof(struct dmar_regs_region, ccmd)+4:
+		dmar_regs->ccmd.caig = dmar_regs->ccmd.cirg;		
+		dmar_regs->ccmd.icc = 0;	/* Clearing the invalidate context cache to "emulate" invalidation */
+		break;
+	case offsetof(struct dmar_regs_region, pmen):
+		dmar_regs->pmen.prs = dmar_regs->pmen.epm;		/* enable memory protection */
+		break;
+	case offsetof(struct dmar_regs_region, iotlb):		/* this one is actually unnecessary */
+	case offsetof(struct dmar_regs_region, iotlb)+4:
+	case offsetof(struct dmar_regs_region, iva):
+	case offsetof(struct dmar_regs_region, iva)+4:
+		/* an invalidation was requested: report it as completed immediately */
+		if (dmar_regs->iotlb.ivt == 1) {
+			dmar_regs->iva.addr = 0;
+			dmar_regs->iva.am = 0;
+			dmar_regs->iva.ih = 0;
+			dmar_regs->iotlb.iaig = dmar_regs->iotlb.iirg; 	/* actual granularity matches requested */
+			dmar_regs->iotlb.ivt = 0;						/* marking invalidation is over */
+		}
+		break;
+	case offsetof(struct dmar_regs_region, iqt):
+	case offsetof(struct dmar_regs_region, iqt)+4:
+		/* a write to the queue tail triggers processing of the queued descriptors */
+		if (iommu_state.invalidation_queue && 
+			dmar_regs->gsts.qies == 1) 
+			intel_iommu_process_iq(dmar_status);
+		break;
+	default:
+		;
+	}
+	
+}
+
+
+/*
+Software is expected to access 32-bit registers as aligned doublewords. For example, to modify a
+field (e.g., bit or byte) in a 32-bit register, the entire doubleword is read, the appropriate field(s)
+are modified, and the entire doubleword is written back.
+*/
+
+/*
+ * 32-bit MMIO write handler for the DMAR register pages.
+ *
+ * opaque - the struct iommu_state passed to cpu_register_io_memory()
+ * addr   - address of the access: the bits above the register page select
+ *          the DMAR unit, the low 12 bits the offset inside its registers
+ * val    - value to store
+ *
+ * The value is stored as-is in the register file, after which
+ * intel_iommu_write_status_update() emulates the side effects. Writes to a
+ * non-existent unit, or that do not fit entirely inside the register
+ * structure, are dropped.
+ */
+static void
+intel_iommu_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+	struct iommu_state* d = opaque;
+	unsigned int offset = (addr & 0xFFF);
+	unsigned int dmar_idx = (addr & ~(DMAR_REGS_PAGE-1)) >> DMAR_REGS_PAGE_SHIFT;
+	struct dmar_regs_region* dmar_regs;
+
+	if (dmar_idx >= DMAR_DEVICES_NUM)
+		return;
+
+	dmar_regs = &(d->dmar_status[dmar_idx].dmar_regs);
+	/*
+	 * All four bytes have to land inside the structure. Rejecting only
+	 * offset >= sizeof() would let a store that starts in the last three
+	 * bytes run past the end of the register file.
+	 */
+	if (offset > sizeof(*dmar_regs) - sizeof(val))
+		return;			/* outside the registers area - reserved */
+	/* memcpy: same bytes as a direct store, but valid for any alignment */
+	memcpy((uint8_t *)dmar_regs + offset, &val, sizeof(val));
+	intel_iommu_write_status_update(&(d->dmar_status[dmar_idx]), addr);
+}
+
+/*
+ * 16-bit MMIO write. Forwarded as a 32-bit write of the enclosing aligned
+ * dword with the value moved to its byte lane; the remaining bytes of the
+ * dword are written as zero (hardware without byte enables, no
+ * read-modify-write).
+ */
+static void
+intel_iommu_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    target_phys_addr_t dword_addr = addr & ~3;
+    uint32_t lane_val = (val & 0xffff) << (8*(addr & 3));
+
+    intel_iommu_writel(opaque, dword_addr, lane_val);
+}
+
+/*
+ * 8-bit MMIO write. Forwarded as a 32-bit write of the enclosing aligned
+ * dword with the byte moved to its lane; the remaining bytes of the dword
+ * are written as zero (hardware without byte enables, no read-modify-write).
+ */
+static void
+intel_iommu_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    target_phys_addr_t dword_addr = addr & ~3;
+    uint32_t lane_val = (val & 0xff) << (8*(addr & 3));
+
+    intel_iommu_writel(opaque, dword_addr, lane_val);
+}
+
+
+/*
+ * 32-bit MMIO read handler for the DMAR register pages.
+ *
+ * opaque - the struct iommu_state passed to cpu_register_io_memory()
+ * addr   - address of the access: the bits above the register page select
+ *          the DMAR unit, the low 12 bits the offset inside its registers
+ *
+ * Returns the stored register contents, or 0 when the access targets a
+ * non-existent unit or does not fit entirely inside the register structure.
+ */
+static uint32_t
+intel_iommu_readl(void *opaque, target_phys_addr_t addr)
+{
+	struct iommu_state *d = opaque;
+	unsigned int offset = (addr & 0xfff);
+	unsigned int dmar_idx = (addr & ~(DMAR_REGS_PAGE-1)) >> DMAR_REGS_PAGE_SHIFT;
+	const struct dmar_regs_region *dmar_regs;
+	uint32_t val;
+
+	/* same unit check as the write path; without it the array index is unchecked */
+	if (dmar_idx >= DMAR_DEVICES_NUM)
+		return 0;
+
+	dmar_regs = &(d->dmar_status[dmar_idx].dmar_regs);
+	if (offset > sizeof(*dmar_regs) - sizeof(val)) {
+		return 0; /* outside the boundary of the registers */
+	}
+	/* memcpy: same bytes as a direct load, but valid for any alignment */
+	memcpy(&val, (const uint8_t *)dmar_regs + offset, sizeof(val));
+	return val;
+}
+
+/* 8-bit MMIO read: fetch the enclosing aligned dword and extract the addressed byte. */
+static uint32_t
+intel_iommu_readb(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t dword = intel_iommu_readl(opaque, addr & ~3);
+
+    dword >>= (8 * (addr & 3));
+    return dword & 0xff;
+}
+
+/* 16-bit MMIO read: fetch the enclosing aligned dword and extract 16 bits starting at the addressed byte. */
+static uint32_t
+intel_iommu_readw(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t dword = intel_iommu_readl(opaque, addr & ~3);
+
+    dword >>= (8 * (addr & 3));
+    return dword & 0xffff;
+}
+
+/* Savevm handler: writes the whole struct iommu_state to f, one byte at a time. */
+static
+void iommu_save(QEMUFile *f, void *opaque)
+{
+	const unsigned char *cur = opaque;
+	const unsigned char *end = cur + sizeof(struct iommu_state);
+
+	while (cur < end) {
+		qemu_put_8s(f, cur);
+		cur++;
+	}
+}
+
+/*
+ * Loadvm handler: fills the whole struct iommu_state from f, one byte at a
+ * time. version_id is not examined. Always returns 0.
+ */
+static
+int iommu_load(QEMUFile *f, void *opaque, int version_id)
+{
+	uint8_t *cur = opaque;
+	uint8_t *end = cur + sizeof(struct iommu_state);
+
+	while (cur < end) {
+		qemu_get_8s(f, cur);
+		cur++;
+	}
+	return 0;
+}
+
+
+/* Reset handler: puts the registers of every DMAR unit back to their reset values. */
+static
+void iommu_reset(void *opaque)
+{
+	struct iommu_state *state = opaque;
+	struct dmar_status *unit = state->dmar_status;
+	struct dmar_status *end = unit + DMAR_DEVICES_NUM;
+
+	for (; unit != end; unit++)
+		iommu_reset_regs(&unit->dmar_regs);
+}
+
+/* MMIO write handlers, listed in byte, word, dword order. */
+static CPUWriteMemoryFunc *intel_iommu_write[] = {
+    intel_iommu_writeb,
+    intel_iommu_writew,
+    intel_iommu_writel
+};
+
+/* MMIO read handlers, listed in byte, word, dword order. */
+static CPUReadMemoryFunc *intel_iommu_read[] = {
+    intel_iommu_readb,
+    intel_iommu_readw,
+    intel_iommu_readl
+};
+
+/*
+ * Register the MMIO handlers and map one register page per DMAR unit,
+ * starting at DMAR_REG_BASE. The I/O index is remembered in the state.
+ */
+static
+void iommu_mmio_map(struct iommu_state* iommu_state) {
+    const target_phys_addr_t base = DMAR_REG_BASE;
+    const ram_addr_t span = DMAR_REGS_PAGE * DMAR_DEVICES_NUM;
+    int io_index;
+
+    io_index = cpu_register_io_memory(intel_iommu_read, intel_iommu_write,
+          iommu_state);
+    iommu_state->mmio_index = io_index;
+    cpu_register_physical_memory(base, span, io_index);
+}
+
+/*
+ * Apply the command-line option value.
+ * "on" enables the IOMMU; "queue" enables it together with queued
+ * invalidation. Returns 0 on success, -1 for NULL or any other string.
+ */
+int intel_iommu_configure(const char *opt) {
+    struct iommu_state *is = &iommu_state;
+    int want_queue;
+
+    if (opt == NULL)
+	return -1;
+
+    want_queue = (strcmp(opt, "queue") == 0);
+    if (!want_queue && strcmp(opt, "on") != 0)
+	return -1;
+
+    is->enabled = 1;
+    if (want_queue)
+	is->invalidation_queue = 1;
+
+    return 0;
+}
+
+
+/* Translation related functions */
+/* Number of page-table levels for a context-entry address-width value: aw + 2. */
+static inline int aw_to_levels(int aw) {
+	const int base_levels = 2;
+
+	return base_levels + aw;
+}
+
+
+/* Bit position at which the table index of the given level starts in an address. */
+static inline int level_shift(int level) {
+	int stride_bits = level*LEVEL_STRIDE;
+
+	return VTD_PAGE_SHIFT + stride_bits;
+}
+
+/* Number of bytes covered by one entry of the given level: 2^level_shift(level). */
+static inline uint64_t level_size(int level) {
+	const int shift = level_shift(level);
+
+	return 1ULL << shift;
+}
+
+/* Index into the table of the given level that addr selects (LEVEL_STRIDE bits wide). */
+static inline uint64_t level_offset(uint64_t addr, int level) {
+	const uint64_t index_mask = (1ULL<<LEVEL_STRIDE)-1;
+	uint64_t shifted = addr >> level_shift(level);
+
+	return shifted & index_mask;
+}
+
+/* Mask selecting the address bits below level_shift(level). */
+static inline uint64_t level_mask(int level) {
+	const uint64_t one = 1;
+	uint64_t span = one << level_shift(level);
+
+	return span - 1;
+}
+
+/*
+ * Main translation function: walks the root table, the context table and
+ * the page tables for one DMA address.
+ *
+ * addr        - address issued by the device
+ * access_perm - out: IOMMU_READ / IOMMU_WRITE bits the mapping allows;
+ *               0 on a fault
+ * busnr       - bus number of the requester, indexes the root table
+ * devfn       - device/function of the requester, indexes the context table
+ * size        - out, may be NULL: number of bytes from addr to the end of
+ *               the mapped page; only written when the page tables were
+ *               walked successfully
+ *
+ * Returns the translated address; addr itself when translation is disabled
+ * or the context entry selects pass-through; -1 on a fault.
+ *
+ * Only the registers of DMAR unit 0 are consulted.
+ */
+static
+target_phys_addr_t __iommu_phy_addr_translate(target_phys_addr_t addr,
+                            uint8_t* access_perm, uint8_t busnr, uint8_t devfn, uint64_t* size)
+{
+	struct IommuRootEntry re;
+	struct IommuContextEntry ce;
+	struct IommuPageTableEntry pte;
+	int level;
+	uint64_t table_addr;
+	uint64_t pte_addr;
+	volatile struct dmar_regs_region* dmar_regs = &(iommu_state.dmar_status[0].dmar_regs);
+	uint64_t temp, addr_offset;
+
+	if (!size)
+		size = &temp;
+	*access_perm = IOMMU_READ | IOMMU_WRITE;
+
+	if (!dmar_regs->gsts.tes) {
+		return addr;				/* Translation is disabled */
+	}
+	if (!dmar_regs->gsts.rtps) {
+		*access_perm = 0;
+		return -1;
+	}
+
+	/*
+	 * Reading the root entry. Done only now, so that guest memory is not
+	 * read through a root pointer the guest has not set up yet.
+	 */
+	cpu_physical_memory_rw(dmar_regs->readdr+(busnr*sizeof(re)), (uint8_t *)&re, sizeof(re), 0);
+	if (!re.p) {
+		*access_perm = 0;
+		return -1;
+	}
+
+	/*
+	 * Reading the relevant context entry. The bit-field is copied to a
+	 * 64-bit variable before shifting so that no high bits are lost.
+	 */
+	table_addr = re.ctp;
+	table_addr <<= VTD_PAGE_SHIFT;
+	cpu_physical_memory_rw(table_addr + devfn*sizeof(ce),
+			(uint8_t *)&ce, sizeof(ce), 0);
+	if (!ce.p) {
+		*access_perm = 0;
+		return -1;
+	}
+
+	if (ce.t == TRANSLATE_RESERVED) {
+		*access_perm = 0;
+		return -1;
+	}
+	if (ce.t == TRANSLATE_PASS_THROUGH) {
+		return addr;
+	}
+
+	level = aw_to_levels(ce.aw);		/* Analyzing the levels number from the context */
+	/* aw comes from guest memory: refuse widths that would need a shift of 64 bits or more */
+	if (level_shift(level - 1) >= 64) {
+		*access_perm = 0;
+		return -1;
+	}
+
+	/* root of the page tables: a page frame number, like the other table pointers */
+	pte_addr = ce.asr;
+	pte_addr <<= VTD_PAGE_SHIFT;
+
+	while (level > 0) {
+		pte_addr += level_offset(addr, level-1)*sizeof(pte);
+		cpu_physical_memory_rw(pte_addr,
+				(uint8_t *)&pte, sizeof(pte), 0);
+		pte_addr = pte.addr;
+		pte_addr <<= VTD_PAGE_SHIFT;
+		if (!pte.r)
+			(*access_perm) &= (~IOMMU_READ);
+		if (!pte.w)
+			(*access_perm) &= (~IOMMU_WRITE);
+		if (*access_perm == 0)
+			return -1;	/* nothing is permitted: fault, like the other failure paths */
+		/* from here on 'level' names the level whose span this entry maps */
+		level--;
+		if (pte.sp)
+			break;		/* Super page: the entry maps level_size(level) bytes */
+	}
+	addr_offset = addr & level_mask(level);
+	*size = level_size(level) - addr_offset;
+	/* pte_addr is already a byte address, for 4K pages and super pages alike */
+	return pte_addr + addr_offset;
+}
+
+/*
+ * Translation callback registered by intel_iommu_init().
+ *
+ * Translates addr for the device identified by devfn on bus 0. When err is
+ * not NULL and the mapping does not grant the requested kind of access
+ * (write if is_write is non-zero, read otherwise), *err is set to -1; *err is
+ * left untouched otherwise. Returns whatever the page walk returned.
+ */
+static uint64_t intel_iommu_phy_addr_translate(target_phys_addr_t addr,
+                            int is_write, int devfn, int* err)
+{
+	uint8_t granted;
+	uint16_t wanted;
+	uint64_t translated;
+
+	if (is_write)
+		wanted = IOMMU_WRITE;
+	else
+		wanted = IOMMU_READ;
+
+	translated = __iommu_phy_addr_translate(addr, &granted, 0, devfn, NULL);
+	if (err != NULL && (wanted & granted) == 0)
+		*err = -1;
+
+	return translated;
+}
+
+/* Low byte of a 16-bit source id. */
+static inline
+uint8_t get_devfn(uint16_t sid)
+{
+	const uint16_t low_byte = 0xFF;
+
+	return sid & low_byte;
+}
+
+/* High byte of a 16-bit source id. */
+static inline
+uint8_t get_busnr(uint16_t sid)
+{
+	uint16_t high = sid >> 8;
+
+	return high & 0xFF;
+}
+
+/* Register file of the given unit state; dmar_unit is currently not used. */
+static inline
+struct dmar_regs_region* get_dmar_regs(struct dmar_status* dmar_status,
+	uint16_t dmar_unit)
+{
+	struct dmar_regs_region* regs = &dmar_status->dmar_regs;
+
+	return regs;
+}
+
+/*
+ * Execute an invalidation wait descriptor.
+ *
+ * Descriptors that request an interrupt are not supported and are ignored.
+ * When a status write is requested (sw), the status data dword is stored at
+ * the guest address stat_addr << 2; the descriptor field holds the address
+ * without its two low bits.
+ *
+ * The dword is written exactly once. Writing it additionally through a
+ * mapping of the unshifted field value would modify an unrelated guest
+ * location.
+ */
+static
+void intel_iommu_iq_wait(struct dmar_status* dmar_status,
+	struct inv_wait_desc* desc)
+{
+	if (desc->iflag) {
+		pr_debug("interrupts are not supported");
+		return;
+	}
+	if (desc->sw) {
+		uint32_t data = desc->stat_data;
+		/* widen the bit-field first so the shift cannot drop high bits */
+		uint64_t status_addr = desc->stat_addr;
+
+		status_addr <<= 2;
+		cpu_physical_memory_rw(status_addr,
+			(uint8_t *)&data, sizeof(data), 1);
+	}
+}
+
+/*
+ * Dispatch one invalidation-queue descriptor on its id.
+ *
+ * Context-cache and IOTLB invalidations need no action here; wait
+ * descriptors are handed to intel_iommu_iq_wait(); device-IOTLB and
+ * interrupt-cache invalidations are only logged as not implemented.
+ * Returns 0 for every known id and -1 for an unknown one.
+ */
+static
+int iommu_process_iq_desc (struct iommu_generic_desc* desc,
+	struct dmar_status* dmar_status)
+{
+	switch (desc->id)
+	{
+	case IQ_CONTEXT_INVD_DESC_ID:
+	case IQ_IOTLB_INVD_DESC_ID:
+		return 0;
+	case IQ_INVD_WAIT_DESC_ID:
+		intel_iommu_iq_wait(dmar_status, (struct inv_wait_desc*)desc);
+		return 0;
+	case IQ_DEV_IOTLB_INVD_DESC_ID:
+		pr_debug("IQ_DEV_IOTLB_INVD_DESC_ID is not implemented");
+		return 0;
+	case IQ_INT_CACHE_INV_DESC_ID:
+		pr_debug("IQ_INT_CACHE_INV_DESC_ID is not implemented");
+		return 0;
+	default:
+		pr_debug("invalid descriptor id %x", (uint32_t)desc->id);
+		return -1;
+	}
+}
+
+/*
+ * Process the pending invalidation-queue descriptors of one DMAR unit.
+ *
+ * Descriptors are read from guest memory starting at the head index and
+ * handed to iommu_process_iq_desc() until the head reaches the tail; the
+ * head wraps at the queue length, 2^(8 + qs) entries. The result of the
+ * individual descriptors is not examined.
+ *
+ * Returns 0 when the queue was drained and -1 when the tail index lies
+ * outside the queue. In that case nothing is processed, because the head
+ * wraps at the queue length and could never reach such a tail.
+ */
+static
+int intel_iommu_process_iq(struct dmar_status* dmar_status)
+{
+	struct dmar_regs_region* dmar_regs =
+		&(dmar_status->dmar_regs);
+
+	struct iqa_reg* iqa = &dmar_regs->iqa;
+	struct iqt_reg* iqt = &dmar_regs->iqt;
+	struct iqh_reg* iqh = &dmar_regs->iqh;
+	size_t len = (size_t)1 << (8 + iqa->qs);
+	size_t desc_size = sizeof(struct iommu_generic_desc);
+	/* widen the bit-field first so the shift cannot drop high bits */
+	uint64_t qaddr = iqa->iqa;
+	struct iommu_generic_desc desc;
+
+	qaddr <<= VTD_PAGE_SHIFT;
+
+	if (iqt->qt >= len) {
+		pr_debug("invalidation queue tail is outside the queue");
+		return -1;
+	}
+
+	while (iqh->qh != iqt->qt) {
+		cpu_physical_memory_rw(qaddr + (iqh->qh)*desc_size, (uint8_t *)&desc, sizeof(desc), 0);
+		iommu_process_iq_desc(&desc, dmar_status);
+		iqh->qh++;
+		iqh->qh %= len;
+	}
+	return 0;
+}
+
+/*
+ * Bring up the IOMMU emulation; does nothing unless it was enabled through
+ * intel_iommu_configure(). Installs the translation callback, maps the
+ * register pages, registers the savevm and reset handlers and performs an
+ * initial reset.
+ */
+void intel_iommu_init(void) {
+    static const char info_str[] = "intel-iommu";
+    struct iommu_state *is = &iommu_state;
+
+    if (is->enabled) {
+	set_iommu_translation_fn(intel_iommu_phy_addr_translate);
+	iommu_mmio_map(is);
+	register_savevm(info_str, -1, 1, iommu_save, iommu_load, is);
+	qemu_register_reset(iommu_reset, is);
+	iommu_reset(is);
+    }
+}
+
diff --git a/hw/intel_iommu.h b/hw/intel_iommu.h
new file mode 100644
index 0000000..e8235c7
--- /dev/null
+++ b/hw/intel_iommu.h
@@ -0,0 +1,18 @@ 
+/* Public interface of the Intel IOMMU (VT-d) emulation. */
+#ifndef _INTEL_IOMMU_H_
+#define _INTEL_IOMMU_H_
+
+#include "hw.h"
+
+struct QemuOpts;
+
+/*
+ * Init the IOMMU emulation.
+ * Does nothing unless intel_iommu_configure() enabled the IOMMU beforehand.
+ */
+void intel_iommu_init(void);
+
+/*
+ * Configure the IOMMU from the command-line option value.
+ * Accepts "on" and "queue"; returns 0 on success and -1 for NULL or any
+ * other string.
+ */
+int intel_iommu_configure(const char* opt);
+
+/*
+ * The address translation routine is not exported from this header;
+ * intel_iommu_init() registers it through set_iommu_translation_fn().
+ */
+#endif
diff --git a/hw/pc.c b/hw/pc.c
index 0aebae9..2652ab5 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -46,6 +46,7 @@ 
 #include "elf.h"
 #include "multiboot.h"
 #include "device-assignment.h"
+#include "intel_iommu.h"
 
 #include "kvm.h"
 
@@ -1028,6 +1029,7 @@  static void pc_init1(ram_addr_t ram_size,
 
     isa_dev = isa_create_simple("i8042");
     DMA_init(0);
+    intel_iommu_init();
 #ifdef HAS_AUDIO
     audio_init(pci_enabled ? pci_bus : NULL, isa_irq);
 #endif
diff --git a/vl.c b/vl.c
index d959fdb..613a58e 100644
--- a/vl.c
+++ b/vl.c
@@ -136,6 +136,7 @@  int main(int argc, char **argv)
 #include "hw/xen.h"
 #include "hw/qdev.h"
 #include "hw/loader.h"
+#include "hw/intel_iommu.h"
 #include "bt-host.h"
 #include "net.h"
 #include "net/slirp.h"
@@ -5750,6 +5751,13 @@  int main(int argc, char **argv, char **envp)
                     fclose(fp);
                     break;
                 }
+	    case QEMU_OPTION_intel_iommu:
+		{
+		    if (intel_iommu_configure(optarg) < 0) {
+			fprintf(stderr, "Wrong intel_iommu configuration\n");
+			exit(1);
+		    }
+		}
             }
         }
     }