Patchwork [RFC,2/4] AMD IOMMU emulation

login
register
mail settings
Submitter Eduard - Gabriel Munteanu
Date Aug. 4, 2010, 10:32 p.m.
Message ID <d09ebf953e219ad9250e0f1a29acec4628cd0613.1280958471.git.eduard.munteanu@linux360.ro>
Download mbox | patch
Permalink /patch/60901/
State New
Headers show

Comments

Eduard - Gabriel Munteanu - Aug. 4, 2010, 10:32 p.m.
This introduces emulation for the AMD IOMMU, described in "AMD I/O
Virtualization Technology (IOMMU) Specification".

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
---
 Makefile.target |    2 +
 configure       |   10 +
 hw/amd_iommu.c  |  671 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/pc.c         |    4 +
 hw/pc.h         |    3 +
 hw/pci_ids.h    |    2 +
 hw/pci_regs.h   |    1 +
 7 files changed, 693 insertions(+), 0 deletions(-)
 create mode 100644 hw/amd_iommu.c
Blue Swirl - Aug. 5, 2010, 9:31 p.m.
On Wed, Aug 4, 2010 at 10:32 PM, Eduard - Gabriel Munteanu
<eduard.munteanu@linux360.ro> wrote:
> This introduces emulation for the AMD IOMMU, described in "AMD I/O
> Virtualization Technology (IOMMU) Specification".
>
> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> ---
>  Makefile.target |    2 +
>  configure       |   10 +
>  hw/amd_iommu.c  |  671 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/pc.c         |    4 +
>  hw/pc.h         |    3 +
>  hw/pci_ids.h    |    2 +
>  hw/pci_regs.h   |    1 +
>  7 files changed, 693 insertions(+), 0 deletions(-)
>  create mode 100644 hw/amd_iommu.c
>
> diff --git a/Makefile.target b/Makefile.target
> index 70a9c1b..86226a0 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -219,6 +219,8 @@ obj-i386-y += pcspk.o i8254.o
>  obj-i386-$(CONFIG_KVM_PIT) += i8254-kvm.o
>  obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += device-assignment.o
>
> +obj-i386-$(CONFIG_AMD_IOMMU) += amd_iommu.o

Make this unconditional.

> +
>  # Hardware support
>  obj-ia64-y += ide.o pckbd.o vga.o $(SOUND_HW) dma.o $(AUDIODRV)
>  obj-ia64-y += fdc.o mc146818rtc.o serial.o i8259.o ipf.o
> diff --git a/configure b/configure
> index af50607..7448603 100755
> --- a/configure
> +++ b/configure
> @@ -317,6 +317,7 @@ io_thread="no"
>  mixemu="no"
>  kvm_cap_pit=""
>  kvm_cap_device_assignment=""
> +amd_iommu="no"
>  kerneldir=""
>  aix="no"
>  blobs="yes"
> @@ -629,6 +630,8 @@ for opt do
>   ;;
>   --enable-kvm-device-assignment) kvm_cap_device_assignment="yes"
>   ;;
> +  --enable-amd-iommu-emul) amd_iommu="yes"
> +  ;;
>   --enable-profiler) profiler="yes"
>   ;;
>   --enable-cocoa)
> @@ -871,6 +874,8 @@ echo "  --disable-kvm-pit        disable KVM pit support"
>  echo "  --enable-kvm-pit         enable KVM pit support"
>  echo "  --disable-kvm-device-assignment  disable KVM device assignment support"
>  echo "  --enable-kvm-device-assignment   enable KVM device assignment support"
> +echo "  --disable-amd-iommu-emul disable AMD IOMMU emulation"
> +echo "  --enable-amd-iommu-emul  enable AMD IOMMU emulation"
>  echo "  --disable-nptl           disable usermode NPTL support"
>  echo "  --enable-nptl            enable usermode NPTL support"
>  echo "  --enable-system          enable all system emulation targets"
> @@ -2251,6 +2256,7 @@ echo "Install blobs     $blobs"
>  echo "KVM support       $kvm"
>  echo "KVM PIT support   $kvm_cap_pit"
>  echo "KVM device assig. $kvm_cap_device_assignment"
> +echo "AMD IOMMU emul.   $amd_iommu"
>  echo "fdt support       $fdt"
>  echo "preadv support    $preadv"
>  echo "fdatasync         $fdatasync"
> @@ -2645,6 +2651,10 @@ case "$target_arch2" in
>   x86_64)
>     TARGET_BASE_ARCH=i386
>     target_phys_bits=64
> +    if test "$amd_iommu" = "yes"; then
> +      echo "CONFIG_AMD_IOMMU=y" >> $config_target_mak
> +      echo "CONFIG_PCI_IOMMU=y" >> $config_host_mak
> +    fi

Drop all configure changes.

>   ;;
>   ia64)
>     target_phys_bits=64
> diff --git a/hw/amd_iommu.c b/hw/amd_iommu.c
> new file mode 100644
> index 0000000..ff9903e
> --- /dev/null
> +++ b/hw/amd_iommu.c
> @@ -0,0 +1,671 @@
> +/*
> + * AMD IOMMU emulation
> + *
> + * Copyright (c) 2010 Eduard - Gabriel Munteanu
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "pc.h"
> +#include "hw.h"
> +#include "pci.h"
> +#include "qlist.h"
> +
> +/* Capability registers */
> +#define CAPAB_HEADER            0x00
> +#define   CAPAB_REV_TYPE        0x02
> +#define   CAPAB_FLAGS           0x03
> +#define CAPAB_BAR_LOW           0x04
> +#define CAPAB_BAR_HIGH          0x08
> +#define CAPAB_RANGE             0x0C
> +#define CAPAB_MISC              0x10
> +
> +#define CAPAB_SIZE              0x14
> +
> +/* Capability header data */
> +#define CAPAB_FLAG_IOTLBSUP     (1 << 0)
> +#define CAPAB_FLAG_HTTUNNEL     (1 << 1)
> +#define CAPAB_FLAG_NPCACHE      (1 << 2)
> +#define CAPAB_INIT_REV          (1 << 3)
> +#define CAPAB_INIT_TYPE         3
> +#define CAPAB_INIT_REV_TYPE     (CAPAB_REV | CAPAB_TYPE)
> +#define CAPAB_INIT_FLAGS        (CAPAB_FLAG_NPCACHE | CAPAB_FLAG_HTTUNNEL)
> +#define CAPAB_INIT_MISC         (64 << 15) | (48 << 8)
> +#define CAPAB_BAR_MASK          ~((1UL << 14) - 1)
> +
> +/* MMIO registers */
> +#define MMIO_DEVICE_TABLE       0x0000
> +#define MMIO_COMMAND_BASE       0x0008
> +#define MMIO_EVENT_BASE         0x0010
> +#define MMIO_CONTROL            0x0018
> +#define MMIO_EXCL_BASE          0x0020
> +#define MMIO_EXCL_LIMIT         0x0028
> +#define MMIO_COMMAND_HEAD       0x2000
> +#define MMIO_COMMAND_TAIL       0x2008
> +#define MMIO_EVENT_HEAD         0x2010
> +#define MMIO_EVENT_TAIL         0x2018
> +#define MMIO_STATUS             0x2020
> +
> +#define MMIO_SIZE               0x4000
> +
> +#define MMIO_DEVTAB_SIZE_MASK   ((1ULL << 12) - 1)
> +#define MMIO_DEVTAB_BASE_MASK   (((1ULL << 52) - 1) & ~MMIO_DEVTAB_SIZE_MASK)
> +#define MMIO_DEVTAB_ENTRY_SIZE  32
> +#define MMIO_DEVTAB_SIZE_UNIT   4096
> +
> +#define MMIO_CMDBUF_SIZE_BYTE       (MMIO_COMMAND_BASE + 7)
> +#define MMIO_CMDBUF_SIZE_MASK       0x0F
> +#define MMIO_CMDBUF_BASE_MASK       MMIO_DEVTAB_BASE_MASK
> +#define MMIO_CMDBUF_DEFAULT_SIZE    8
> +#define MMIO_CMDBUF_HEAD_MASK       (((1ULL << 19) - 1) & ~0x0F)
> +#define MMIO_CMDBUF_TAIL_MASK       MMIO_EVTLOG_HEAD_MASK
> +
> +#define MMIO_EVTLOG_SIZE_BYTE       (MMIO_EVENT_BASE + 7)
> +#define MMIO_EVTLOG_SIZE_MASK       MMIO_CMDBUF_SIZE_MASK
> +#define MMIO_EVTLOG_BASE_MASK       MMIO_CMDBUF_BASE_MASK
> +#define MMIO_EVTLOG_DEFAULT_SIZE    MMIO_CMDBUF_DEFAULT_SIZE
> +#define MMIO_EVTLOG_HEAD_MASK       (((1ULL << 19) - 1) & ~0x0F)
> +#define MMIO_EVTLOG_TAIL_MASK       MMIO_EVTLOG_HEAD_MASK
> +
> +#define MMIO_EXCL_BASE_MASK         MMIO_DEVTAB_BASE_MASK
> +#define MMIO_EXCL_ENABLED_MASK      (1ULL << 0)
> +#define MMIO_EXCL_ALLOW_MASK        (1ULL << 1)
> +#define MMIO_EXCL_LIMIT_MASK        MMIO_DEVTAB_BASE_MASK
> +#define MMIO_EXCL_LIMIT_LOW         0xFFF
> +
> +#define MMIO_CONTROL_IOMMUEN        (1ULL << 0)
> +#define MMIO_CONTROL_HTTUNEN        (1ULL << 1)
> +#define MMIO_CONTROL_EVENTLOGEN     (1ULL << 2)
> +#define MMIO_CONTROL_EVENTINTEN     (1ULL << 3)
> +#define MMIO_CONTROL_COMWAITINTEN   (1ULL << 4)
> +#define MMIO_CONTROL_CMDBUFEN       (1ULL << 12)
> +
> +#define MMIO_STATUS_EVTLOG_OF       (1ULL << 0)
> +#define MMIO_STATUS_EVTLOG_INTR     (1ULL << 1)
> +#define MMIO_STATUS_COMWAIT_INTR    (1ULL << 2)
> +#define MMIO_STATUS_EVTLOG_RUN      (1ULL << 3)
> +#define MMIO_STATUS_CMDBUF_RUN      (1ULL << 4)
> +
> +#define CMDBUF_ID_BYTE              0x07
> +#define CMDBUF_ID_RSHIFT            4
> +#define CMDBUF_ENTRY_SIZE           0x10
> +
> +#define CMD_COMPLETION_WAIT         0x01
> +#define CMD_INVAL_DEVTAB_ENTRY      0x02
> +#define CMD_INVAL_IOMMU_PAGES       0x03
> +#define CMD_INVAL_IOTLB_PAGES       0x04
> +#define CMD_INVAL_INTR_TABLE        0x05
> +
> +#define DEVTAB_ENTRY_SIZE           32
> +
> +/* Device table entry bits 0:63 */
> +#define DEV_VALID                   (1ULL << 0)
> +#define DEV_TRANSLATION_VALID       (1ULL << 1)
> +#define DEV_MODE_MASK               0x7
> +#define DEV_MODE_RSHIFT             9
> +#define DEV_PT_ROOT_MASK            0xFFFFFFFFFF000
> +#define DEV_PT_ROOT_RSHIFT          12
> +#define DEV_PERM_SHIFT              61
> +#define DEV_PERM_READ               (1ULL << 61)
> +#define DEV_PERM_WRITE              (1ULL << 62)
> +
> +/* Device table entry bits 64:127 */
> +#define DEV_DOMAIN_ID_MASK          ((1ULL << 16) - 1)
> +#define DEV_IOTLB_SUPPORT           (1ULL << 17)
> +#define DEV_SUPPRESS_PF             (1ULL << 18)
> +#define DEV_SUPPRESS_ALL_PF         (1ULL << 19)
> +#define DEV_IOCTL_MASK              ~3
> +#define DEV_IOCTL_RSHIFT            20
> +#define   DEV_IOCTL_DENY            0
> +#define   DEV_IOCTL_PASSTHROUGH     1
> +#define   DEV_IOCTL_TRANSLATE       2
> +#define DEV_CACHE                   (1ULL << 37)
> +#define DEV_SNOOP_DISABLE           (1ULL << 38)
> +#define DEV_EXCL                    (1ULL << 39)
> +
> +struct amd_iommu_invalidator {
> +    int                     devfn;
> +    PCIInvalidateIOTLBFunc  *func;
> +    void                    *opaque;
> +    QLIST_ENTRY(amd_iommu_invalidator) list;
> +};

This should be AMDIOMMUInvalidator with typedef.

> +
> +struct amd_iommu_state {
> +    PCIDevice                   dev;
> +
> +    int                         capab_offset;
> +    unsigned char               *capab;
> +
> +    int                         mmio_index;
> +    target_phys_addr_t          mmio_addr;
> +    unsigned char               *mmio_buf;
> +    int                         mmio_enabled;
> +
> +    int                         enabled;
> +    int                         ats_enabled;
> +
> +    target_phys_addr_t          devtab;
> +    size_t                      devtab_len;
> +
> +    target_phys_addr_t          cmdbuf;
> +    int                         cmdbuf_enabled;
> +    size_t                      cmdbuf_len;
> +    size_t                      cmdbuf_head;
> +    size_t                      cmdbuf_tail;
> +    int                         completion_wait_intr;
> +
> +    target_phys_addr_t          evtlog;
> +    int                         evtlog_enabled;
> +    int                         evtlog_intr;
> +    size_t                      evtlog_len;
> +    size_t                      evtlog_head;
> +    size_t                      evtlog_tail;
> +
> +    target_phys_addr_t          excl_base;
> +    target_phys_addr_t          excl_limit;
> +    int                         excl_enabled;
> +    int                         excl_allow;
> +
> +    QLIST_HEAD(, amd_iommu_invalidator) invalidators;
> +};

Likewise, AMDIOMMUState.

> +static void amd_iommu_register_invalidator(PCIIOMMU *iommu,
> +                                           PCIDevice *dev,
> +                                           pci_addr_t addr,
> +                                           PCIInvalidateIOTLBFunc *cb,
> +                                           void *opaque)
> +{
> +    struct amd_iommu_invalidator *inval;
> +    struct amd_iommu_state *st = iommu->opaque;
> +
> +    inval = qemu_malloc(sizeof(struct amd_iommu_invalidator));
> +    inval->devfn    = dev->devfn;
> +    inval->func     = cb;
> +    inval->opaque   = opaque;
> +
> +    QLIST_INSERT_HEAD(&st->invalidators, inval, list);
> +}
> +
> +static void amd_iommu_completion_wait(struct amd_iommu_state *st,
> +                                      uint8_t *cmd)
> +{
> +    uint64_t addr;
> +
> +    if (cmd[0] & 1) {
> +        addr = le64_to_cpu(*(uint64_t *) cmd) & 0xFFFFFFFFFFFF8;
> +        cpu_physical_memory_write(addr, cmd + 8, 8);
> +    }
> +
> +    if (cmd[0] & 2)
> +        st->mmio_buf[MMIO_STATUS] |= MMIO_STATUS_COMWAIT_INTR;
> +}
> +
> +static void amd_iommu_inval_iotlb(struct amd_iommu_state *st,
> +                                  uint8_t *cmd)
> +{
> +    struct amd_iommu_invalidator *inval;
> +    int devfn = *(uint16_t *) cmd;
> +
> +    QLIST_FOREACH(inval, &st->invalidators, list) {
> +        if (inval->devfn == devfn) {
> +            inval->func(inval->opaque);
> +            QLIST_REMOVE(inval, list);
> +        }
> +    }
> +}
> +
> +static void amd_iommu_cmdbuf_run(struct amd_iommu_state *st)
> +{
> +    uint8_t cmd[16];
> +    int type;
> +
> +    if (!st->cmdbuf_enabled)
> +        return;
> +
> +    /* Check if there's work to do. */
> +    if (st->cmdbuf_head == st->cmdbuf_tail)
> +        return;
> +
> +    cpu_physical_memory_read(st->cmdbuf + st->cmdbuf_head, cmd, 16);
> +    type = cmd[CMDBUF_ID_BYTE] >> CMDBUF_ID_RSHIFT;
> +    switch (type) {
> +        case CMD_COMPLETION_WAIT:
> +            amd_iommu_completion_wait(st, cmd);
> +            break;
> +        case CMD_INVAL_DEVTAB_ENTRY:
> +            break;
> +        case CMD_INVAL_IOMMU_PAGES:
> +            break;
> +        case CMD_INVAL_IOTLB_PAGES:
> +            amd_iommu_inval_iotlb(st, cmd);
> +            break;
> +        case CMD_INVAL_INTR_TABLE:
> +            break;
> +        default:
> +            break;
> +    }
> +
> +    /* Increment and wrap head pointer. */
> +    st->cmdbuf_head += CMDBUF_ENTRY_SIZE;
> +    if (st->cmdbuf_head >= st->cmdbuf_len)
> +        st->cmdbuf_head = 0;
> +}
> +
> +static uint32_t amd_iommu_mmio_buf_read(struct amd_iommu_state *st,
> +                                        size_t offset,
> +                                        size_t size)
> +{
> +    ssize_t i;
> +    uint32_t ret;
> +
> +    if (!size)
> +        return 0;
> +
> +    ret = st->mmio_buf[offset + size - 1];
> +    for (i = size - 2; i >= 0; i--) {
> +        ret <<= 8;
> +        ret |= st->mmio_buf[offset + i];
> +    }
> +
> +    return ret;
> +}
> +
> +static void amd_iommu_mmio_buf_write(struct amd_iommu_state *st,
> +                                     size_t offset,
> +                                     size_t size,
> +                                     uint32_t val)
> +{
> +    size_t i;
> +
> +    for (i = 0; i < size; i++) {
> +        st->mmio_buf[offset + i] = val & 0xFF;
> +        val >>= 8;
> +    }
> +}
> +
> +static void amd_iommu_update_mmio(struct amd_iommu_state *st,
> +                                  target_phys_addr_t addr)
> +{
> +    size_t reg = addr & ~0x07;
> +    uint64_t *base = (uint64_t *) &st->mmio_buf[reg];
> +    uint64_t val = *base;
> +
> +    switch (reg) {
> +        case MMIO_CONTROL:
> +            st->enabled              = !!(val & MMIO_CONTROL_IOMMUEN);
> +            st->ats_enabled          = !!(val & MMIO_CONTROL_HTTUNEN);
> +            st->evtlog_enabled       = st->enabled &&
> +                                       !!(val & MMIO_CONTROL_EVENTLOGEN);
> +            st->evtlog_intr          = !!(val & MMIO_CONTROL_EVENTINTEN);
> +            st->completion_wait_intr = !!(val & MMIO_CONTROL_COMWAITINTEN);
> +            st->cmdbuf_enabled       = st->enabled &&
> +                                       !!(val & MMIO_CONTROL_CMDBUFEN);
> +
> +            /* Update status flags depending on the control register. */
> +            if (st->cmdbuf_enabled)
> +                st->mmio_buf[MMIO_STATUS] |= MMIO_STATUS_CMDBUF_RUN;
> +            else
> +                st->mmio_buf[MMIO_STATUS] &= ~MMIO_STATUS_CMDBUF_RUN;
> +            if (st->evtlog_enabled)
> +                st->mmio_buf[MMIO_STATUS] |= MMIO_STATUS_EVTLOG_RUN;
> +            else
> +                st->mmio_buf[MMIO_STATUS] &= ~MMIO_STATUS_EVTLOG_RUN;
> +
> +            amd_iommu_cmdbuf_run(st);
> +            break;
> +        case MMIO_DEVICE_TABLE:
> +            st->devtab = (target_phys_addr_t) (val & MMIO_DEVTAB_BASE_MASK);
> +            st->devtab_len = ((val & MMIO_DEVTAB_SIZE_MASK) + 1) *
> +                             (MMIO_DEVTAB_SIZE_UNIT / MMIO_DEVTAB_ENTRY_SIZE);
> +            break;
> +        case MMIO_COMMAND_BASE:
> +            st->cmdbuf = (target_phys_addr_t) (val & MMIO_CMDBUF_BASE_MASK);
> +            st->cmdbuf_len = 1UL << (st->mmio_buf[MMIO_CMDBUF_SIZE_BYTE] &
> +                                     MMIO_CMDBUF_SIZE_MASK);
> +            amd_iommu_cmdbuf_run(st);
> +            break;
> +        case MMIO_COMMAND_HEAD:
> +            st->cmdbuf_head = val & MMIO_CMDBUF_HEAD_MASK;
> +            amd_iommu_cmdbuf_run(st);
> +            break;
> +        case MMIO_COMMAND_TAIL:
> +            st->cmdbuf_tail = val & MMIO_CMDBUF_TAIL_MASK;
> +            amd_iommu_cmdbuf_run(st);
> +            break;
> +        case MMIO_EVENT_BASE:
> +            st->evtlog = (target_phys_addr_t) (val & MMIO_EVTLOG_BASE_MASK);
> +            st->evtlog_len = 1UL << (st->mmio_buf[MMIO_EVTLOG_SIZE_BYTE] &
> +                                     MMIO_EVTLOG_SIZE_MASK);
> +            break;
> +        case MMIO_EVENT_HEAD:
> +            st->evtlog_head = val & MMIO_EVTLOG_HEAD_MASK;
> +            break;
> +        case MMIO_EVENT_TAIL:
> +            st->evtlog_tail = val & MMIO_EVTLOG_TAIL_MASK;
> +            break;
> +        case MMIO_EXCL_BASE:
> +            st->excl_base = (target_phys_addr_t) (val & MMIO_EXCL_BASE_MASK);
> +            st->excl_enabled = val & MMIO_EXCL_ENABLED_MASK;
> +            st->excl_allow = val & MMIO_EXCL_ALLOW_MASK;
> +            break;
> +        case MMIO_EXCL_LIMIT:
> +            st->excl_limit = (target_phys_addr_t) ((val & MMIO_EXCL_LIMIT_MASK) |
> +                                                   MMIO_EXCL_LIMIT_LOW);
> +            break;
> +        default:
> +            break;
> +    }
> +}
> +
> +static uint32_t amd_iommu_mmio_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    struct amd_iommu_state *st = opaque;
> +
> +    return amd_iommu_mmio_buf_read(st, addr, 1);
> +}
> +
> +static uint32_t amd_iommu_mmio_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    struct amd_iommu_state *st = opaque;
> +
> +    return amd_iommu_mmio_buf_read(st, addr, 2);
> +}
> +
> +static uint32_t amd_iommu_mmio_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    struct amd_iommu_state *st = opaque;
> +
> +    return amd_iommu_mmio_buf_read(st, addr, 4);
> +}
> +
> +static void amd_iommu_mmio_writeb(void *opaque,
> +                                  target_phys_addr_t addr,
> +                                  uint32_t val)
> +{
> +    struct amd_iommu_state *st = opaque;
> +
> +    amd_iommu_mmio_buf_write(st, addr, 1, val);
> +    amd_iommu_update_mmio(st, addr);
> +}
> +
> +static void amd_iommu_mmio_writew(void *opaque,
> +                                  target_phys_addr_t addr,
> +                                  uint32_t val)
> +{
> +    struct amd_iommu_state *st = opaque;
> +
> +    amd_iommu_mmio_buf_write(st, addr, 2, val);
> +    amd_iommu_update_mmio(st, addr);
> +}
> +
> +static void amd_iommu_mmio_writel(void *opaque,
> +                                  target_phys_addr_t addr,
> +                                  uint32_t val)
> +{
> +    struct amd_iommu_state *st = opaque;
> +
> +    amd_iommu_mmio_buf_write(st, addr, 4, val);
> +    amd_iommu_update_mmio(st, addr);
> +}
> +
> +static CPUReadMemoryFunc * const amd_iommu_mmio_read[] = {
> +    amd_iommu_mmio_readb,
> +    amd_iommu_mmio_readw,
> +    amd_iommu_mmio_readl,
> +};
> +
> +static CPUWriteMemoryFunc * const amd_iommu_mmio_write[] = {
> +    amd_iommu_mmio_writeb,
> +    amd_iommu_mmio_writew,
> +    amd_iommu_mmio_writel,
> +};
> +
> +static void amd_iommu_init_mmio(struct amd_iommu_state *st)
> +{
> +    st->mmio_buf[MMIO_CMDBUF_SIZE_BYTE] = MMIO_CMDBUF_DEFAULT_SIZE;
> +    st->mmio_buf[MMIO_EVTLOG_SIZE_BYTE] = MMIO_EVTLOG_DEFAULT_SIZE;
> +}
> +
> +static void amd_iommu_enable_mmio(struct amd_iommu_state *st)
> +{
> +    target_phys_addr_t addr;
> +
> +    st->mmio_index = cpu_register_io_memory(amd_iommu_mmio_read,
> +                                            amd_iommu_mmio_write, st);
> +    if (st->mmio_index < 0)
> +        return;
> +
> +    addr = le64_to_cpu(*(uint64_t *) &st->capab[CAPAB_BAR_LOW]) & CAPAB_BAR_MASK;
> +    cpu_register_physical_memory(addr, MMIO_SIZE, st->mmio_index);
> +
> +    st->mmio_addr = addr;
> +    st->mmio_buf = qemu_mallocz(MMIO_SIZE);
> +    st->mmio_enabled = 1;
> +    amd_iommu_init_mmio(st);
> +}
> +
> +static uint32_t amd_iommu_read_capab(PCIDevice *pci_dev,
> +                                     uint32_t addr, int len)
> +{
> +    return pci_default_cap_read_config(pci_dev, addr, len);
> +}
> +
> +static void amd_iommu_write_capab(PCIDevice *dev,
> +                                  uint32_t addr, uint32_t val, int len)
> +{
> +    struct amd_iommu_state *st;
> +    unsigned char *capab;
> +    int reg;
> +
> +    st = DO_UPCAST(struct amd_iommu_state, dev, dev);
> +    capab = st->capab;
> +    reg = (addr - 0x40) & ~0x3;  /* Get the 32-bits register. */
> +
> +    switch (reg) {
> +        case CAPAB_HEADER:
> +        case CAPAB_MISC:
> +            /* Read-only. */
> +            return;
> +        case CAPAB_BAR_LOW:
> +        case CAPAB_BAR_HIGH:
> +        case CAPAB_RANGE:
> +            if (st->mmio_enabled)
> +                return;
> +            pci_default_cap_write_config(dev, addr, val, len);
> +            break;
> +        default:
> +            return;
> +    }
> +
> +    if (capab[CAPAB_BAR_LOW] & 0x1)
> +        amd_iommu_enable_mmio(st);
> +}
> +
> +static int amd_iommu_init_capab(PCIDevice *dev)
> +{
> +    struct amd_iommu_state *st;
> +    unsigned char *capab;
> +
> +    st = DO_UPCAST(struct amd_iommu_state, dev, dev);
> +    capab = st->dev.config + st->capab_offset;
> +
> +    capab[CAPAB_REV_TYPE]  = CAPAB_REV_TYPE;
> +    capab[CAPAB_FLAGS]     = CAPAB_FLAGS;
> +    capab[CAPAB_BAR_LOW]   = 0;
> +    capab[CAPAB_BAR_HIGH]  = 0;
> +    capab[CAPAB_RANGE]     = 0;
> +    *((uint32_t *) &capab[CAPAB_MISC]) = cpu_to_le32(CAPAB_INIT_MISC);
> +
> +    st->capab = capab;
> +    st->dev.cap.length = CAPAB_SIZE;
> +
> +    return 0;
> +}
> +
> +static int amd_iommu_translate(PCIIOMMU *iommu,
> +                               PCIDevice *dev,
> +                               pci_addr_t addr,
> +                               target_phys_addr_t *paddr,
> +                               int *len,
> +                               unsigned perms);

Please move the implementation here to avoid this declaration.

> +static int amd_iommu_pci_initfn(PCIDevice *dev)
> +{
> +    struct amd_iommu_state *st;
> +    PCIIOMMU *iommu;
> +    int err;
> +
> +    st = DO_UPCAST(struct amd_iommu_state, dev, dev);
> +
> +    pci_config_set_vendor_id(st->dev.config, PCI_VENDOR_ID_AMD);
> +    pci_config_set_device_id(st->dev.config, PCI_DEVICE_ID_AMD_IOMMU);
> +    pci_config_set_class(st->dev.config, PCI_CLASS_SYSTEM_IOMMU);
> +
> +    st->capab_offset = pci_add_capability(&st->dev,
> +                                          PCI_CAP_ID_SEC,
> +                                          CAPAB_SIZE);
> +    err = pci_enable_capability_support(&st->dev, st->capab_offset,
> +                                        amd_iommu_read_capab,
> +                                        amd_iommu_write_capab,
> +                                        amd_iommu_init_capab);
> +    if (err)
> +        return err;
> +
> +    iommu = qemu_mallocz(sizeof(PCIIOMMU));
> +    iommu->opaque = st;
> +    iommu->translate = amd_iommu_translate;
> +    iommu->register_iotlb_invalidator = amd_iommu_register_invalidator;
> +    pci_register_iommu(dev, iommu);

I'd avoid the structure and just pass the stuff to pci_register_iommu
as function arguments.

> +
> +    return 0;
> +}
> +
> +static const VMStateDescription vmstate_amd_iommu = {
> +    .name                       = "amd-iommu",
> +    .version_id                 = 1,
> +    .minimum_version_id         = 1,
> +    .minimum_version_id_old     = 1,
> +    .fields                     = (VMStateField []) {
> +        VMSTATE_PCI_DEVICE(dev, struct amd_iommu_state),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static PCIDeviceInfo amd_iommu_pci_info = {
> +    .qdev.name    = "amd-iommu",
> +    .qdev.desc    = "AMD IOMMU",
> +    .qdev.size    = sizeof(struct amd_iommu_state),
> +    .qdev.vmsd    = &vmstate_amd_iommu,
> +    .init         = amd_iommu_pci_initfn,
> +};
> +
> +void amd_iommu_init(PCIBus *bus)
> +{
> +    pci_create_simple(bus, -1, "amd-iommu");
> +}

Just open code this in pc.c.

> +
> +static void amd_iommu_register(void)
> +{
> +    pci_qdev_register(&amd_iommu_pci_info);
> +}
> +
> +device_init(amd_iommu_register);
> +
> +static void amd_iommu_page_fault(struct amd_iommu_state *st,
> +                                 int devfn,
> +                                 unsigned domid,
> +                                 target_phys_addr_t addr,
> +                                 int present,
> +                                 int is_write)
> +{
> +    uint16_t entry[8];
> +    uint64_t *entry_addr = (uint64_t *) &entry[4];
> +
> +    entry[0] = cpu_to_le16(devfn);
> +    entry[1] = 0;
> +    entry[2] = cpu_to_le16(domid);
> +    entry[3] = (2UL << 12) | (!!present << 4) | (!!is_write << 5);
> +    *entry_addr = cpu_to_le64(addr);
> +
> +    cpu_physical_memory_write((target_phys_addr_t) st->evtlog + st->evtlog_tail, (uint8_t *) &entry, 128);
> +    st->evtlog_tail += 128;
> +}
> +
> +static inline uint64_t amd_iommu_get_perms(uint64_t entry)
> +{
> +    return (entry & (DEV_PERM_READ | DEV_PERM_WRITE)) >> DEV_PERM_SHIFT;
> +}
> +
> +static int amd_iommu_translate(PCIIOMMU *iommu,
> +                               PCIDevice *dev,
> +                               pci_addr_t addr,
> +                               target_phys_addr_t *paddr,
> +                               int *len,
> +                               unsigned perms)
> +{
> +    int devfn, present;
> +    target_phys_addr_t entry_addr, pte_addr;
> +    uint64_t entry[4], pte, page_offset, pte_perms;
> +    unsigned level, domid;
> +    struct amd_iommu_state *st = iommu->opaque;
> +
> +    if (!st->enabled)
> +        goto no_translation;
> +
> +    /* Get device table entry. */
> +    devfn = dev->devfn;
> +    entry_addr = st->devtab + devfn * DEVTAB_ENTRY_SIZE;
> +    cpu_physical_memory_read(entry_addr, (uint8_t *) entry, 32);
> +
> +    pte = entry[0];
> +    if (!(pte & DEV_VALID) || !(pte & DEV_TRANSLATION_VALID)) {
> +        goto no_translation;
> +    }
> +    domid = entry[1] & DEV_DOMAIN_ID_MASK;
> +    level = (pte >> DEV_MODE_RSHIFT) & DEV_MODE_MASK;
> +    while (level > 0) {
> +        /*
> +         * Check permissions: the bitwise
> +         * implication perms -> entry_perms must be true.
> +         */
> +        pte_perms = amd_iommu_get_perms(pte);
> +        present = pte & 1;
> +        if (!present || perms != (perms & pte_perms)) {
> +            amd_iommu_page_fault(st, devfn, domid, addr,
> +                                 present, !!(perms & IOMMU_PERM_WRITE));
> +            return -EPERM;
> +        }
> +
> +        /* Go to the next lower level. */
> +        pte_addr = pte & DEV_PT_ROOT_MASK;
> +        pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
> +        pte = ldq_phys(pte_addr);
> +        level = (pte >> DEV_MODE_RSHIFT) & DEV_MODE_MASK;
> +    }
> +    page_offset = addr & 4095;
> +    *paddr = (pte & DEV_PT_ROOT_MASK) + page_offset;
> +    *len = 4096 - page_offset;
> +
> +    return 0;
> +
> +no_translation:
> +    *paddr = addr;
> +    *len = INT_MAX;
> +    return 0;
> +}
> diff --git a/hw/pc.c b/hw/pc.c
> index 186e322..4c929f9 100644
> --- a/hw/pc.c
> +++ b/hw/pc.c
> @@ -1066,6 +1066,10 @@ void pc_pci_device_init(PCIBus *pci_bus)
>     int max_bus;
>     int bus;
>
> +#ifdef CONFIG_AMD_IOMMU
> +    amd_iommu_init(pci_bus);
> +#endif
> +
>     max_bus = drive_get_max_bus(IF_SCSI);
>     for (bus = 0; bus <= max_bus; bus++) {
>         pci_create_simple(pci_bus, -1, "lsi53c895a");
> diff --git a/hw/pc.h b/hw/pc.h
> index 3ef2f75..255ad93 100644
> --- a/hw/pc.h
> +++ b/hw/pc.h
> @@ -191,4 +191,7 @@ void extboot_init(BlockDriverState *bs);
>
>  int e820_add_entry(uint64_t, uint64_t, uint32_t);
>
> +/* amd_iommu.c */
> +void amd_iommu_init(PCIBus *bus);
> +
>  #endif
> diff --git a/hw/pci_ids.h b/hw/pci_ids.h
> index 39e9f1d..d790312 100644
> --- a/hw/pci_ids.h
> +++ b/hw/pci_ids.h
> @@ -26,6 +26,7 @@
>
>  #define PCI_CLASS_MEMORY_RAM             0x0500
>
> +#define PCI_CLASS_SYSTEM_IOMMU           0x0806
>  #define PCI_CLASS_SYSTEM_OTHER           0x0880
>
>  #define PCI_CLASS_SERIAL_USB             0x0c03
> @@ -56,6 +57,7 @@
>
>  #define PCI_VENDOR_ID_AMD                0x1022
>  #define PCI_DEVICE_ID_AMD_LANCE          0x2000
> +#define PCI_DEVICE_ID_AMD_IOMMU          0x0000     /* FIXME */
>
>  #define PCI_VENDOR_ID_MOTOROLA           0x1057
>  #define PCI_DEVICE_ID_MOTOROLA_MPC106    0x0002
> diff --git a/hw/pci_regs.h b/hw/pci_regs.h
> index 1c675dc..6399b5d 100644
> --- a/hw/pci_regs.h
> +++ b/hw/pci_regs.h
> @@ -216,6 +216,7 @@
>  #define  PCI_CAP_ID_SHPC       0x0C    /* PCI Standard Hot-Plug Controller */
>  #define  PCI_CAP_ID_SSVID      0x0D    /* Bridge subsystem vendor/device ID */
>  #define  PCI_CAP_ID_AGP3       0x0E    /* AGP Target PCI-PCI bridge */
> +#define  PCI_CAP_ID_SEC     0x0F    /* Secure Device (AMD IOMMU) */

Indentation seems to be off.

>  #define  PCI_CAP_ID_EXP        0x10    /* PCI Express */
>  #define  PCI_CAP_ID_MSIX       0x11    /* MSI-X */
>  #define  PCI_CAP_ID_AF         0x13    /* PCI Advanced Features */
> --
> 1.7.1
>
>
>
Eduard - Gabriel Munteanu - Aug. 6, 2010, 12:41 a.m.
On Thu, Aug 05, 2010 at 09:31:58PM +0000, Blue Swirl wrote:
> On Wed, Aug 4, 2010 at 10:32 PM, Eduard - Gabriel Munteanu
> <eduard.munteanu@linux360.ro> wrote:

[snip]

> > diff --git a/Makefile.target b/Makefile.target
> > index 70a9c1b..86226a0 100644
> > --- a/Makefile.target
> > +++ b/Makefile.target
> > @@ -219,6 +219,8 @@ obj-i386-y += pcspk.o i8254.o
> > ??obj-i386-$(CONFIG_KVM_PIT) += i8254-kvm.o
> > ??obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += device-assignment.o
> >
> > +obj-i386-$(CONFIG_AMD_IOMMU) += amd_iommu.o
> 
> Make this unconditional.
> 

[snip]

> 
> Drop all configure changes.
> 

Alright, so it's not going to be a compile-time configurable option.
I'll make some cmdline option for it and make really sure I don't mess
performance in hot paths.

(I'm actually happy to know it's gonna go in that way.)

[snip]

> > +struct amd_iommu_invalidator {
> > + ?? ??int ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? devfn;
> > + ?? ??PCIInvalidateIOTLBFunc ??*func;
> > + ?? ??void ?? ?? ?? ?? ?? ?? ?? ?? ?? ??*opaque;
> > + ?? ??QLIST_ENTRY(amd_iommu_invalidator) list;
> > +};
> 
> This should be AMDIOMMUInvalidator with typedef.
> 
> > +
> > +struct amd_iommu_state {

[snip]

> > +};
> 
> Likewise, AMDIOMMUState.
>

[snip]

> > +static int amd_iommu_translate(PCIIOMMU *iommu,
> > + ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? PCIDevice *dev,
> > + ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? pci_addr_t addr,
> > + ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? target_phys_addr_t *paddr,
> > + ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? int *len,
> > + ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? unsigned perms);
> 
> Please move the implementation here to avoid this declaration.
> 

[snip]

> > + ?? ??iommu = qemu_mallocz(sizeof(PCIIOMMU));
> > + ?? ??iommu->opaque = st;
> > + ?? ??iommu->translate = amd_iommu_translate;
> > + ?? ??iommu->register_iotlb_invalidator = amd_iommu_register_invalidator;
> > + ?? ??pci_register_iommu(dev, iommu);
> 
> I'd avoid the structure and just pass the stuff to pci_register_iommu
> as function arguments.
> 

[snip]

> > +void amd_iommu_init(PCIBus *bus)
> > +{
> > + ?? ??pci_create_simple(bus, -1, "amd-iommu");
> > +}
> 
> Just open code this in pc.c.
> 

Roger, I'll fix these.

[snip]

> > ??#define PCI_VENDOR_ID_MOTOROLA ?? ?? ?? ?? ?? 0x1057
> > ??#define PCI_DEVICE_ID_MOTOROLA_MPC106 ?? ??0x0002
> > diff --git a/hw/pci_regs.h b/hw/pci_regs.h
> > index 1c675dc..6399b5d 100644
> > --- a/hw/pci_regs.h
> > +++ b/hw/pci_regs.h
> > @@ -216,6 +216,7 @@
> > ??#define ??PCI_CAP_ID_SHPC ?? ?? ?? 0x0C ?? ??/* PCI Standard Hot-Plug Controller */
> > ??#define ??PCI_CAP_ID_SSVID ?? ?? ??0x0D ?? ??/* Bridge subsystem vendor/device ID */
> > ??#define ??PCI_CAP_ID_AGP3 ?? ?? ?? 0x0E ?? ??/* AGP Target PCI-PCI bridge */
> > +#define ??PCI_CAP_ID_SEC ?? ?? 0x0F ?? ??/* Secure Device (AMD IOMMU) */
> 
> Indentation seems to be off.
> 
> > ??#define ??PCI_CAP_ID_EXP ?? ?? ?? ??0x10 ?? ??/* PCI Express */
> > ??#define ??PCI_CAP_ID_MSIX ?? ?? ?? 0x11 ?? ??/* MSI-X */
> > ??#define ??PCI_CAP_ID_AF ?? ?? ?? ?? 0x13 ?? ??/* PCI Advanced Features */
> > --
> > 1.7.1

The original has tabs instead of spaces, but my changes line up
properly. Which way should I go, convert it all to spaces, add my line
with tabs or leave it like this? Of course, any cleanup would go in a
separate patch.


	Thanks,
	Eduard

Patch

diff --git a/Makefile.target b/Makefile.target
index 70a9c1b..86226a0 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -219,6 +219,8 @@  obj-i386-y += pcspk.o i8254.o
 obj-i386-$(CONFIG_KVM_PIT) += i8254-kvm.o
 obj-i386-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += device-assignment.o
 
+obj-i386-$(CONFIG_AMD_IOMMU) += amd_iommu.o
+
 # Hardware support
 obj-ia64-y += ide.o pckbd.o vga.o $(SOUND_HW) dma.o $(AUDIODRV)
 obj-ia64-y += fdc.o mc146818rtc.o serial.o i8259.o ipf.o
diff --git a/configure b/configure
index af50607..7448603 100755
--- a/configure
+++ b/configure
@@ -317,6 +317,7 @@  io_thread="no"
 mixemu="no"
 kvm_cap_pit=""
 kvm_cap_device_assignment=""
+amd_iommu="no"
 kerneldir=""
 aix="no"
 blobs="yes"
@@ -629,6 +630,8 @@  for opt do
   ;;
   --enable-kvm-device-assignment) kvm_cap_device_assignment="yes"
   ;;
+  --enable-amd-iommu-emul) amd_iommu="yes"
+  ;;
   --enable-profiler) profiler="yes"
   ;;
   --enable-cocoa)
@@ -871,6 +874,8 @@  echo "  --disable-kvm-pit        disable KVM pit support"
 echo "  --enable-kvm-pit         enable KVM pit support"
 echo "  --disable-kvm-device-assignment  disable KVM device assignment support"
 echo "  --enable-kvm-device-assignment   enable KVM device assignment support"
+echo "  --disable-amd-iommu-emul disable AMD IOMMU emulation"
+echo "  --enable-amd-iommu-emul  enable AMD IOMMU emulation"
 echo "  --disable-nptl           disable usermode NPTL support"
 echo "  --enable-nptl            enable usermode NPTL support"
 echo "  --enable-system          enable all system emulation targets"
@@ -2251,6 +2256,7 @@  echo "Install blobs     $blobs"
 echo "KVM support       $kvm"
 echo "KVM PIT support   $kvm_cap_pit"
 echo "KVM device assig. $kvm_cap_device_assignment"
+echo "AMD IOMMU emul.   $amd_iommu"
 echo "fdt support       $fdt"
 echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
@@ -2645,6 +2651,10 @@  case "$target_arch2" in
   x86_64)
     TARGET_BASE_ARCH=i386
     target_phys_bits=64
+    if test "$amd_iommu" = "yes"; then
+      echo "CONFIG_AMD_IOMMU=y" >> $config_target_mak
+      echo "CONFIG_PCI_IOMMU=y" >> $config_host_mak
+    fi
   ;;
   ia64)
     target_phys_bits=64
diff --git a/hw/amd_iommu.c b/hw/amd_iommu.c
new file mode 100644
index 0000000..ff9903e
--- /dev/null
+++ b/hw/amd_iommu.c
@@ -0,0 +1,671 @@ 
+/*
+ * AMD IOMMU emulation
+ *
+ * Copyright (c) 2010 Eduard - Gabriel Munteanu
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "pc.h"
+#include "hw.h"
+#include "pci.h"
+#include "qlist.h"
+
+/* Capability registers */
+#define CAPAB_HEADER            0x00
+#define   CAPAB_REV_TYPE        0x02
+#define   CAPAB_FLAGS           0x03
+#define CAPAB_BAR_LOW           0x04
+#define CAPAB_BAR_HIGH          0x08
+#define CAPAB_RANGE             0x0C
+#define CAPAB_MISC              0x10
+
+#define CAPAB_SIZE              0x14
+
+/* Capability header data */
+#define CAPAB_FLAG_IOTLBSUP     (1 << 0)
+#define CAPAB_FLAG_HTTUNNEL     (1 << 1)
+#define CAPAB_FLAG_NPCACHE      (1 << 2)
+#define CAPAB_INIT_REV          (1 << 3)
+#define CAPAB_INIT_TYPE         3
+#define CAPAB_INIT_REV_TYPE     (CAPAB_REV | CAPAB_TYPE)
+#define CAPAB_INIT_FLAGS        (CAPAB_FLAG_NPCACHE | CAPAB_FLAG_HTTUNNEL)
+#define CAPAB_INIT_MISC         (64 << 15) | (48 << 8)
+#define CAPAB_BAR_MASK          ~((1UL << 14) - 1)
+
+/* MMIO registers */
+#define MMIO_DEVICE_TABLE       0x0000
+#define MMIO_COMMAND_BASE       0x0008
+#define MMIO_EVENT_BASE         0x0010
+#define MMIO_CONTROL            0x0018
+#define MMIO_EXCL_BASE          0x0020
+#define MMIO_EXCL_LIMIT         0x0028
+#define MMIO_COMMAND_HEAD       0x2000
+#define MMIO_COMMAND_TAIL       0x2008
+#define MMIO_EVENT_HEAD         0x2010
+#define MMIO_EVENT_TAIL         0x2018
+#define MMIO_STATUS             0x2020
+
+#define MMIO_SIZE               0x4000
+
+#define MMIO_DEVTAB_SIZE_MASK   ((1ULL << 12) - 1)
+#define MMIO_DEVTAB_BASE_MASK   (((1ULL << 52) - 1) & ~MMIO_DEVTAB_SIZE_MASK)
+#define MMIO_DEVTAB_ENTRY_SIZE  32
+#define MMIO_DEVTAB_SIZE_UNIT   4096
+
+#define MMIO_CMDBUF_SIZE_BYTE       (MMIO_COMMAND_BASE + 7)
+#define MMIO_CMDBUF_SIZE_MASK       0x0F
+#define MMIO_CMDBUF_BASE_MASK       MMIO_DEVTAB_BASE_MASK
+#define MMIO_CMDBUF_DEFAULT_SIZE    8
+#define MMIO_CMDBUF_HEAD_MASK       (((1ULL << 19) - 1) & ~0x0F)
+#define MMIO_CMDBUF_TAIL_MASK       MMIO_EVTLOG_HEAD_MASK
+
+#define MMIO_EVTLOG_SIZE_BYTE       (MMIO_EVENT_BASE + 7)
+#define MMIO_EVTLOG_SIZE_MASK       MMIO_CMDBUF_SIZE_MASK
+#define MMIO_EVTLOG_BASE_MASK       MMIO_CMDBUF_BASE_MASK
+#define MMIO_EVTLOG_DEFAULT_SIZE    MMIO_CMDBUF_DEFAULT_SIZE
+#define MMIO_EVTLOG_HEAD_MASK       (((1ULL << 19) - 1) & ~0x0F)
+#define MMIO_EVTLOG_TAIL_MASK       MMIO_EVTLOG_HEAD_MASK
+
+#define MMIO_EXCL_BASE_MASK         MMIO_DEVTAB_BASE_MASK
+#define MMIO_EXCL_ENABLED_MASK      (1ULL << 0)
+#define MMIO_EXCL_ALLOW_MASK        (1ULL << 1)
+#define MMIO_EXCL_LIMIT_MASK        MMIO_DEVTAB_BASE_MASK
+#define MMIO_EXCL_LIMIT_LOW         0xFFF
+
+#define MMIO_CONTROL_IOMMUEN        (1ULL << 0)
+#define MMIO_CONTROL_HTTUNEN        (1ULL << 1)
+#define MMIO_CONTROL_EVENTLOGEN     (1ULL << 2)
+#define MMIO_CONTROL_EVENTINTEN     (1ULL << 3)
+#define MMIO_CONTROL_COMWAITINTEN   (1ULL << 4)
+#define MMIO_CONTROL_CMDBUFEN       (1ULL << 12)
+
+#define MMIO_STATUS_EVTLOG_OF       (1ULL << 0)
+#define MMIO_STATUS_EVTLOG_INTR     (1ULL << 1)
+#define MMIO_STATUS_COMWAIT_INTR    (1ULL << 2)
+#define MMIO_STATUS_EVTLOG_RUN      (1ULL << 3)
+#define MMIO_STATUS_CMDBUF_RUN      (1ULL << 4)
+
+#define CMDBUF_ID_BYTE              0x07
+#define CMDBUF_ID_RSHIFT            4
+#define CMDBUF_ENTRY_SIZE           0x10
+
+#define CMD_COMPLETION_WAIT         0x01
+#define CMD_INVAL_DEVTAB_ENTRY      0x02
+#define CMD_INVAL_IOMMU_PAGES       0x03
+#define CMD_INVAL_IOTLB_PAGES       0x04
+#define CMD_INVAL_INTR_TABLE        0x05
+
+#define DEVTAB_ENTRY_SIZE           32
+
+/* Device table entry bits 0:63 */
+#define DEV_VALID                   (1ULL << 0)
+#define DEV_TRANSLATION_VALID       (1ULL << 1)
+#define DEV_MODE_MASK               0x7
+#define DEV_MODE_RSHIFT             9
+#define DEV_PT_ROOT_MASK            0xFFFFFFFFFF000
+#define DEV_PT_ROOT_RSHIFT          12
+#define DEV_PERM_SHIFT              61
+#define DEV_PERM_READ               (1ULL << 61)
+#define DEV_PERM_WRITE              (1ULL << 62)
+
+/* Device table entry bits 64:127 */
+#define DEV_DOMAIN_ID_MASK          ((1ULL << 16) - 1)
+#define DEV_IOTLB_SUPPORT           (1ULL << 17)
+#define DEV_SUPPRESS_PF             (1ULL << 18)
+#define DEV_SUPPRESS_ALL_PF         (1ULL << 19)
+#define DEV_IOCTL_MASK              ~3
+#define DEV_IOCTL_RSHIFT            20
+#define   DEV_IOCTL_DENY            0
+#define   DEV_IOCTL_PASSTHROUGH     1
+#define   DEV_IOCTL_TRANSLATE       2
+#define DEV_CACHE                   (1ULL << 37)
+#define DEV_SNOOP_DISABLE           (1ULL << 38)
+#define DEV_EXCL                    (1ULL << 39)
+
+struct amd_iommu_invalidator {
+    int                     devfn;
+    PCIInvalidateIOTLBFunc  *func;
+    void                    *opaque;
+    QLIST_ENTRY(amd_iommu_invalidator) list;
+};
+
+struct amd_iommu_state {
+    PCIDevice                   dev;
+
+    int                         capab_offset;
+    unsigned char               *capab;
+
+    int                         mmio_index;
+    target_phys_addr_t          mmio_addr;
+    unsigned char               *mmio_buf;
+    int                         mmio_enabled;
+
+    int                         enabled;
+    int                         ats_enabled;
+
+    target_phys_addr_t          devtab;
+    size_t                      devtab_len;
+
+    target_phys_addr_t          cmdbuf;
+    int                         cmdbuf_enabled;
+    size_t                      cmdbuf_len;
+    size_t                      cmdbuf_head;
+    size_t                      cmdbuf_tail;
+    int                         completion_wait_intr;
+
+    target_phys_addr_t          evtlog;
+    int                         evtlog_enabled;
+    int                         evtlog_intr;
+    size_t                      evtlog_len;
+    size_t                      evtlog_head;
+    size_t                      evtlog_tail;
+
+    target_phys_addr_t          excl_base;
+    target_phys_addr_t          excl_limit;
+    int                         excl_enabled;
+    int                         excl_allow;
+
+    QLIST_HEAD(, amd_iommu_invalidator) invalidators;
+};
+
+static void amd_iommu_register_invalidator(PCIIOMMU *iommu,
+                                           PCIDevice *dev,
+                                           pci_addr_t addr,
+                                           PCIInvalidateIOTLBFunc *cb,
+                                           void *opaque)
+{
+    struct amd_iommu_invalidator *inval;
+    struct amd_iommu_state *st = iommu->opaque;
+
+    inval = qemu_malloc(sizeof(struct amd_iommu_invalidator));
+    inval->devfn    = dev->devfn;
+    inval->func     = cb;
+    inval->opaque   = opaque;
+
+    QLIST_INSERT_HEAD(&st->invalidators, inval, list);
+}
+
+static void amd_iommu_completion_wait(struct amd_iommu_state *st,
+                                      uint8_t *cmd)
+{
+    uint64_t addr;
+
+    if (cmd[0] & 1) {
+        addr = le64_to_cpu(*(uint64_t *) cmd) & 0xFFFFFFFFFFFF8;
+        cpu_physical_memory_write(addr, cmd + 8, 8);
+    }
+
+    if (cmd[0] & 2)
+        st->mmio_buf[MMIO_STATUS] |= MMIO_STATUS_COMWAIT_INTR;
+}
+
+static void amd_iommu_inval_iotlb(struct amd_iommu_state *st,
+                                  uint8_t *cmd)
+{
+    struct amd_iommu_invalidator *inval;
+    int devfn = *(uint16_t *) cmd;
+
+    QLIST_FOREACH(inval, &st->invalidators, list) {
+        if (inval->devfn == devfn) {
+            inval->func(inval->opaque);
+            QLIST_REMOVE(inval, list);
+        }
+    }
+}
+
+static void amd_iommu_cmdbuf_run(struct amd_iommu_state *st)
+{
+    uint8_t cmd[16];
+    int type;
+
+    if (!st->cmdbuf_enabled)
+        return;
+
+    /* Check if there's work to do. */
+    if (st->cmdbuf_head == st->cmdbuf_tail)
+        return;
+
+    cpu_physical_memory_read(st->cmdbuf + st->cmdbuf_head, cmd, 16);
+    type = cmd[CMDBUF_ID_BYTE] >> CMDBUF_ID_RSHIFT;
+    switch (type) {
+        case CMD_COMPLETION_WAIT:
+            amd_iommu_completion_wait(st, cmd);
+            break;
+        case CMD_INVAL_DEVTAB_ENTRY:
+            break;
+        case CMD_INVAL_IOMMU_PAGES:
+            break;
+        case CMD_INVAL_IOTLB_PAGES:
+            amd_iommu_inval_iotlb(st, cmd);
+            break;
+        case CMD_INVAL_INTR_TABLE:
+            break;
+        default:
+            break;
+    }
+
+    /* Increment and wrap head pointer. */
+    st->cmdbuf_head += CMDBUF_ENTRY_SIZE;
+    if (st->cmdbuf_head >= st->cmdbuf_len)
+        st->cmdbuf_head = 0;
+}
+
+static uint32_t amd_iommu_mmio_buf_read(struct amd_iommu_state *st,
+                                        size_t offset,
+                                        size_t size)
+{
+    ssize_t i;
+    uint32_t ret;
+
+    if (!size)
+        return 0;
+
+    ret = st->mmio_buf[offset + size - 1];
+    for (i = size - 2; i >= 0; i--) {
+        ret <<= 8;
+        ret |= st->mmio_buf[offset + i];
+    }
+
+    return ret;
+}
+
+static void amd_iommu_mmio_buf_write(struct amd_iommu_state *st,
+                                     size_t offset,
+                                     size_t size,
+                                     uint32_t val)
+{
+    size_t i;
+
+    for (i = 0; i < size; i++) {
+        st->mmio_buf[offset + i] = val & 0xFF;
+        val >>= 8;
+    }
+}
+
+static void amd_iommu_update_mmio(struct amd_iommu_state *st,
+                                  target_phys_addr_t addr)
+{
+    size_t reg = addr & ~0x07;
+    uint64_t *base = (uint64_t *) &st->mmio_buf[reg];
+    uint64_t val = *base;
+
+    switch (reg) {
+        case MMIO_CONTROL:
+            st->enabled              = !!(val & MMIO_CONTROL_IOMMUEN);
+            st->ats_enabled          = !!(val & MMIO_CONTROL_HTTUNEN);
+            st->evtlog_enabled       = st->enabled &&
+                                       !!(val & MMIO_CONTROL_EVENTLOGEN);
+            st->evtlog_intr          = !!(val & MMIO_CONTROL_EVENTINTEN);
+            st->completion_wait_intr = !!(val & MMIO_CONTROL_COMWAITINTEN);
+            st->cmdbuf_enabled       = st->enabled &&
+                                       !!(val & MMIO_CONTROL_CMDBUFEN);
+            
+            /* Update status flags depending on the control register. */
+            if (st->cmdbuf_enabled)
+                st->mmio_buf[MMIO_STATUS] |= MMIO_STATUS_CMDBUF_RUN;
+            else
+                st->mmio_buf[MMIO_STATUS] &= ~MMIO_STATUS_CMDBUF_RUN;
+            if (st->evtlog_enabled)
+                st->mmio_buf[MMIO_STATUS] |= MMIO_STATUS_EVTLOG_RUN;
+            else
+                st->mmio_buf[MMIO_STATUS] &= ~MMIO_STATUS_EVTLOG_RUN;
+
+            amd_iommu_cmdbuf_run(st);
+            break;
+        case MMIO_DEVICE_TABLE:
+            st->devtab = (target_phys_addr_t) (val & MMIO_DEVTAB_BASE_MASK);
+            st->devtab_len = ((val & MMIO_DEVTAB_SIZE_MASK) + 1) *
+                             (MMIO_DEVTAB_SIZE_UNIT / MMIO_DEVTAB_ENTRY_SIZE);
+            break;
+        case MMIO_COMMAND_BASE:
+            st->cmdbuf = (target_phys_addr_t) (val & MMIO_CMDBUF_BASE_MASK);
+            st->cmdbuf_len = 1UL << (st->mmio_buf[MMIO_CMDBUF_SIZE_BYTE] &
+                                     MMIO_CMDBUF_SIZE_MASK);
+            amd_iommu_cmdbuf_run(st);
+            break;
+        case MMIO_COMMAND_HEAD:
+            st->cmdbuf_head = val & MMIO_CMDBUF_HEAD_MASK;
+            amd_iommu_cmdbuf_run(st);
+            break;
+        case MMIO_COMMAND_TAIL:
+            st->cmdbuf_tail = val & MMIO_CMDBUF_TAIL_MASK;
+            amd_iommu_cmdbuf_run(st);
+            break;
+        case MMIO_EVENT_BASE:
+            st->evtlog = (target_phys_addr_t) (val & MMIO_EVTLOG_BASE_MASK);
+            st->evtlog_len = 1UL << (st->mmio_buf[MMIO_EVTLOG_SIZE_BYTE] &
+                                     MMIO_EVTLOG_SIZE_MASK);
+            break;
+        case MMIO_EVENT_HEAD:
+            st->evtlog_head = val & MMIO_EVTLOG_HEAD_MASK;
+            break;
+        case MMIO_EVENT_TAIL:
+            st->evtlog_tail = val & MMIO_EVTLOG_TAIL_MASK;
+            break;
+        case MMIO_EXCL_BASE:
+            st->excl_base = (target_phys_addr_t) (val & MMIO_EXCL_BASE_MASK);
+            st->excl_enabled = val & MMIO_EXCL_ENABLED_MASK;
+            st->excl_allow = val & MMIO_EXCL_ALLOW_MASK;
+            break;
+        case MMIO_EXCL_LIMIT:
+            st->excl_limit = (target_phys_addr_t) ((val & MMIO_EXCL_LIMIT_MASK) |
+                                                   MMIO_EXCL_LIMIT_LOW);
+            break;
+        default:
+            break;
+    }
+}
+
+static uint32_t amd_iommu_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    struct amd_iommu_state *st = opaque;
+
+    return amd_iommu_mmio_buf_read(st, addr, 1);
+}
+
+static uint32_t amd_iommu_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    struct amd_iommu_state *st = opaque;
+
+    return amd_iommu_mmio_buf_read(st, addr, 2);
+}
+
+static uint32_t amd_iommu_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    struct amd_iommu_state *st = opaque;
+
+    return amd_iommu_mmio_buf_read(st, addr, 4);
+}
+
+static void amd_iommu_mmio_writeb(void *opaque,
+                                  target_phys_addr_t addr,
+                                  uint32_t val)
+{
+    struct amd_iommu_state *st = opaque;
+
+    amd_iommu_mmio_buf_write(st, addr, 1, val);
+    amd_iommu_update_mmio(st, addr);
+}
+
+static void amd_iommu_mmio_writew(void *opaque,
+                                  target_phys_addr_t addr,
+                                  uint32_t val)
+{
+    struct amd_iommu_state *st = opaque;
+
+    amd_iommu_mmio_buf_write(st, addr, 2, val);
+    amd_iommu_update_mmio(st, addr);
+}
+
+static void amd_iommu_mmio_writel(void *opaque,
+                                  target_phys_addr_t addr,
+                                  uint32_t val)
+{
+    struct amd_iommu_state *st = opaque;
+
+    amd_iommu_mmio_buf_write(st, addr, 4, val);
+    amd_iommu_update_mmio(st, addr);
+}
+
+static CPUReadMemoryFunc * const amd_iommu_mmio_read[] = {
+    amd_iommu_mmio_readb,
+    amd_iommu_mmio_readw,
+    amd_iommu_mmio_readl,
+};
+
+static CPUWriteMemoryFunc * const amd_iommu_mmio_write[] = {
+    amd_iommu_mmio_writeb,
+    amd_iommu_mmio_writew,
+    amd_iommu_mmio_writel,
+};
+
+static void amd_iommu_init_mmio(struct amd_iommu_state *st)
+{
+    st->mmio_buf[MMIO_CMDBUF_SIZE_BYTE] = MMIO_CMDBUF_DEFAULT_SIZE;
+    st->mmio_buf[MMIO_EVTLOG_SIZE_BYTE] = MMIO_EVTLOG_DEFAULT_SIZE;
+}
+
+static void amd_iommu_enable_mmio(struct amd_iommu_state *st)
+{
+    target_phys_addr_t addr;
+
+    st->mmio_index = cpu_register_io_memory(amd_iommu_mmio_read,
+                                            amd_iommu_mmio_write, st);
+    if (st->mmio_index < 0)
+        return;
+
+    addr = le64_to_cpu(*(uint64_t *) &st->capab[CAPAB_BAR_LOW]) & CAPAB_BAR_MASK;
+    cpu_register_physical_memory(addr, MMIO_SIZE, st->mmio_index);
+
+    st->mmio_addr = addr;
+    st->mmio_buf = qemu_mallocz(MMIO_SIZE);
+    st->mmio_enabled = 1;
+    amd_iommu_init_mmio(st);
+}
+
+static uint32_t amd_iommu_read_capab(PCIDevice *pci_dev,
+                                     uint32_t addr, int len)
+{
+    return pci_default_cap_read_config(pci_dev, addr, len);
+}
+
+static void amd_iommu_write_capab(PCIDevice *dev,
+                                  uint32_t addr, uint32_t val, int len)
+{
+    struct amd_iommu_state *st;
+    unsigned char *capab;
+    int reg;
+
+    st = DO_UPCAST(struct amd_iommu_state, dev, dev);
+    capab = st->capab;
+    reg = (addr - 0x40) & ~0x3;  /* Get the 32-bits register. */
+
+    switch (reg) {
+        case CAPAB_HEADER:
+        case CAPAB_MISC:
+            /* Read-only. */
+            return;
+        case CAPAB_BAR_LOW:
+        case CAPAB_BAR_HIGH:
+        case CAPAB_RANGE:
+            if (st->mmio_enabled)
+                return;
+            pci_default_cap_write_config(dev, addr, val, len);
+            break;
+        default:
+            return;
+    }
+
+    if (capab[CAPAB_BAR_LOW] & 0x1)
+        amd_iommu_enable_mmio(st);
+}
+
+static int amd_iommu_init_capab(PCIDevice *dev)
+{
+    struct amd_iommu_state *st;
+    unsigned char *capab;
+
+    st = DO_UPCAST(struct amd_iommu_state, dev, dev);
+    capab = st->dev.config + st->capab_offset;
+
+    capab[CAPAB_REV_TYPE]  = CAPAB_REV_TYPE;
+    capab[CAPAB_FLAGS]     = CAPAB_FLAGS;
+    capab[CAPAB_BAR_LOW]   = 0;
+    capab[CAPAB_BAR_HIGH]  = 0;
+    capab[CAPAB_RANGE]     = 0;
+    *((uint32_t *) &capab[CAPAB_MISC]) = cpu_to_le32(CAPAB_INIT_MISC);
+
+    st->capab = capab;
+    st->dev.cap.length = CAPAB_SIZE;
+
+    return 0;
+}
+
+static int amd_iommu_translate(PCIIOMMU *iommu,
+                               PCIDevice *dev,
+                               pci_addr_t addr,
+                               target_phys_addr_t *paddr,
+                               int *len,
+                               unsigned perms);
+
+static int amd_iommu_pci_initfn(PCIDevice *dev)
+{
+    struct amd_iommu_state *st;
+    PCIIOMMU *iommu;
+    int err;
+
+    st = DO_UPCAST(struct amd_iommu_state, dev, dev);
+
+    pci_config_set_vendor_id(st->dev.config, PCI_VENDOR_ID_AMD);
+    pci_config_set_device_id(st->dev.config, PCI_DEVICE_ID_AMD_IOMMU);
+    pci_config_set_class(st->dev.config, PCI_CLASS_SYSTEM_IOMMU);
+
+    st->capab_offset = pci_add_capability(&st->dev,
+                                          PCI_CAP_ID_SEC,
+                                          CAPAB_SIZE);
+    err = pci_enable_capability_support(&st->dev, st->capab_offset,
+                                        amd_iommu_read_capab,
+                                        amd_iommu_write_capab,
+                                        amd_iommu_init_capab);
+    if (err)
+        return err;
+
+    iommu = qemu_mallocz(sizeof(PCIIOMMU));
+    iommu->opaque = st;
+    iommu->translate = amd_iommu_translate;
+    iommu->register_iotlb_invalidator = amd_iommu_register_invalidator;
+    pci_register_iommu(dev, iommu);
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_amd_iommu = {
+    .name                       = "amd-iommu",
+    .version_id                 = 1,
+    .minimum_version_id         = 1,
+    .minimum_version_id_old     = 1,
+    .fields                     = (VMStateField []) {
+        VMSTATE_PCI_DEVICE(dev, struct amd_iommu_state),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static PCIDeviceInfo amd_iommu_pci_info = {
+    .qdev.name    = "amd-iommu",
+    .qdev.desc    = "AMD IOMMU",
+    .qdev.size    = sizeof(struct amd_iommu_state),
+    .qdev.vmsd    = &vmstate_amd_iommu,
+    .init         = amd_iommu_pci_initfn,
+};
+
+void amd_iommu_init(PCIBus *bus)
+{
+    pci_create_simple(bus, -1, "amd-iommu");
+}
+
+static void amd_iommu_register(void)
+{
+    pci_qdev_register(&amd_iommu_pci_info);
+}
+
+device_init(amd_iommu_register);
+
+static void amd_iommu_page_fault(struct amd_iommu_state *st,
+                                 int devfn,
+                                 unsigned domid,
+                                 target_phys_addr_t addr,
+                                 int present,
+                                 int is_write)
+{
+    uint16_t entry[8];
+    uint64_t *entry_addr = (uint64_t *) &entry[4];
+
+    entry[0] = cpu_to_le16(devfn);
+    entry[1] = 0;
+    entry[2] = cpu_to_le16(domid);
+    entry[3] = (2UL << 12) | (!!present << 4) | (!!is_write << 5);
+    *entry_addr = cpu_to_le64(addr);
+
+    cpu_physical_memory_write((target_phys_addr_t) st->evtlog + st->evtlog_tail, (uint8_t *) &entry, 128);
+    st->evtlog_tail += 128;
+}
+
+static inline uint64_t amd_iommu_get_perms(uint64_t entry)
+{
+    return (entry & (DEV_PERM_READ | DEV_PERM_WRITE)) >> DEV_PERM_SHIFT;
+}
+
+static int amd_iommu_translate(PCIIOMMU *iommu,
+                               PCIDevice *dev,
+                               pci_addr_t addr,
+                               target_phys_addr_t *paddr,
+                               int *len,
+                               unsigned perms)
+{
+    int devfn, present;
+    target_phys_addr_t entry_addr, pte_addr;
+    uint64_t entry[4], pte, page_offset, pte_perms;
+    unsigned level, domid;
+    struct amd_iommu_state *st = iommu->opaque;
+
+    if (!st->enabled)
+        goto no_translation;
+
+    /* Get device table entry. */
+    devfn = dev->devfn;
+    entry_addr = st->devtab + devfn * DEVTAB_ENTRY_SIZE;
+    cpu_physical_memory_read(entry_addr, (uint8_t *) entry, 32);
+
+    pte = entry[0];
+    if (!(pte & DEV_VALID) || !(pte & DEV_TRANSLATION_VALID)) {
+        goto no_translation;
+    }
+    domid = entry[1] & DEV_DOMAIN_ID_MASK;
+    level = (pte >> DEV_MODE_RSHIFT) & DEV_MODE_MASK;
+    while (level > 0) {
+        /*
+         * Check permissions: the bitwise
+         * implication perms -> entry_perms must be true.
+         */
+        pte_perms = amd_iommu_get_perms(pte);
+        present = pte & 1;
+        if (!present || perms != (perms & pte_perms)) {
+            amd_iommu_page_fault(st, devfn, domid, addr,
+                                 present, !!(perms & IOMMU_PERM_WRITE));
+            return -EPERM;
+        }
+
+        /* Go to the next lower level. */
+        pte_addr = pte & DEV_PT_ROOT_MASK;
+        pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
+        pte = ldq_phys(pte_addr);
+        level = (pte >> DEV_MODE_RSHIFT) & DEV_MODE_MASK;
+    }
+    page_offset = addr & 4095;
+    *paddr = (pte & DEV_PT_ROOT_MASK) + page_offset;
+    *len = 4096 - page_offset;
+
+    return 0;
+
+no_translation:
+    *paddr = addr;
+    *len = INT_MAX;
+    return 0;
+}
diff --git a/hw/pc.c b/hw/pc.c
index 186e322..4c929f9 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -1066,6 +1066,10 @@  void pc_pci_device_init(PCIBus *pci_bus)
     int max_bus;
     int bus;
 
+#ifdef CONFIG_AMD_IOMMU
+    amd_iommu_init(pci_bus);
+#endif
+
     max_bus = drive_get_max_bus(IF_SCSI);
     for (bus = 0; bus <= max_bus; bus++) {
         pci_create_simple(pci_bus, -1, "lsi53c895a");
diff --git a/hw/pc.h b/hw/pc.h
index 3ef2f75..255ad93 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -191,4 +191,7 @@  void extboot_init(BlockDriverState *bs);
 
 int e820_add_entry(uint64_t, uint64_t, uint32_t);
 
+/* amd_iommu.c */
+void amd_iommu_init(PCIBus *bus);
+
 #endif
diff --git a/hw/pci_ids.h b/hw/pci_ids.h
index 39e9f1d..d790312 100644
--- a/hw/pci_ids.h
+++ b/hw/pci_ids.h
@@ -26,6 +26,7 @@ 
 
 #define PCI_CLASS_MEMORY_RAM             0x0500
 
+#define PCI_CLASS_SYSTEM_IOMMU           0x0806
 #define PCI_CLASS_SYSTEM_OTHER           0x0880
 
 #define PCI_CLASS_SERIAL_USB             0x0c03
@@ -56,6 +57,7 @@ 
 
 #define PCI_VENDOR_ID_AMD                0x1022
 #define PCI_DEVICE_ID_AMD_LANCE          0x2000
+#define PCI_DEVICE_ID_AMD_IOMMU          0x0000     /* FIXME */
 
 #define PCI_VENDOR_ID_MOTOROLA           0x1057
 #define PCI_DEVICE_ID_MOTOROLA_MPC106    0x0002
diff --git a/hw/pci_regs.h b/hw/pci_regs.h
index 1c675dc..6399b5d 100644
--- a/hw/pci_regs.h
+++ b/hw/pci_regs.h
@@ -216,6 +216,7 @@ 
 #define  PCI_CAP_ID_SHPC 	0x0C	/* PCI Standard Hot-Plug Controller */
 #define  PCI_CAP_ID_SSVID	0x0D	/* Bridge subsystem vendor/device ID */
 #define  PCI_CAP_ID_AGP3	0x0E	/* AGP Target PCI-PCI bridge */
+#define  PCI_CAP_ID_SEC     0x0F    /* Secure Device (AMD IOMMU) */
 #define  PCI_CAP_ID_EXP 	0x10	/* PCI Express */
 #define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
 #define  PCI_CAP_ID_AF		0x13	/* PCI Advanced Features */