diff mbox series

[v2,5/8] x86_iommu/amd: Add interrupt remap support when VAPIC is not enabled

Message ID 1536949623-23564-6-git-send-email-brijesh.singh@amd.com
State New
Headers show
Series x86_iommu/amd: add interrupt remap support | expand

Commit Message

Brijesh Singh Sept. 14, 2018, 6:27 p.m. UTC
Emulate the interrupt remapping support when guest virtual APIC is
not enabled.

For more info Refer: AMD IOMMU spec Rev 3.0 - section 2.2.5.1

When VAPIC is not enabled, it uses interrupt remapping as defined in
Table 20 and Figure 15 from IOMMU spec.

Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Cc: Tom Lendacky <Thomas.Lendacky@amd.com>
Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 hw/i386/amd_iommu.c  | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/i386/amd_iommu.h  |  46 ++++++++++++-
 hw/i386/trace-events |   7 ++
 3 files changed, 240 insertions(+), 2 deletions(-)

Comments

Peter Xu Sept. 17, 2018, 5:52 a.m. UTC | #1
On Fri, Sep 14, 2018 at 01:27:00PM -0500, Brijesh Singh wrote:
> Emulate the interrupt remapping support when guest virtual APIC is
> not enabled.
> 
> For more info Refer: AMD IOMMU spec Rev 3.0 - section 2.2.5.1
> 
> When VAPIC is not enabled, it uses interrupt remapping as defined in
> Table 20 and Figure 15 from IOMMU spec.

(feel free to cc me in your next post)

> 
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Richard Henderson <rth@twiddle.net>
> Cc: Eduardo Habkost <ehabkost@redhat.com>
> Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> Cc: Tom Lendacky <Thomas.Lendacky@amd.com>
> Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> ---
>  hw/i386/amd_iommu.c  | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/i386/amd_iommu.h  |  46 ++++++++++++-
>  hw/i386/trace-events |   7 ++
>  3 files changed, 240 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
> index b15962b..9c8e4de 100644
> --- a/hw/i386/amd_iommu.c
> +++ b/hw/i386/amd_iommu.c
> @@ -28,6 +28,8 @@
>  #include "qemu/error-report.h"
>  #include "hw/i386/apic_internal.h"
>  #include "trace.h"
> +#include "cpu.h"
> +#include "hw/i386/apic-msidef.h"
>  
>  /* used AMD-Vi MMIO registers */
>  const char *amdvi_mmio_low[] = {
> @@ -1029,17 +1031,144 @@ static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
>      return ret;
>  }
>  
> +static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
> +                          union irte *irte, uint16_t devid)
> +{
> +    uint64_t irte_root, offset;
> +
> +    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;

(I'll have similar endianess question like previous patch, but I'll
 stop looking at those since I'll need to know whether you plan to
 support that first...)

> +    offset = (origin->data & AMDVI_IRTE_OFFSET) << 2;
> +
> +    trace_amdvi_ir_irte(irte_root, offset);
> +
> +    if (dma_memory_read(&address_space_memory, irte_root + offset,
> +                        irte, sizeof(*irte))) {
> +        trace_amdvi_ir_err("failed to get irte");
> +        return -AMDVI_IR_GET_IRTE;
> +    }
> +
> +    trace_amdvi_ir_irte_val(irte->val);
> +
> +    return 0;
> +}
> +
> +static int amdvi_int_remap_legacy(AMDVIState *iommu,
> +                                  MSIMessage *origin,
> +                                  MSIMessage *translated,
> +                                  uint64_t *dte,
> +                                  X86IOMMUIrq *irq,
> +                                  uint16_t sid)
> +{
> +    int ret;
> +    union irte irte;
> +
> +    /* get interrupt remapping table */
> +    ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    if (!irte.fields.valid) {
> +        trace_amdvi_ir_target_abort("RemapEn is disabled");

Note that recently QEMU introduced error_report_once().  Feel free to
switch to that if you want.  Many of the VT-d emulation code switched
to that to make sure errors like this will be dumped at least once and
we're also free from Dos attack.  This should at least apply to all of
your below trace_amdvi_ir_target_abort() calls, even some other traces.

> +        return -AMDVI_IR_TARGET_ABORT;
> +    }
> +
> +    if (irte.fields.guest_mode) {
> +        trace_amdvi_ir_target_abort("guest mode is not zero");
> +        return -AMDVI_IR_TARGET_ABORT;
> +    }
> +
> +    if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
> +        trace_amdvi_ir_target_abort("reserved int_type");
> +        return -AMDVI_IR_TARGET_ABORT;
> +    }
> +
> +    irq->delivery_mode = irte.fields.int_type;
> +    irq->vector = irte.fields.vector;
> +    irq->dest_mode = irte.fields.dm;
> +    irq->redir_hint = irte.fields.rq_eoi;
> +    irq->dest = irte.fields.destination;
> +
> +    return 0;
> +}
> +
> +static int __amdvi_int_remap_msi(AMDVIState *iommu,
> +                                 MSIMessage *origin,
> +                                 MSIMessage *translated,
> +                                 uint64_t *dte,
> +                                 X86IOMMUIrq *irq,
> +                                 uint16_t sid)
> +{
> +    uint8_t int_ctl;
> +
> +    int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3;
> +    trace_amdvi_ir_intctl(int_ctl);
> +
> +    switch (int_ctl) {
> +    case AMDVI_IR_INTCTL_PASS:
> +        memcpy(translated, origin, sizeof(*origin));
> +        return 0;
> +    case AMDVI_IR_INTCTL_REMAP:
> +        break;
> +    case AMDVI_IR_INTCTL_ABORT:
> +        trace_amdvi_ir_target_abort("int_ctl abort");
> +        return -AMDVI_IR_TARGET_ABORT;
> +    default:
> +        trace_amdvi_ir_target_abort("int_ctl reserved");
> +        return -AMDVI_IR_TARGET_ABORT;
> +    }
> +
> +    return amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid);
> +}
> +
> +static bool amdvi_validate_int_reamp(AMDVIState *s, uint64_t *dte)

s/reamp/remap/

> +{
> +    /* Check if IR is enabled in DTE */
> +    if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) {
> +        return false;
> +    }
> +
> +    /* validate that we are configure with intremap=on */
> +    if (!s->intr_enabled) {
> +        error_report("Interrupt remapping is enabled in the guest but "
> +                     "not in the host. Use intremap=on to enable interrupt "
> +                     "remapping in amd-iommu.");
> +        exit(1);

This will crash QEMU.  IMHO this is not a configuration error but a
guest kernel error.  We'd better not crash QEMU for that.  Wny not
just report the error upwards.

> +    }
> +
> +    return true;
> +}
> +
>  /* Interrupt remapping for MSI/MSI-X entry */
>  static int amdvi_int_remap_msi(AMDVIState *iommu,
>                                 MSIMessage *origin,
>                                 MSIMessage *translated,
>                                 uint16_t sid)
>  {
> +    int ret = 0;
> +    uint64_t pass = 0;
> +    uint64_t dte[4] = { 0 };
> +    X86IOMMUIrq irq = { 0 };
> +    uint8_t dest_mode, delivery_mode;
> +
>      assert(origin && translated);
>  
> +    /*
> +     * When IOMMU is enabled, interrupt remap request will come either from
> +     * IO-APIC or PCI device. If interrupt is from PCI device then it will
> +     * have a valid requester id but if the interrupt is from IO-APIC
> +     * then requester id will be invalid.
> +     */
> +    if (sid == X86_IOMMU_SID_INVALID) {
> +        sid = AMDVI_IOAPIC_SB_DEVID;
> +    }
> +
>      trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);
>  
> -    if (!iommu || !iommu->intr_enabled) {
> +    /* verify that interrupt remapping is enabled before going further. */
> +    if (!iommu ||
> +        !amdvi_get_dte(iommu, sid, dte)  ||
> +        !amdvi_validate_int_reamp(iommu, dte)) {
>          memcpy(translated, origin, sizeof(*origin));
>          goto out;
>      }
> @@ -1055,10 +1184,68 @@ static int amdvi_int_remap_msi(AMDVIState *iommu,
>          return -AMDVI_IR_ERR;
>      }
>  
> +    delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;

IIRC you mentioned that you'll paste some field definition here, or
which figure/table we should refer to.  But here I still see no
comment, and I don't know where defined it.  Again, Figure 14, bits
10:8 are part of index there.

> +
> +    switch (delivery_mode) {
> +    case AMDVI_IOAPIC_INT_TYPE_FIXED:
> +    case AMDVI_IOAPIC_INT_TYPE_ARBITRATED:
> +        trace_amdvi_ir_delivery_mode("fixed/arbitrated");
> +        ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid);
> +        if (ret < 0) {
> +            goto remap_fail;
> +        } else {
> +            /* Translate IRQ to MSI messages */
> +            x86_iommu_irq_to_msi_message(&irq, translated);
> +            goto out;
> +        }
> +        break;
> +    case AMDVI_IOAPIC_INT_TYPE_SMI:
> +        error_report("SMI is not supported!");
> +        ret = -AMDVI_IR_ERR;

Is it intended to not have the break?

> +    case AMDVI_IOAPIC_INT_TYPE_NMI:
> +        pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
> +        trace_amdvi_ir_delivery_mode("nmi");
> +        break;
> +    case AMDVI_IOAPIC_INT_TYPE_INIT:
> +        pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
> +        trace_amdvi_ir_delivery_mode("init");
> +        break;
> +    case AMDVI_IOAPIC_INT_TYPE_EINT:
> +        pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
> +        trace_amdvi_ir_delivery_mode("eint");
> +        break;
> +    default:
> +        trace_amdvi_ir_delivery_mode("unsupported delivery_mode");
> +        ret = -AMDVI_IR_ERR;
> +        break;
> +    }
> +
> +    if (ret < 0) {
> +        goto remap_fail;
> +    }
> +
> +    /* dest_mode 1 is valid for fixed and arbitrated interrupts only */
> +    dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
> +    if (dest_mode) {
> +        trace_amdvi_ir_err("invalid dest_mode");
> +        goto remap_fail;
> +    }
> +
> +    if (pass) {
> +        memcpy(translated, origin, sizeof(*origin));
> +    } else {
> +        /* pass through is not enabled */
> +        trace_amdvi_ir_err("passthrough is not enabled");
> +        goto remap_fail;
> +    }
> +
>  out:
>      trace_amdvi_ir_remap_msi(origin->address, origin->data,
>                               translated->address, translated->data);
>      return 0;
> +
> +remap_fail:
> +    return -AMDVI_IR_TARGET_ABORT;
>  }
>  
>  static int amdvi_int_remap(X86IOMMUState *iommu,
> diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
> index 71ff3c1..bd27cd2 100644
> --- a/hw/i386/amd_iommu.h
> +++ b/hw/i386/amd_iommu.h
> @@ -217,7 +217,51 @@
>  
>  /* Interrupt remapping errors */
>  #define AMDVI_IR_ERR            0x1
> -
> +#define AMDVI_IR_GET_IRTE       0x2
> +#define AMDVI_IR_TARGET_ABORT   0x3
> +
> +/* Interrupt remapping */
> +#define AMDVI_IR_REMAP_ENABLE           1ULL
> +#define AMDVI_IR_INTCTL_SHIFT           60
> +#define AMDVI_IR_INTCTL_ABORT           0
> +#define AMDVI_IR_INTCTL_PASS            1
> +#define AMDVI_IR_INTCTL_REMAP           2
> +
> +#define AMDVI_IR_PHYS_ADDR_MASK         (((1ULL << 45) - 1) << 6)
> +
> +/* MSI data 10:0 bits (section 2.2.5.1 Fig 14) */
> +#define AMDVI_IRTE_OFFSET               0x7ff
> +
> +/* Delivery mode of MSI data (same as IOAPIC deilver mode encoding) */
> +#define AMDVI_IOAPIC_INT_TYPE_FIXED          0x0
> +#define AMDVI_IOAPIC_INT_TYPE_ARBITRATED     0x1
> +#define AMDVI_IOAPIC_INT_TYPE_SMI            0x2
> +#define AMDVI_IOAPIC_INT_TYPE_NMI            0x4
> +#define AMDVI_IOAPIC_INT_TYPE_INIT           0x5
> +#define AMDVI_IOAPIC_INT_TYPE_EINT           0x7
> +
> +/* Pass through interrupt */
> +#define AMDVI_DEV_INT_PASS_MASK         (1UL << 56)
> +#define AMDVI_DEV_EINT_PASS_MASK        (1UL << 57)
> +#define AMDVI_DEV_NMI_PASS_MASK         (1UL << 58)
> +#define AMDVI_DEV_LINT0_PASS_MASK       (1UL << 62)
> +#define AMDVI_DEV_LINT1_PASS_MASK       (1UL << 63)
> +
> +/* Interrupt remapping table fields (Guest VAPIC not enabled) */
> +union irte {
> +    uint32_t val;
> +    struct {
> +        uint32_t valid:1,
> +                 no_fault:1,
> +                 int_type:3,
> +                 rq_eoi:1,
> +                 dm:1,
> +                 guest_mode:1,
> +                 destination:8,
> +                 vector:8,
> +                 rsvd:8;
> +    } fields;
> +};
>  
>  #define TYPE_AMD_IOMMU_DEVICE "amd-iommu"
>  #define AMD_IOMMU_DEVICE(obj)\
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index 41d533c..98150c9 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -106,6 +106,13 @@ amdvi_mem_ir_write(uint64_t addr, uint64_t val) "addr 0x%"PRIx64" data 0x%"PRIx6
>  amdvi_ir_remap_msi_req(uint64_t addr, uint64_t data, uint8_t devid) "addr 0x%"PRIx64" data 0x%"PRIx64" devid 0x%"PRIx8
>  amdvi_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")"
>  amdvi_err(const char *str) "%s"
> +amdvi_ir_irte(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" offset 0x%"PRIx64
> +amdvi_ir_irte_val(uint32_t data) "data 0x%"PRIx32
> +amdvi_ir_err(const char *str) "%s"
> +amdvi_ir_intctl(uint8_t val) "int_ctl 0x%"PRIx8
> +amdvi_ir_target_abort(const char *str) "%s"
> +amdvi_ir_delivery_mode(const char *str) "%s"
> +amdvi_ir_generate_msi_message(uint8_t vector, uint8_t delivery_mode, uint8_t dest_mode, uint8_t dest, uint8_t rh) "vector %d delivery-mode %d dest-mode %d dest-id %d rh %d"
>  
>  # hw/i386/vmport.c
>  vmport_register(unsigned char command, void *func, void *opaque) "command: 0x%02x func: %p opaque: %p"
> -- 
> 2.7.4
> 
> 

Regards,
Brijesh Singh Sept. 17, 2018, 3:09 p.m. UTC | #2
On 09/17/2018 12:52 AM, Peter Xu wrote:
> On Fri, Sep 14, 2018 at 01:27:00PM -0500, Brijesh Singh wrote:
>> Emulate the interrupt remapping support when guest virtual APIC is
>> not enabled.
>>
>> For more info Refer: AMD IOMMU spec Rev 3.0 - section 2.2.5.1
>>
>> When VAPIC is not enabled, it uses interrupt remapping as defined in
>> Table 20 and Figure 15 from IOMMU spec.
> 
> (feel free to cc me in your next post)


Will do so


> 
>>
>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Richard Henderson <rth@twiddle.net>
>> Cc: Eduardo Habkost <ehabkost@redhat.com>
>> Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
>> Cc: Tom Lendacky <Thomas.Lendacky@amd.com>
>> Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
>> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
>> ---
>>   hw/i386/amd_iommu.c  | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   hw/i386/amd_iommu.h  |  46 ++++++++++++-
>>   hw/i386/trace-events |   7 ++
>>   3 files changed, 240 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
>> index b15962b..9c8e4de 100644
>> --- a/hw/i386/amd_iommu.c
>> +++ b/hw/i386/amd_iommu.c
>> @@ -28,6 +28,8 @@
>>   #include "qemu/error-report.h"
>>   #include "hw/i386/apic_internal.h"
>>   #include "trace.h"
>> +#include "cpu.h"
>> +#include "hw/i386/apic-msidef.h"
>>   
>>   /* used AMD-Vi MMIO registers */
>>   const char *amdvi_mmio_low[] = {
>> @@ -1029,17 +1031,144 @@ static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
>>       return ret;
>>   }
>>   
>> +static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
>> +                          union irte *irte, uint16_t devid)
>> +{
>> +    uint64_t irte_root, offset;
>> +
>> +    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
> 
> (I'll have similar endianess question like previous patch, but I'll
>   stop looking at those since I'll need to know whether you plan to
>   support that first...)


I am not sure if we really need to be concern about big-endian
support in this series. If we need big-endian support then lets
work on new series to fix the endianess.


> 
>> +    offset = (origin->data & AMDVI_IRTE_OFFSET) << 2;
>> +
>> +    trace_amdvi_ir_irte(irte_root, offset);
>> +
>> +    if (dma_memory_read(&address_space_memory, irte_root + offset,
>> +                        irte, sizeof(*irte))) {
>> +        trace_amdvi_ir_err("failed to get irte");
>> +        return -AMDVI_IR_GET_IRTE;
>> +    }
>> +
>> +    trace_amdvi_ir_irte_val(irte->val);
>> +
>> +    return 0;
>> +}
>> +
>> +static int amdvi_int_remap_legacy(AMDVIState *iommu,
>> +                                  MSIMessage *origin,
>> +                                  MSIMessage *translated,
>> +                                  uint64_t *dte,
>> +                                  X86IOMMUIrq *irq,
>> +                                  uint16_t sid)
>> +{
>> +    int ret;
>> +    union irte irte;
>> +
>> +    /* get interrupt remapping table */
>> +    ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    if (!irte.fields.valid) {
>> +        trace_amdvi_ir_target_abort("RemapEn is disabled");
> 
> Note that recently QEMU introduced error_report_once().  Feel free to
> switch to that if you want.  Many of the VT-d emulation code switched
> to that to make sure errors like this will be dumped at least once and
> we're also free from Dos attack.  This should at least apply to all of
> your below trace_amdvi_ir_target_abort() calls, even some other traces.
> 


IMHO we should not be using error_report_once() here. It's possible that
guest OS have DTE[IV]=1 but has not programmed the interrupt
remapping entries or have deactivated the remapping. I see that Linux
OS does it all the time and in those cases we will be printing out
error messages - which will give a wrong information.



>> +        return -AMDVI_IR_TARGET_ABORT;
>> +    }
>> +
>> +    if (irte.fields.guest_mode) {
>> +        trace_amdvi_ir_target_abort("guest mode is not zero");
>> +        return -AMDVI_IR_TARGET_ABORT;
>> +    }
>> +
>> +    if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
>> +        trace_amdvi_ir_target_abort("reserved int_type");
>> +        return -AMDVI_IR_TARGET_ABORT;
>> +    }
>> +
>> +    irq->delivery_mode = irte.fields.int_type;
>> +    irq->vector = irte.fields.vector;
>> +    irq->dest_mode = irte.fields.dm;
>> +    irq->redir_hint = irte.fields.rq_eoi;
>> +    irq->dest = irte.fields.destination;
>> +
>> +    return 0;
>> +}
>> +
>> +static int __amdvi_int_remap_msi(AMDVIState *iommu,
>> +                                 MSIMessage *origin,
>> +                                 MSIMessage *translated,
>> +                                 uint64_t *dte,
>> +                                 X86IOMMUIrq *irq,
>> +                                 uint16_t sid)
>> +{
>> +    uint8_t int_ctl;
>> +
>> +    int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3;
>> +    trace_amdvi_ir_intctl(int_ctl);
>> +
>> +    switch (int_ctl) {
>> +    case AMDVI_IR_INTCTL_PASS:
>> +        memcpy(translated, origin, sizeof(*origin));
>> +        return 0;
>> +    case AMDVI_IR_INTCTL_REMAP:
>> +        break;
>> +    case AMDVI_IR_INTCTL_ABORT:
>> +        trace_amdvi_ir_target_abort("int_ctl abort");
>> +        return -AMDVI_IR_TARGET_ABORT;
>> +    default:
>> +        trace_amdvi_ir_target_abort("int_ctl reserved");
>> +        return -AMDVI_IR_TARGET_ABORT;
>> +    }
>> +
>> +    return amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid);
>> +}
>> +
>> +static bool amdvi_validate_int_reamp(AMDVIState *s, uint64_t *dte)
> 
> s/reamp/remap/
> 

Will fix it in next rev. thanks


>> +{
>> +    /* Check if IR is enabled in DTE */
>> +    if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) {
>> +        return false;
>> +    }
>> +
>> +    /* validate that we are configure with intremap=on */
>> +    if (!s->intr_enabled) {
>> +        error_report("Interrupt remapping is enabled in the guest but "
>> +                     "not in the host. Use intremap=on to enable interrupt "
>> +                     "remapping in amd-iommu.");
>> +        exit(1);
> 
> This will crash QEMU.  IMHO this is not a configuration error but a
> guest kernel error.  We'd better not crash QEMU for that.  Wny not
> just report the error upwards.
> 


Honestly, I did struggled whether to use "exit" or error. Actually this
is a configuration error.

AMD IOMMU does not have an explicit bit to indicate whether the IR
feature is supported. All versions of IOMMU supports IR. But in QEMU
we are using 'intermap' to enable/disable the feature. In case of AMD 
IOMMU, we will not be able to link intremap to control the exposure of
this feature to the guest. So we need to do some checks during the
runtime. If we get asked to remap the interrupts (when intremap=off)
then IMHO its a configuration error and we should kill the guest.
Having said so, I am flexible and if needed then I can remove the
exit from next patch.


(btw, patch 6 is Linux specific quirk)


>> +    }
>> +
>> +    return true;
>> +}
>> +
>>   /* Interrupt remapping for MSI/MSI-X entry */
>>   static int amdvi_int_remap_msi(AMDVIState *iommu,
>>                                  MSIMessage *origin,
>>                                  MSIMessage *translated,
>>                                  uint16_t sid)
>>   {
>> +    int ret = 0;
>> +    uint64_t pass = 0;
>> +    uint64_t dte[4] = { 0 };
>> +    X86IOMMUIrq irq = { 0 };
>> +    uint8_t dest_mode, delivery_mode;
>> +
>>       assert(origin && translated);
>>   
>> +    /*
>> +     * When IOMMU is enabled, interrupt remap request will come either from
>> +     * IO-APIC or PCI device. If interrupt is from PCI device then it will
>> +     * have a valid requester id but if the interrupt is from IO-APIC
>> +     * then requester id will be invalid.
>> +     */
>> +    if (sid == X86_IOMMU_SID_INVALID) {
>> +        sid = AMDVI_IOAPIC_SB_DEVID;
>> +    }
>> +
>>       trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);
>>   
>> -    if (!iommu || !iommu->intr_enabled) {
>> +    /* verify that interrupt remapping is enabled before going further. */
>> +    if (!iommu ||
>> +        !amdvi_get_dte(iommu, sid, dte)  ||
>> +        !amdvi_validate_int_reamp(iommu, dte)) {
>>           memcpy(translated, origin, sizeof(*origin));
>>           goto out;
>>       }
>> @@ -1055,10 +1184,68 @@ static int amdvi_int_remap_msi(AMDVIState *iommu,
>>           return -AMDVI_IR_ERR;
>>       }
>>   
>> +    delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;
> 
> IIRC you mentioned that you'll paste some field definition here, or
> which figure/table we should refer to.  But here I still see no
> comment, and I don't know where defined it.  Again, Figure 14, bits
> 10:8 are part of index there.


This time used the pre-defined macro's from the apic-msidef.h hence I
thought there was no need to include the field definitions. But, I will
go ahead and include the field definition in the next rev.

* The delivery_mode field from the MSI data is used to get the upstream
   interrupt type.
* If the interrupt type is Fixed/Arbitrated, MSI data 10:0 is used
   as an offset within the IRTE table to get the actual IRTE entry
   (Fig 14).

Please note that we don't emulate the HyperTransport hence instead we
should not treat MSI data 10:8 as index instead the MSI data 10:0 should
be used as "offset" within the IRTE table to get the interrupt remapping
table entries.


> 
>> +
>> +    switch (delivery_mode) {
>> +    case AMDVI_IOAPIC_INT_TYPE_FIXED:
>> +    case AMDVI_IOAPIC_INT_TYPE_ARBITRATED:
>> +        trace_amdvi_ir_delivery_mode("fixed/arbitrated");
>> +        ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid);
>> +        if (ret < 0) {
>> +            goto remap_fail;
>> +        } else {
>> +            /* Translate IRQ to MSI messages */
>> +            x86_iommu_irq_to_msi_message(&irq, translated);
>> +            goto out;
>> +        }
>> +        break;
>> +    case AMDVI_IOAPIC_INT_TYPE_SMI:
>> +        error_report("SMI is not supported!");
>> +        ret = -AMDVI_IR_ERR;
> 
> Is it intended to not have the break?


Ouch, it should have a break. thanks



> 
>> +    case AMDVI_IOAPIC_INT_TYPE_NMI:
>> +        pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
>> +        trace_amdvi_ir_delivery_mode("nmi");
>> +        break;
>> +    case AMDVI_IOAPIC_INT_TYPE_INIT:
>> +        pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
>> +        trace_amdvi_ir_delivery_mode("init");
>> +        break;
>> +    case AMDVI_IOAPIC_INT_TYPE_EINT:
>> +        pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
>> +        trace_amdvi_ir_delivery_mode("eint");
>> +        break;
>> +    default:
>> +        trace_amdvi_ir_delivery_mode("unsupported delivery_mode");
>> +        ret = -AMDVI_IR_ERR;
>> +        break;
>> +    }
>> +
>> +    if (ret < 0) {
>> +        goto remap_fail;
>> +    }
>> +
>> +    /* dest_mode 1 is valid for fixed and arbitrated interrupts only */
>> +    dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
>> +    if (dest_mode) {
>> +        trace_amdvi_ir_err("invalid dest_mode");
>> +        goto remap_fail;
>> +    }
>> +
>> +    if (pass) {
>> +        memcpy(translated, origin, sizeof(*origin));
>> +    } else {
>> +        /* pass through is not enabled */
>> +        trace_amdvi_ir_err("passthrough is not enabled");
>> +        goto remap_fail;
>> +    }
>> +
>>   out:
>>       trace_amdvi_ir_remap_msi(origin->address, origin->data,
>>                                translated->address, translated->data);
>>       return 0;
>> +
>> +remap_fail:
>> +    return -AMDVI_IR_TARGET_ABORT;
>>   }
>>   
>>   static int amdvi_int_remap(X86IOMMUState *iommu,
>> diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
>> index 71ff3c1..bd27cd2 100644
>> --- a/hw/i386/amd_iommu.h
>> +++ b/hw/i386/amd_iommu.h
>> @@ -217,7 +217,51 @@
>>   
>>   /* Interrupt remapping errors */
>>   #define AMDVI_IR_ERR            0x1
>> -
>> +#define AMDVI_IR_GET_IRTE       0x2
>> +#define AMDVI_IR_TARGET_ABORT   0x3
>> +
>> +/* Interrupt remapping */
>> +#define AMDVI_IR_REMAP_ENABLE           1ULL
>> +#define AMDVI_IR_INTCTL_SHIFT           60
>> +#define AMDVI_IR_INTCTL_ABORT           0
>> +#define AMDVI_IR_INTCTL_PASS            1
>> +#define AMDVI_IR_INTCTL_REMAP           2
>> +
>> +#define AMDVI_IR_PHYS_ADDR_MASK         (((1ULL << 45) - 1) << 6)
>> +
>> +/* MSI data 10:0 bits (section 2.2.5.1 Fig 14) */
>> +#define AMDVI_IRTE_OFFSET               0x7ff
>> +
>> +/* Delivery mode of MSI data (same as IOAPIC deilver mode encoding) */
>> +#define AMDVI_IOAPIC_INT_TYPE_FIXED          0x0
>> +#define AMDVI_IOAPIC_INT_TYPE_ARBITRATED     0x1
>> +#define AMDVI_IOAPIC_INT_TYPE_SMI            0x2
>> +#define AMDVI_IOAPIC_INT_TYPE_NMI            0x4
>> +#define AMDVI_IOAPIC_INT_TYPE_INIT           0x5
>> +#define AMDVI_IOAPIC_INT_TYPE_EINT           0x7
>> +
>> +/* Pass through interrupt */
>> +#define AMDVI_DEV_INT_PASS_MASK         (1UL << 56)
>> +#define AMDVI_DEV_EINT_PASS_MASK        (1UL << 57)
>> +#define AMDVI_DEV_NMI_PASS_MASK         (1UL << 58)
>> +#define AMDVI_DEV_LINT0_PASS_MASK       (1UL << 62)
>> +#define AMDVI_DEV_LINT1_PASS_MASK       (1UL << 63)
>> +
>> +/* Interrupt remapping table fields (Guest VAPIC not enabled) */
>> +union irte {
>> +    uint32_t val;
>> +    struct {
>> +        uint32_t valid:1,
>> +                 no_fault:1,
>> +                 int_type:3,
>> +                 rq_eoi:1,
>> +                 dm:1,
>> +                 guest_mode:1,
>> +                 destination:8,
>> +                 vector:8,
>> +                 rsvd:8;
>> +    } fields;
>> +};
>>   
>>   #define TYPE_AMD_IOMMU_DEVICE "amd-iommu"
>>   #define AMD_IOMMU_DEVICE(obj)\
>> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
>> index 41d533c..98150c9 100644
>> --- a/hw/i386/trace-events
>> +++ b/hw/i386/trace-events
>> @@ -106,6 +106,13 @@ amdvi_mem_ir_write(uint64_t addr, uint64_t val) "addr 0x%"PRIx64" data 0x%"PRIx6
>>   amdvi_ir_remap_msi_req(uint64_t addr, uint64_t data, uint8_t devid) "addr 0x%"PRIx64" data 0x%"PRIx64" devid 0x%"PRIx8
>>   amdvi_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")"
>>   amdvi_err(const char *str) "%s"
>> +amdvi_ir_irte(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" offset 0x%"PRIx64
>> +amdvi_ir_irte_val(uint32_t data) "data 0x%"PRIx32
>> +amdvi_ir_err(const char *str) "%s"
>> +amdvi_ir_intctl(uint8_t val) "int_ctl 0x%"PRIx8
>> +amdvi_ir_target_abort(const char *str) "%s"
>> +amdvi_ir_delivery_mode(const char *str) "%s"
>> +amdvi_ir_generate_msi_message(uint8_t vector, uint8_t delivery_mode, uint8_t dest_mode, uint8_t dest, uint8_t rh) "vector %d delivery-mode %d dest-mode %d dest-id %d rh %d"
>>   
>>   # hw/i386/vmport.c
>>   vmport_register(unsigned char command, void *func, void *opaque) "command: 0x%02x func: %p opaque: %p"
>> -- 
>> 2.7.4
>>
>>
> 
> Regards,
>
Peter Xu Sept. 18, 2018, 3:53 a.m. UTC | #3
On Mon, Sep 17, 2018 at 10:09:33AM -0500, Brijesh Singh wrote:

[...]

> > > +static int amdvi_int_remap_legacy(AMDVIState *iommu,
> > > +                                  MSIMessage *origin,
> > > +                                  MSIMessage *translated,
> > > +                                  uint64_t *dte,
> > > +                                  X86IOMMUIrq *irq,
> > > +                                  uint16_t sid)
> > > +{
> > > +    int ret;
> > > +    union irte irte;
> > > +
> > > +    /* get interrupt remapping table */
> > > +    ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
> > > +    if (ret < 0) {
> > > +        return ret;
> > > +    }
> > > +
> > > +    if (!irte.fields.valid) {
> > > +        trace_amdvi_ir_target_abort("RemapEn is disabled");
> > 
> > Note that recently QEMU introduced error_report_once().  Feel free to
> > switch to that if you want.  Many of the VT-d emulation code switched
> > to that to make sure errors like this will be dumped at least once and
> > we're also free from Dos attack.  This should at least apply to all of
> > your below trace_amdvi_ir_target_abort() calls, even some other traces.
> > 
> 
> 
> IMHO we should not be using error_report_once() here. It's possible that
> guest OS have DTE[IV]=1 but has not programmed the interrupt
> remapping entries or have deactivated the remapping. I see that Linux
> OS does it all the time and in those cases we will be printing out
> error messages - which will give a wrong information.

I would be a bit curious on why the guest would like to switch the irq
off if the device still needs to generate any... though this question
could be out of topic so feel free to skip.

But if that's normal then I would suggest you use a better naming of
the trace point, "*_target_abort" seems to be a severe issue.

Also, my suggestion is a general one - it still applies to the places
that you think a well-behaved guest should not do.  It's optional so
you can decide whether you want to use it.

[...]

> > > +    /* validate that we are configure with intremap=on */
> > > +    if (!s->intr_enabled) {
> > > +        error_report("Interrupt remapping is enabled in the guest but "
> > > +                     "not in the host. Use intremap=on to enable interrupt "
> > > +                     "remapping in amd-iommu.");
> > > +        exit(1);
> > 
> > This will crash QEMU.  IMHO this is not a configuration error but a
> > guest kernel error.  We'd better not crash QEMU for that.  Wny not
> > just report the error upwards.
> > 
> 
> 
> Honestly, I did struggled whether to use "exit" or error. Actually this
> is a configuration error.
> 
> AMD IOMMU does not have an explicit bit to indicate whether the IR
> feature is supported. All versions of IOMMU supports IR. But in QEMU
> we are using 'intermap' to enable/disable the feature. In case of AMD IOMMU,
> we will not be able to link intremap to control the exposure of
> this feature to the guest. So we need to do some checks during the
> runtime. If we get asked to remap the interrupts (when intremap=off)
> then IMHO its a configuration error and we should kill the guest.
> Having said so, I am flexible and if needed then I can remove the
> exit from next patch.

If all AMD IOMMU has IR, then I'm curious how the old AMD IOMMU
emulation worked since that does not have an IR, do you know how the
guest detects that?  (It must have done so otherwise IRQ won't work
there, isn't it?)

And if that's true, instead of check intr_enabled here, I would
suggest you check it in realize() - we'd better crash asap when we
know this error, and we know it from the very beginning.

> 
> 
> (btw, patch 6 is Linux specific quirk)
> 
> 
> > > +    }
> > > +
> > > +    return true;
> > > +}
> > > +
> > >   /* Interrupt remapping for MSI/MSI-X entry */
> > >   static int amdvi_int_remap_msi(AMDVIState *iommu,
> > >                                  MSIMessage *origin,
> > >                                  MSIMessage *translated,
> > >                                  uint16_t sid)
> > >   {
> > > +    int ret = 0;
> > > +    uint64_t pass = 0;
> > > +    uint64_t dte[4] = { 0 };
> > > +    X86IOMMUIrq irq = { 0 };
> > > +    uint8_t dest_mode, delivery_mode;
> > > +
> > >       assert(origin && translated);
> > > +    /*
> > > +     * When IOMMU is enabled, interrupt remap request will come either from
> > > +     * IO-APIC or PCI device. If interrupt is from PCI device then it will
> > > +     * have a valid requester id but if the interrupt is from IO-APIC
> > > +     * then requester id will be invalid.
> > > +     */
> > > +    if (sid == X86_IOMMU_SID_INVALID) {
> > > +        sid = AMDVI_IOAPIC_SB_DEVID;
> > > +    }
> > > +
> > >       trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);
> > > -    if (!iommu || !iommu->intr_enabled) {
> > > +    /* verify that interrupt remapping is enabled before going further. */
> > > +    if (!iommu ||
> > > +        !amdvi_get_dte(iommu, sid, dte)  ||
> > > +        !amdvi_validate_int_reamp(iommu, dte)) {
> > >           memcpy(translated, origin, sizeof(*origin));
> > >           goto out;
> > >       }
> > > @@ -1055,10 +1184,68 @@ static int amdvi_int_remap_msi(AMDVIState *iommu,
> > >           return -AMDVI_IR_ERR;
> > >       }
> > > +    delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;
> > 
> > IIRC you mentioned that you'll paste some field definition here, or
> > which figure/table we should refer to.  But here I still see no
> > comment, and I don't know where defined it.  Again, Figure 14, bits
> > 10:8 are part of index there.
> 
> 
> This time used the pre-defined macro's from the apic-msidef.h hence I
> thought there was no need to include the field definitions. But, I will
> go ahead and include the field definition in the next rev.
> 
> * The delivery_mode field from the MSI data is used to get the upstream
>   interrupt type.
> * If the interrupt type is Fixed/Arbitrated, MSI data 10:0 is used
>   as an offset within the IRTE table to get the actual IRTE entry
>   (Fig 14).
> 
> Please note that we don't emulate the HyperTransport hence instead we
> should not treat MSI data 10:8 as index instead the MSI data 10:0 should
> be used as "offset" within the IRTE table to get the interrupt remapping
> table entries.

Frankly speaking until now I don't quite understand what's that
HyperTransport thing... but ok I think I understand your point now:
you mean that the "Interrupt type" mentioned in Table 19 is exactly
the same bits (10:8) defined in general MSI data register.  I think
that's the only knowledge that I'm missing from there.  Say, Table 19
told me that "what interrupt type should be controlled by what",
however it never told me where could I find the interrupt types... or
is it defined somewhere but I didn't notice?

(I do feel hard to read the AMD spec some times...)

If my understanding is correct, maybe you could comment above the
delivery_mode with things like "Please refer to Table 19; delivery
mode is defined as the same as general MSI data format on bits 10:8".

Thanks,
Brijesh Singh Sept. 18, 2018, 8:27 p.m. UTC | #4
On 09/17/2018 10:53 PM, Peter Xu wrote:
[...]

>> IMHO we should not be using error_report_once() here. It's possible that
>> guest OS have DTE[IV]=1 but has not programmed the interrupt
>> remapping entries or have deactivated the remapping. I see that Linux
>> OS does it all the time and in those cases we will be printing out
>> error messages - which will give a wrong information.
> 
> I would be a bit curious on why the guest would like to switch the irq
> off if the device still needs to generate any... though this question
> could be out of topic so feel free to skip.

If you are interested then look at kernel irq domain [1] (Hierarchy IRQ
domain). There are 4 interfaces to the irq_domain:

- irq_domain_alloc_irqs() : when IR is enabled this will set DTE[IV]=1
- irq_domain_{activate,deactivate}: sets/clears the IRTE.fields.valid
   (aka IRTE.RemapEn bit).

You can find some additional info in this document on how a domain will
activate/deactivate resources.

[1] https://www.kernel.org/doc/Documentation/IRQ-domain.txt


> 
> But if that's normal then I would suggest you use a better naming of
> the trace point, "*_target_abort" seems to be a severe issue.
> 

I have been trying to adhere to IOMMU spec, if spec says that we should
cause an "target abort" then I try to use _target_abort tracepoint. In
case of vIOMMU we do not generate any IOMMU log event to communicate the
guest about the target aborts but on native system hardware will
generate a events which guest OS can parse to get additional info. Maybe
in future we can look at logging events from vIOMMU.


> Also, my suggestion is a general one - it still applies to the places
> that you think a well-behaved guest should not do.  It's optional so
> you can decide whether you want to use it.
> 

Agreed, I will certain look into it.


[...]

>> Honestly, I did struggled whether to use "exit" or error. Actually this
>> is a configuration error.
>>
>> AMD IOMMU does not have an explicit bit to indicate whether the IR
>> feature is supported. All versions of IOMMU supports IR. But in QEMU
>> we are using 'intermap' to enable/disable the feature. In case of AMD IOMMU,
>> we will not be able to link intremap to control the exposure of
>> this feature to the guest. So we need to do some checks during the
>> runtime. If we get asked to remap the interrupts (when intremap=off)
>> then IMHO its a configuration error and we should kill the guest.
>> Having said so, I am flexible and if needed then I can remove the
>> exit from next patch.
> 
> If all AMD IOMMU has IR, then I'm curious how the old AMD IOMMU
> emulation worked since that does not have an IR, do you know how the
> guest detects that?  (It must have done so otherwise IRQ won't work
> there, isn't it?)
> 
> And if that's true, instead of check intr_enabled here, I would
> suggest you check it in realize() - we'd better crash asap when we
> know this error, and we know it from the very beginning.
> 
>>


So far non of the guests were enabling the interrupt remap features
even when it was available. As I explained in previous patches (see
patch 6), Linux guest looks for a special IOAPIC device in IVHD before
enabling the interrupt remap. If you enable the amd_iommu_debug=1
in kernel cmdline then kernel bootup prints the message that its
attempting to enable the IR but since IOAPIC entry does not exist
in IVHD hence it disabled it.

Notes: for kicks you can apply patch 6 from this series in existing
code base and you will see that Linux OS will enable the IR (it does
care about intremap=<on|off> from amd-iommu.


[...]

>>
>> This time used the pre-defined macro's from the apic-msidef.h hence I
>> thought there was no need to include the field definitions. But, I will
>> go ahead and include the field definition in the next rev.
>>
>> * The delivery_mode field from the MSI data is used to get the upstream
>>    interrupt type.
>> * If the interrupt type is Fixed/Arbitrated, MSI data 10:0 is used
>>    as an offset within the IRTE table to get the actual IRTE entry
>>    (Fig 14).
>>
>> Please note that we don't emulate the HyperTransport hence instead we
>> should not treat MSI data 10:8 as index instead the MSI data 10:0 should
>> be used as "offset" within the IRTE table to get the interrupt remapping
>> table entries.
> 
> Frankly speaking until now I don't quite understand what's that
> HyperTransport thing... but ok I think I understand your point now:
> you mean that the "Interrupt type" mentioned in Table 19 is exactly
> the same bits (10:8) defined in general MSI data register.  I think
> that's the only knowledge that I'm missing from there. 


The MSI data register bit 10:8 definitions are similar to the Hyper
Transport MT encoding (Table 19), but they are not quite the same.

In our case we don't have HyperTransport so we need to use the
bit definition from the IOAPIC https://wiki.osdev.org/APIC

enum ioapic_irq_destination_types {
         dest_Fixed              = 0,
         dest_LowestPrio         = 1,
         dest_SMI                = 2,
         dest__reserved_1        = 3,
         dest_NMI                = 4,
         dest_INIT               = 5,
         dest__reserved_2        = 6,
         dest_ExtINT             = 7
};


  Say, Table 19
> told me that "what interrupt type should be controlled by what",
> however it never told me where could I find the interrupt types... or
> is it defined somewhere but I didn't notice?
> 
> (I do feel hard to read the AMD spec some times...)
> 

I hear you loud and clear :) sometimes its confusing ;)


> If my understanding is correct, maybe you could comment above the
> delivery_mode with things like "Please refer to Table 19; delivery
> mode is defined as the same as general MSI data format on bits 10:8".
> 

I will include some comments in next rev.

thanks for all your review feedbacks.

-Brijesh
Brijesh Singh Sept. 18, 2018, 8:31 p.m. UTC | #5
Small correction


On 09/18/2018 03:27 PM, Singh, Brijesh wrote:>
> So far non of the guests were enabling the interrupt remap features
> even when it was available. As I explained in previous patches (see
> patch 6), Linux guest looks for a special IOAPIC device in IVHD before
> enabling the interrupt remap. If you enable the amd_iommu_debug=1
> in kernel cmdline then kernel bootup prints the message that its
> attempting to enable the IR but since IOAPIC entry does not exist
> in IVHD hence it disabled it.
> 
> Notes: for kicks you can apply patch 6 from this series in existing
> code base and you will see that Linux OS will enable the IR (it does
> care about intremap=<on|off> from amd-iommu.
> 


I was meaning to say (it does *not* care about  intremap=...)
diff mbox series

Patch

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index b15962b..9c8e4de 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -28,6 +28,8 @@ 
 #include "qemu/error-report.h"
 #include "hw/i386/apic_internal.h"
 #include "trace.h"
+#include "cpu.h"
+#include "hw/i386/apic-msidef.h"
 
 /* used AMD-Vi MMIO registers */
 const char *amdvi_mmio_low[] = {
@@ -1029,17 +1031,144 @@  static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
     return ret;
 }
 
+static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
+                          union irte *irte, uint16_t devid)
+{
+    uint64_t irte_root, offset;
+
+    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
+    offset = (origin->data & AMDVI_IRTE_OFFSET) << 2;
+
+    trace_amdvi_ir_irte(irte_root, offset);
+
+    if (dma_memory_read(&address_space_memory, irte_root + offset,
+                        irte, sizeof(*irte))) {
+        trace_amdvi_ir_err("failed to get irte");
+        return -AMDVI_IR_GET_IRTE;
+    }
+
+    trace_amdvi_ir_irte_val(irte->val);
+
+    return 0;
+}
+
+static int amdvi_int_remap_legacy(AMDVIState *iommu,
+                                  MSIMessage *origin,
+                                  MSIMessage *translated,
+                                  uint64_t *dte,
+                                  X86IOMMUIrq *irq,
+                                  uint16_t sid)
+{
+    int ret;
+    union irte irte;
+
+    /* get interrupt remapping table */
+    ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (!irte.fields.valid) {
+        trace_amdvi_ir_target_abort("RemapEn is disabled");
+        return -AMDVI_IR_TARGET_ABORT;
+    }
+
+    if (irte.fields.guest_mode) {
+        trace_amdvi_ir_target_abort("guest mode is not zero");
+        return -AMDVI_IR_TARGET_ABORT;
+    }
+
+    if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
+        trace_amdvi_ir_target_abort("reserved int_type");
+        return -AMDVI_IR_TARGET_ABORT;
+    }
+
+    irq->delivery_mode = irte.fields.int_type;
+    irq->vector = irte.fields.vector;
+    irq->dest_mode = irte.fields.dm;
+    irq->redir_hint = irte.fields.rq_eoi;
+    irq->dest = irte.fields.destination;
+
+    return 0;
+}
+
+static int __amdvi_int_remap_msi(AMDVIState *iommu,
+                                 MSIMessage *origin,
+                                 MSIMessage *translated,
+                                 uint64_t *dte,
+                                 X86IOMMUIrq *irq,
+                                 uint16_t sid)
+{
+    uint8_t int_ctl;
+
+    int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3;
+    trace_amdvi_ir_intctl(int_ctl);
+
+    switch (int_ctl) {
+    case AMDVI_IR_INTCTL_PASS:
+        memcpy(translated, origin, sizeof(*origin));
+        return 0;
+    case AMDVI_IR_INTCTL_REMAP:
+        break;
+    case AMDVI_IR_INTCTL_ABORT:
+        trace_amdvi_ir_target_abort("int_ctl abort");
+        return -AMDVI_IR_TARGET_ABORT;
+    default:
+        trace_amdvi_ir_target_abort("int_ctl reserved");
+        return -AMDVI_IR_TARGET_ABORT;
+    }
+
+    return amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid);
+}
+
+static bool amdvi_validate_int_reamp(AMDVIState *s, uint64_t *dte)
+{
+    /* Check if IR is enabled in DTE */
+    if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) {
+        return false;
+    }
+
+    /* validate that we are configure with intremap=on */
+    if (!s->intr_enabled) {
+        error_report("Interrupt remapping is enabled in the guest but "
+                     "not in the host. Use intremap=on to enable interrupt "
+                     "remapping in amd-iommu.");
+        exit(1);
+    }
+
+    return true;
+}
+
 /* Interrupt remapping for MSI/MSI-X entry */
 static int amdvi_int_remap_msi(AMDVIState *iommu,
                                MSIMessage *origin,
                                MSIMessage *translated,
                                uint16_t sid)
 {
+    int ret = 0;
+    uint64_t pass = 0;
+    uint64_t dte[4] = { 0 };
+    X86IOMMUIrq irq = { 0 };
+    uint8_t dest_mode, delivery_mode;
+
     assert(origin && translated);
 
+    /*
+     * When IOMMU is enabled, interrupt remap request will come either from
+     * IO-APIC or PCI device. If interrupt is from PCI device then it will
+     * have a valid requester id but if the interrupt is from IO-APIC
+     * then requester id will be invalid.
+     */
+    if (sid == X86_IOMMU_SID_INVALID) {
+        sid = AMDVI_IOAPIC_SB_DEVID;
+    }
+
     trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);
 
-    if (!iommu || !iommu->intr_enabled) {
+    /* verify that interrupt remapping is enabled before going further. */
+    if (!iommu ||
+        !amdvi_get_dte(iommu, sid, dte)  ||
+        !amdvi_validate_int_reamp(iommu, dte)) {
         memcpy(translated, origin, sizeof(*origin));
         goto out;
     }
@@ -1055,10 +1184,68 @@  static int amdvi_int_remap_msi(AMDVIState *iommu,
         return -AMDVI_IR_ERR;
     }
 
+    delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;
+
+    switch (delivery_mode) {
+    case AMDVI_IOAPIC_INT_TYPE_FIXED:
+    case AMDVI_IOAPIC_INT_TYPE_ARBITRATED:
+        trace_amdvi_ir_delivery_mode("fixed/arbitrated");
+        ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid);
+        if (ret < 0) {
+            goto remap_fail;
+        } else {
+            /* Translate IRQ to MSI messages */
+            x86_iommu_irq_to_msi_message(&irq, translated);
+            goto out;
+        }
+        break;
+    case AMDVI_IOAPIC_INT_TYPE_SMI:
+        error_report("SMI is not supported!");
+        ret = -AMDVI_IR_ERR;
+    case AMDVI_IOAPIC_INT_TYPE_NMI:
+        pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
+        trace_amdvi_ir_delivery_mode("nmi");
+        break;
+    case AMDVI_IOAPIC_INT_TYPE_INIT:
+        pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
+        trace_amdvi_ir_delivery_mode("init");
+        break;
+    case AMDVI_IOAPIC_INT_TYPE_EINT:
+        pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
+        trace_amdvi_ir_delivery_mode("eint");
+        break;
+    default:
+        trace_amdvi_ir_delivery_mode("unsupported delivery_mode");
+        ret = -AMDVI_IR_ERR;
+        break;
+    }
+
+    if (ret < 0) {
+        goto remap_fail;
+    }
+
+    /* dest_mode 1 is valid for fixed and arbitrated interrupts only */
+    dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
+    if (dest_mode) {
+        trace_amdvi_ir_err("invalid dest_mode");
+        goto remap_fail;
+    }
+
+    if (pass) {
+        memcpy(translated, origin, sizeof(*origin));
+    } else {
+        /* pass through is not enabled */
+        trace_amdvi_ir_err("passthrough is not enabled");
+        goto remap_fail;
+    }
+
 out:
     trace_amdvi_ir_remap_msi(origin->address, origin->data,
                              translated->address, translated->data);
     return 0;
+
+remap_fail:
+    return -AMDVI_IR_TARGET_ABORT;
 }
 
 static int amdvi_int_remap(X86IOMMUState *iommu,
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index 71ff3c1..bd27cd2 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -217,7 +217,51 @@ 
 
 /* Interrupt remapping errors */
 #define AMDVI_IR_ERR            0x1
-
+#define AMDVI_IR_GET_IRTE       0x2
+#define AMDVI_IR_TARGET_ABORT   0x3
+
+/* Interrupt remapping */
+#define AMDVI_IR_REMAP_ENABLE           1ULL
+#define AMDVI_IR_INTCTL_SHIFT           60
+#define AMDVI_IR_INTCTL_ABORT           0
+#define AMDVI_IR_INTCTL_PASS            1
+#define AMDVI_IR_INTCTL_REMAP           2
+
+#define AMDVI_IR_PHYS_ADDR_MASK         (((1ULL << 45) - 1) << 6)
+
+/* MSI data 10:0 bits (section 2.2.5.1 Fig 14) */
+#define AMDVI_IRTE_OFFSET               0x7ff
+
+/* Delivery mode of MSI data (same as IOAPIC deilver mode encoding) */
+#define AMDVI_IOAPIC_INT_TYPE_FIXED          0x0
+#define AMDVI_IOAPIC_INT_TYPE_ARBITRATED     0x1
+#define AMDVI_IOAPIC_INT_TYPE_SMI            0x2
+#define AMDVI_IOAPIC_INT_TYPE_NMI            0x4
+#define AMDVI_IOAPIC_INT_TYPE_INIT           0x5
+#define AMDVI_IOAPIC_INT_TYPE_EINT           0x7
+
+/* Pass through interrupt */
+#define AMDVI_DEV_INT_PASS_MASK         (1UL << 56)
+#define AMDVI_DEV_EINT_PASS_MASK        (1UL << 57)
+#define AMDVI_DEV_NMI_PASS_MASK         (1UL << 58)
+#define AMDVI_DEV_LINT0_PASS_MASK       (1UL << 62)
+#define AMDVI_DEV_LINT1_PASS_MASK       (1UL << 63)
+
+/* Interrupt remapping table fields (Guest VAPIC not enabled) */
+union irte {
+    uint32_t val;
+    struct {
+        uint32_t valid:1,
+                 no_fault:1,
+                 int_type:3,
+                 rq_eoi:1,
+                 dm:1,
+                 guest_mode:1,
+                 destination:8,
+                 vector:8,
+                 rsvd:8;
+    } fields;
+};
 
 #define TYPE_AMD_IOMMU_DEVICE "amd-iommu"
 #define AMD_IOMMU_DEVICE(obj)\
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 41d533c..98150c9 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -106,6 +106,13 @@  amdvi_mem_ir_write(uint64_t addr, uint64_t val) "addr 0x%"PRIx64" data 0x%"PRIx6
 amdvi_ir_remap_msi_req(uint64_t addr, uint64_t data, uint8_t devid) "addr 0x%"PRIx64" data 0x%"PRIx64" devid 0x%"PRIx8
 amdvi_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")"
 amdvi_err(const char *str) "%s"
+amdvi_ir_irte(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" offset 0x%"PRIx64
+amdvi_ir_irte_val(uint32_t data) "data 0x%"PRIx32
+amdvi_ir_err(const char *str) "%s"
+amdvi_ir_intctl(uint8_t val) "int_ctl 0x%"PRIx8
+amdvi_ir_target_abort(const char *str) "%s"
+amdvi_ir_delivery_mode(const char *str) "%s"
+amdvi_ir_generate_msi_message(uint8_t vector, uint8_t delivery_mode, uint8_t dest_mode, uint8_t dest, uint8_t rh) "vector %d delivery-mode %d dest-mode %d dest-id %d rh %d"
 
 # hw/i386/vmport.c
 vmport_register(unsigned char command, void *func, void *opaque) "command: 0x%02x func: %p opaque: %p"