[v4,4/6] hw/iommu: AMD IOMMU interrupt remapping

Message ID 1473674889-2727-5-git-send-email-davidkiarie4@gmail.com
State New

Commit Message

David Kiarie Sept. 12, 2016, 10:08 a.m. UTC
Introduce AMD IOMMU interrupt remapping and hook it onto
the existing interrupt remapping infrastructure

Signed-off-by: David Kiarie <davidkiarie4@gmail.com>
---
 hw/i386/amd_iommu.c | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/i386/amd_iommu.h |   4 +-
 hw/intc/ioapic.c    |   1 -
 3 files changed, 243 insertions(+), 3 deletions(-)

Comments

Peter Xu Sept. 12, 2016, 11:34 a.m. UTC | #1
On Mon, Sep 12, 2016 at 01:08:07PM +0300, David Kiarie wrote:

[...]

>  /* configure MMIO registers at startup/reset */
>  static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
>                             uint64_t romask, uint64_t w1cmask)
> @@ -641,6 +667,11 @@ static void amdvi_inval_inttable(AMDVIState *s, CMDInvalIntrTable *inval)
>          amdvi_log_illegalcom_error(s, inval->type, s->cmdbuf + s->cmdbuf_head);
>          return;
>      }
> +
> +    if (s->ir_cache) {

Here, we notify IEC only if ir_cache == true, while...

[...]

> +static int amdvi_int_remap(X86IOMMUState *iommu, MSIMessage *src,
> +                           MSIMessage *dst, uint16_t sid)
> +{
> +    trace_amdvi_ir_request(src->data, src->address, sid);
> +
> +    AMDVIState *s = AMD_IOMMU_DEVICE(iommu);
> +    int ret = 0;
> +    uint64_t dte[4];
> +    uint32_t bitpos;
> +    IRTE irte;
> +
> +    amdvi_get_dte(s, sid, dte);
> +
> +    /* interrupt remapping disabled */
> +    if (!(dte[2] & AMDVI_IR_VALID)) {
> +        memcpy(dst, src, sizeof(*src));
> +        return ret;
> +    }
> +
> +    ret = amdvi_irte_get(s, src, &irte, dte, sid);
> +    if (ret < 0) {
> +        goto no_remap;
> +    }
> +    switch (src->data & AMDVI_IR_TYPE_MASK) {
> +    case AMDVI_MT_FIXED:
> +    case AMDVI_MT_ARBIT:
> +        ret = amdvi_remap_ir_intctl(dte[2], irte, src, dst);
> +        if (ret < 0) {
> +            goto no_remap;
> +        } else {
> +            s->ir_cache = true;

Here we set it only if the interrupts are triggered.

Shouldn't we notify IEC in all cases? Since the caches are set up
during configuration, not the first time the interrupt is triggered,
no?

> +            trace_amdvi_ir_remap(dst->data, dst->address, sid);
> +            return ret;
> +        }
> +    /* not handling SMI currently */
> +    case AMDVI_MT_SMI:
> +        error_report("SMI interrupts not currently handled");
> +        goto no_remap;
> +    case AMDVI_MT_NMI:
> +        bitpos = AMDVI_DTE_NMIPASS_LSHIFT;
> +        break;
> +    case AMDVI_MT_INIT:
> +        bitpos = AMDVI_DTE_INTPASS_LSHIFT;
> +        break;
> +    case AMDVI_MT_EXTINT:
> +        bitpos = AMDVI_DTE_EINTPASS_LSHIFT;
> +        break;
> +    case AMDVI_MT_LINT1:
> +        bitpos = AMDVI_DTE_LINT1PASS_LSHIFT;
> +        break;
> +    case AMDVI_MT_LINT0:
> +        bitpos = AMDVI_DTE_LINT0PASS_LSHIFT;
> +        break;
> +    default:
> +        goto no_remap;
> +    }
> +
> +    ret = amdvi_ir_handle_non_vectored(src, dst, bitpos, dte[2]);
> +    if (ret < 0) {
> +        goto no_remap;
> +    }
> +    s->ir_cache = true;
> +    trace_amdvi_ir_remap(dst->data, dst->address, sid);
> +    return ret;
> +no_remap:
> +    memcpy(dst, src, sizeof(*src));

Shall we drop it and report the remapping failure in some way?

I'm totally okay with just dropping it here, and we can do the
reporting in the future. But reusing the old message is not a good
idea, since if someone injects faulty interrupts, they will be
delivered just as if there were no IOMMU. So we have no protection at
all.

Thanks,

-- peterx
David Kiarie Sept. 12, 2016, 11:51 a.m. UTC | #2
On Mon, Sep 12, 2016 at 2:34 PM, Peter Xu <peterx@redhat.com> wrote:

> On Mon, Sep 12, 2016 at 01:08:07PM +0300, David Kiarie wrote:
>
> [...]
>
> >  /* configure MMIO registers at startup/reset */
> >  static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
> >                             uint64_t romask, uint64_t w1cmask)
> > @@ -641,6 +667,11 @@ static void amdvi_inval_inttable(AMDVIState *s,
> CMDInvalIntrTable *inval)
> >          amdvi_log_illegalcom_error(s, inval->type, s->cmdbuf +
> s->cmdbuf_head);
> >          return;
> >      }
> > +
> > +    if (s->ir_cache) {
>
> Here, we notify IEC only if ir_cache == true, while...
>
> [...]
>
> > +static int amdvi_int_remap(X86IOMMUState *iommu, MSIMessage *src,
> > +                           MSIMessage *dst, uint16_t sid)
> > +{
> > +    trace_amdvi_ir_request(src->data, src->address, sid);
> > +
> > +    AMDVIState *s = AMD_IOMMU_DEVICE(iommu);
> > +    int ret = 0;
> > +    uint64_t dte[4];
> > +    uint32_t bitpos;
> > +    IRTE irte;
> > +
> > +    amdvi_get_dte(s, sid, dte);
> > +
> > +    /* interrupt remapping disabled */
> > +    if (!(dte[2] & AMDVI_IR_VALID)) {
> > +        memcpy(dst, src, sizeof(*src));
> > +        return ret;
> > +    }
> > +
> > +    ret = amdvi_irte_get(s, src, &irte, dte, sid);
> > +    if (ret < 0) {
> > +        goto no_remap;
> > +    }
> > +    switch (src->data & AMDVI_IR_TYPE_MASK) {
> > +    case AMDVI_MT_FIXED:
> > +    case AMDVI_MT_ARBIT:
> > +        ret = amdvi_remap_ir_intctl(dte[2], irte, src, dst);
> > +        if (ret < 0) {
> > +            goto no_remap;
> > +        } else {
> > +            s->ir_cache = true;
>
> Here we set it only if the interrupts are triggered.
>
> Shouldn't we notify IEC in all cases? Since the caches are set up
> during configuration, not the first time the interrupt is triggered,
> no?


I did have a problem with this. I don't know whether the Intel IOMMU
behaves the same way, but the AMD IOMMU invalidates the interrupt
cache for each and every device at boot (every possible device).
Having the cache invalidation trigger this many times bugs the guest
at boot. I was of the opinion that the caches would not contain
anything until translations actually happen.


>


> > +            trace_amdvi_ir_remap(dst->data, dst->address, sid);
> > +            return ret;
> > +        }
> > +    /* not handling SMI currently */
> > +    case AMDVI_MT_SMI:
> > +        error_report("SMI interrupts not currently handled");
> > +        goto no_remap;
> > +    case AMDVI_MT_NMI:
> > +        bitpos = AMDVI_DTE_NMIPASS_LSHIFT;
> > +        break;
> > +    case AMDVI_MT_INIT:
> > +        bitpos = AMDVI_DTE_INTPASS_LSHIFT;
> > +        break;
> > +    case AMDVI_MT_EXTINT:
> > +        bitpos = AMDVI_DTE_EINTPASS_LSHIFT;
> > +        break;
> > +    case AMDVI_MT_LINT1:
> > +        bitpos = AMDVI_DTE_LINT1PASS_LSHIFT;
> > +        break;
> > +    case AMDVI_MT_LINT0:
> > +        bitpos = AMDVI_DTE_LINT0PASS_LSHIFT;
> > +        break;
> > +    default:
> > +        goto no_remap;
> > +    }
> > +
> > +    ret = amdvi_ir_handle_non_vectored(src, dst, bitpos, dte[2]);
> > +    if (ret < 0) {
> > +        goto no_remap;
> > +    }
> > +    s->ir_cache = true;
> > +    trace_amdvi_ir_remap(dst->data, dst->address, sid);
> > +    return ret;
> > +no_remap:
> > +    memcpy(dst, src, sizeof(*src));
>
> Shall we drop it and report the remapping failure in some way?
>
> I'm totally okay with just dropping it here, and we can do the
> reporting in the future. But reusing the old message is not a good
> idea, since if someone injects faulty interrupts, they will be
> delivered just as if there were no IOMMU. So we have no protection
> at all.
>

Won't this get dropped based on the return value? I think the
'memcpy' is not necessary, but my understanding is that KVM will drop
the translation based on the return value, no?
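
For context: the assumption here is that the caller of the int_remap
hook fails the route setup on a negative return, so the interrupt is
dropped rather than injected unremapped. A minimal sketch of that
contract, with illustrative names rather than the exact QEMU code:

    /* sketch: how a caller is assumed to consume int_remap's result */
    static int fixup_msi_route_sketch(X86IOMMUState *iommu, MSIMessage *src,
                                      MSIMessage *dst, uint16_t sid)
    {
        X86IOMMUClass *klass = X86_IOMMU_GET_CLASS(iommu);
        int ret = klass->int_remap(iommu, src, dst, sid);

        if (ret < 0) {
            /* nothing gets committed to the KVM routing table */
            return ret;
        }
        /* on success, dst is what gets programmed into the route */
        return 0;
    }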


> Thanks,
>
> -- peterx
>
Peter Xu Sept. 12, 2016, 12:11 p.m. UTC | #3
On Mon, Sep 12, 2016 at 02:51:27PM +0300, David Kiarie wrote:
> On Mon, Sep 12, 2016 at 2:34 PM, Peter Xu <peterx@redhat.com> wrote:
> 
> > On Mon, Sep 12, 2016 at 01:08:07PM +0300, David Kiarie wrote:
> >
> > [...]
> >
> > >  /* configure MMIO registers at startup/reset */
> > >  static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
> > >                             uint64_t romask, uint64_t w1cmask)
> > > @@ -641,6 +667,11 @@ static void amdvi_inval_inttable(AMDVIState *s,
> > CMDInvalIntrTable *inval)
> > >          amdvi_log_illegalcom_error(s, inval->type, s->cmdbuf +
> > s->cmdbuf_head);
> > >          return;
> > >      }
> > > +
> > > +    if (s->ir_cache) {
> >
> > Here, we notify IEC only if ir_cache == true, while...
> >
> > [...]
> >
> > > +static int amdvi_int_remap(X86IOMMUState *iommu, MSIMessage *src,
> > > +                           MSIMessage *dst, uint16_t sid)
> > > +{
> > > +    trace_amdvi_ir_request(src->data, src->address, sid);
> > > +
> > > +    AMDVIState *s = AMD_IOMMU_DEVICE(iommu);
> > > +    int ret = 0;
> > > +    uint64_t dte[4];
> > > +    uint32_t bitpos;
> > > +    IRTE irte;
> > > +
> > > +    amdvi_get_dte(s, sid, dte);
> > > +
> > > +    /* interrupt remapping disabled */
> > > +    if (!(dte[2] & AMDVI_IR_VALID)) {
> > > +        memcpy(dst, src, sizeof(*src));
> > > +        return ret;
> > > +    }
> > > +
> > > +    ret = amdvi_irte_get(s, src, &irte, dte, sid);
> > > +    if (ret < 0) {
> > > +        goto no_remap;
> > > +    }
> > > +    switch (src->data & AMDVI_IR_TYPE_MASK) {
> > > +    case AMDVI_MT_FIXED:
> > > +    case AMDVI_MT_ARBIT:
> > > +        ret = amdvi_remap_ir_intctl(dte[2], irte, src, dst);
> > > +        if (ret < 0) {
> > > +            goto no_remap;
> > > +        } else {
> > > +            s->ir_cache = true;
> >
> > Here we set it only if the interrupts are triggered.
> >
> > Shouldn't we notify IEC in all cases? Since the caches are set up
> > during configuration, not the first time the interrupt is triggered,
> > no?
> 
> 
> I did have a problem with this. I don't know whether the Intel IOMMU
> behaves the same way, but the AMD IOMMU invalidates the interrupt
> cache for each and every device at boot (every possible device).
> Having the cache invalidation trigger this many times bugs the guest
> at boot. I was of the opinion that the caches would not contain
> anything until translations actually happen.

When we say cache here, we are mostly talking about GSI routes in
the kernel, right? Since we still don't have any other kind of
interrupt cache AFAIK. If so, GSI routes should already have been set
up even if the interrupts have not been triggered a single time. So
we need to invalidate them even if ir_cache == false.
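
The cache in question is the kernel-side GSI routing table: the
IOAPIC registers an IEC notifier, and each invalidation notification
refreshes those routes. Paraphrased from hw/intc/ioapic.c (details
may differ), the callback is roughly:

    static void ioapic_iec_notifier(void *private, bool global,
                                    uint32_t index, uint32_t mask)
    {
        IOAPICCommonState *s = (IOAPICCommonState *)private;
        /* for simplicity, recompute every cached KVM route */
        ioapic_update_kvm_routes(s);
    }

So the routes exist, and need invalidating, before any interrupt has
fired.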

I think the problem is why cache invalidations during boot will bug
the system. Any clue?

> 
> 
> >
> 
> 
> > > +            trace_amdvi_ir_remap(dst->data, dst->address, sid);
> > > +            return ret;
> > > +        }
> > > +    /* not handling SMI currently */
> > > +    case AMDVI_MT_SMI:
> > > +        error_report("SMI interrupts not currently handled");
> > > +        goto no_remap;
> > > +    case AMDVI_MT_NMI:
> > > +        bitpos = AMDVI_DTE_NMIPASS_LSHIFT;
> > > +        break;
> > > +    case AMDVI_MT_INIT:
> > > +        bitpos = AMDVI_DTE_INTPASS_LSHIFT;
> > > +        break;
> > > +    case AMDVI_MT_EXTINT:
> > > +        bitpos = AMDVI_DTE_EINTPASS_LSHIFT;
> > > +        break;
> > > +    case AMDVI_MT_LINT1:
> > > +        bitpos = AMDVI_DTE_LINT1PASS_LSHIFT;
> > > +        break;
> > > +    case AMDVI_MT_LINT0:
> > > +        bitpos = AMDVI_DTE_LINT0PASS_LSHIFT;
> > > +        break;
> > > +    default:
> > > +        goto no_remap;
> > > +    }
> > > +
> > > +    ret = amdvi_ir_handle_non_vectored(src, dst, bitpos, dte[2]);
> > > +    if (ret < 0) {
> > > +        goto no_remap;
> > > +    }
> > > +    s->ir_cache = true;
> > > +    trace_amdvi_ir_remap(dst->data, dst->address, sid);
> > > +    return ret;
> > > +no_remap:
> > > +    memcpy(dst, src, sizeof(*src));
> >
> > Shall we drop it and report the remapping failure in some way?
> >
> > I'm totally okay with just dropping it here, and we can do the
> > reporting in the future. But reusing the old message is not a good
> > idea, since if someone injects faulty interrupts, they will be
> > delivered just as if there were no IOMMU. So we have no protection
> > at all.
> >
> 
> Won't this get dropped based on the return value? I think the
> 'memcpy' is not necessary, but my understanding is that KVM will
> drop the translation based on the return value, no?

Yeah you are right. Then I'll suggest something like:

    no_remap:
        memcpy(...);
    remap_fail:
        trace_...();
        return ret;

And we goto remap_fail when an error happens. That'll be cleaner to me,
and after all we don't need to memcpy() if something failed.
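
Concretely, the tail of amdvi_int_remap() might then become (just a
sketch of the idea, untested):

    no_remap:
        /* pass-through paths still deliver the original message */
        memcpy(dst, src, sizeof(*src));
    remap_fail:
        trace_amdvi_ir_target_abort(src->data, src->address, sid);
        return ret;

with the error paths switched from "goto no_remap" to "goto
remap_fail", so a failed remap never copies src into dst.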

Thanks,

-- peterx
David Kiarie Sept. 12, 2016, 12:45 p.m. UTC | #4
On Mon, Sep 12, 2016 at 3:11 PM, Peter Xu <peterx@redhat.com> wrote:

> On Mon, Sep 12, 2016 at 02:51:27PM +0300, David Kiarie wrote:
> > On Mon, Sep 12, 2016 at 2:34 PM, Peter Xu <peterx@redhat.com> wrote:
> >
> > > On Mon, Sep 12, 2016 at 01:08:07PM +0300, David Kiarie wrote:
> > >
> > > [...]
> > >
> > > >  /* configure MMIO registers at startup/reset */
> > > >  static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
> > > >                             uint64_t romask, uint64_t w1cmask)
> > > > @@ -641,6 +667,11 @@ static void amdvi_inval_inttable(AMDVIState *s,
> > > CMDInvalIntrTable *inval)
> > > >          amdvi_log_illegalcom_error(s, inval->type, s->cmdbuf +
> > > s->cmdbuf_head);
> > > >          return;
> > > >      }
> > > > +
> > > > +    if (s->ir_cache) {
> > >
> > > Here, we notify IEC only if ir_cache == true, while...
> > >
> > > [...]
> > >
> > > > +static int amdvi_int_remap(X86IOMMUState *iommu, MSIMessage *src,
> > > > +                           MSIMessage *dst, uint16_t sid)
> > > > +{
> > > > +    trace_amdvi_ir_request(src->data, src->address, sid);
> > > > +
> > > > +    AMDVIState *s = AMD_IOMMU_DEVICE(iommu);
> > > > +    int ret = 0;
> > > > +    uint64_t dte[4];
> > > > +    uint32_t bitpos;
> > > > +    IRTE irte;
> > > > +
> > > > +    amdvi_get_dte(s, sid, dte);
> > > > +
> > > > +    /* interrupt remapping disabled */
> > > > +    if (!(dte[2] & AMDVI_IR_VALID)) {
> > > > +        memcpy(dst, src, sizeof(*src));
> > > > +        return ret;
> > > > +    }
> > > > +
> > > > +    ret = amdvi_irte_get(s, src, &irte, dte, sid);
> > > > +    if (ret < 0) {
> > > > +        goto no_remap;
> > > > +    }
> > > > +    switch (src->data & AMDVI_IR_TYPE_MASK) {
> > > > +    case AMDVI_MT_FIXED:
> > > > +    case AMDVI_MT_ARBIT:
> > > > +        ret = amdvi_remap_ir_intctl(dte[2], irte, src, dst);
> > > > +        if (ret < 0) {
> > > > +            goto no_remap;
> > > > +        } else {
> > > > +            s->ir_cache = true;
> > >
> > > Here we set it only if the interrupts are triggered.
> > >
> > > Shouldn't we notify IEC in all cases? Since the caches are set up
> > > during configuration, not the first time the interrupt is triggered,
> > > no?
> >
> >
> > I did have a problem with this. I don't know whether the Intel IOMMU
> > behaves the same way, but the AMD IOMMU invalidates the interrupt
> > cache for each and every device at boot (every possible device).
> > Having the cache invalidation trigger this many times bugs the guest
> > at boot. I was of the opinion that the caches would not contain
> > anything until translations actually happen.
>
> When we say cache here, we are mostly talking about GSI routes in
> the kernel, right? Since we still don't have any other kind of
> interrupt cache AFAIK. If so, GSI routes should already have been
> set up even if the interrupts have not been triggered a single time.
> So we need to invalidate them even if ir_cache == false.
>

You're right, but I'm not sure how to implement that without
triggering the notifier numerous pointless times during boot.


> I think the problem is why cache invalidations during boot will bug
> the system. Any clue?
>

The issue is not the invalidations themselves. I don't have a very
clear idea of how notifiers work, but I would assume they spawn a
thread or somehow use a multithreaded approach, which would mean that
triggering the notifier too many times within a very short period may
trigger a bunch of issues.


> >
> >
> > >
> >
> >
> > > > +            trace_amdvi_ir_remap(dst->data, dst->address, sid);
> > > > +            return ret;
> > > > +        }
> > > > +    /* not handling SMI currently */
> > > > +    case AMDVI_MT_SMI:
> > > > +        error_report("SMI interrupts not currently handled");
> > > > +        goto no_remap;
> > > > +    case AMDVI_MT_NMI:
> > > > +        bitpos = AMDVI_DTE_NMIPASS_LSHIFT;
> > > > +        break;
> > > > +    case AMDVI_MT_INIT:
> > > > +        bitpos = AMDVI_DTE_INTPASS_LSHIFT;
> > > > +        break;
> > > > +    case AMDVI_MT_EXTINT:
> > > > +        bitpos = AMDVI_DTE_EINTPASS_LSHIFT;
> > > > +        break;
> > > > +    case AMDVI_MT_LINT1:
> > > > +        bitpos = AMDVI_DTE_LINT1PASS_LSHIFT;
> > > > +        break;
> > > > +    case AMDVI_MT_LINT0:
> > > > +        bitpos = AMDVI_DTE_LINT0PASS_LSHIFT;
> > > > +        break;
> > > > +    default:
> > > > +        goto no_remap;
> > > > +    }
> > > > +
> > > > +    ret = amdvi_ir_handle_non_vectored(src, dst, bitpos, dte[2]);
> > > > +    if (ret < 0) {
> > > > +        goto no_remap;
> > > > +    }
> > > > +    s->ir_cache = true;
> > > > +    trace_amdvi_ir_remap(dst->data, dst->address, sid);
> > > > +    return ret;
> > > > +no_remap:
> > > > +    memcpy(dst, src, sizeof(*src));
> > >
> > > Shall we drop it and report the remapping failure in some way?
> > >
> > > I'm totally okay with just dropping it here, and we can do the
> > > reporting in the future. But reusing the old message is not a good
> > > idea, since if someone injects faulty interrupts, they will be
> > > delivered just as if there were no IOMMU. So we have no protection
> > > at all.
> > >
> >
> > Won't this get dropped based on the return value? I think the
> > 'memcpy' is not necessary, but my understanding is that KVM will
> > drop the translation based on the return value, no?
>
> Yeah you are right. Then I'll suggest something like:
>
>     no_remap:
>         memcpy(...);
>     remap_fail:
>         trace_...();
>         return ret;
>
> And we goto remap_fail when an error happens. That'll be cleaner to me,
> and after all we don't need to memcpy() if something failed.
>
> Thanks,
>
> -- peterx
>
Peter Xu Sept. 13, 2016, 7:38 a.m. UTC | #5
On Mon, Sep 12, 2016 at 03:45:48PM +0300, David Kiarie wrote:
> > When we say cache here, we are mostly talking about GSI routes in
> > the kernel, right? Since we still don't have any other kind of
> > interrupt cache AFAIK. If so, GSI routes should already have been
> > set up even if the interrupts have not been triggered a single time.
> > So we need to invalidate them even if ir_cache == false.
> >
> 
> You're right, but I'm not sure how to implement that without
> triggering the notifier numerous pointless times during boot.
> 
> 
> > I think the problem is why cache invalidations during boot will bug
> > the system. Any clue?
> >
> 
> The issue is not the invalidations themselves. I don't have a very
> clear idea of how notifiers work, but I would assume they spawn a
> thread or somehow use a multithreaded approach, which would mean that
> triggering the notifier too many times within a very short period may
> trigger a bunch of issues.

No thread is spawned; we just call the notifier callbacks.

For me it's fairly acceptable that the guest sends lots of
invalidations during boot. That should not lead to any functional
issues. If there are, then something might be wrong.
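
For reference, x86_iommu_iec_notify_all() is a plain synchronous walk
over the registered notifiers, along these lines (paraphrased from
hw/i386/x86-iommu.c, details may differ):

    void x86_iommu_iec_notify_all(X86IOMMUState *iommu, bool global,
                                  uint32_t index, uint32_t mask)
    {
        IEC_Notifier *notifier;

        QLIST_FOREACH(notifier, &iommu->iec_notifiers, list) {
            if (notifier->iec_notify) {
                notifier->iec_notify(notifier->private, global,
                                     index, mask);
            }
        }
    }

Each callback runs to completion in the caller's context; no threads
or queuing are involved.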

I don't know whether mst would like to merge this series even without
fixing this. Anyway, I would still prefer to root-cause the issue, or
at least call this out in the commit message (or in the code
somewhere) so that we know there is something TBD that might cause
mysterious trouble...

Thanks,

-- peterx
Michael S. Tsirkin Sept. 13, 2016, 2:11 p.m. UTC | #6
On Tue, Sep 13, 2016 at 03:38:38PM +0800, Peter Xu wrote:
> On Mon, Sep 12, 2016 at 03:45:48PM +0300, David Kiarie wrote:
> > > When we say cache here, we are mostly talking about GSI routes in
> > > the kernel, right? Since we still don't have any other kind of
> > > interrupt cache AFAIK. If so, GSI routes should already have been
> > > set up even if the interrupts have not been triggered a single
> > > time. So we need to invalidate them even if ir_cache == false.
> > >
> > 
> > You're right, but I'm not sure how to implement that without
> > triggering the notifier numerous pointless times during boot.
> > 
> > 
> > > I think the problem is why cache invalidations during boot will bug
> > > the system. Any clue?
> > >
> > 
> > The issue is not the invalidations themselves. I don't have a very
> > clear idea of how notifiers work, but I would assume they spawn a
> > thread or somehow use a multithreaded approach, which would mean
> > that triggering the notifier too many times within a very short
> > period may trigger a bunch of issues.
> 
> No thread is spawned; we just call the notifier callbacks.
>
> For me it's fairly acceptable that the guest sends lots of
> invalidations during boot. That should not lead to any functional
> issues. If there are, then something might be wrong.
>
> I don't know whether mst would like to merge this series even without
> fixing this. Anyway, I would still prefer to root-cause the issue, or
> at least call this out in the commit message (or in the code
> somewhere) so that we know there is something TBD that might cause
> mysterious trouble...
> 
> Thanks,
> 
> -- peterx

By now it's minimally intrusive, so yes, I think I'll merge
and we can apply fixes on top incrementally.
E.g. would you like to post the suggested comment as a patch?
Michael S. Tsirkin Sept. 13, 2016, 2:13 p.m. UTC | #7
On Tue, Sep 13, 2016 at 05:11:00PM +0300, Michael S. Tsirkin wrote:
> On Tue, Sep 13, 2016 at 03:38:38PM +0800, Peter Xu wrote:
> > On Mon, Sep 12, 2016 at 03:45:48PM +0300, David Kiarie wrote:
> > > > When we say cache here, we are mostly talking about GSI routes in
> > > > the kernel, right? Since we still don't have any other kind of
> > > > interrupt cache AFAIK. If so, GSI routes should already have been
> > > > set up even if the interrupts have not been triggered a single
> > > > time. So we need to invalidate them even if ir_cache == false.
> > > >
> > > 
> > > You're right, but I'm not sure how to implement that without
> > > triggering the notifier numerous pointless times during boot.
> > > 
> > > 
> > > > I think the problem is why cache invalidations during boot will bug
> > > > the system. Any clue?
> > > >
> > > 
> > > The issue is not the invalidations themselves. I don't have a
> > > very clear idea of how notifiers work, but I would assume they
> > > spawn a thread or somehow use a multithreaded approach, which
> > > would mean that triggering the notifier too many times within a
> > > very short period may trigger a bunch of issues.
> > 
> > No thread is spawned; we just call the notifier callbacks.
> >
> > For me it's fairly acceptable that the guest sends lots of
> > invalidations during boot. That should not lead to any functional
> > issues. If there are, then something might be wrong.
> >
> > I don't know whether mst would like to merge this series even
> > without fixing this. Anyway, I would still prefer to root-cause the
> > issue, or at least call this out in the commit message (or in the
> > code somewhere) so that we know there is something TBD that might
> > cause mysterious trouble...
> > 
> > Thanks,
> > 
> > -- peterx
> 
> By now it's minimally intrusive, so yes, I think I'll merge
> and we can apply fixes on top incrementally.
> E.g. would you like to post the suggested comment as a patch?
> 
> -- 
> MST

Sorry, in fact I was referring to the AMD IOMMU itself, while
posting in the int remapping thread. There will be more versions of
this one, so including a comment there directly is better.
David Kiarie Sept. 14, 2016, 10:12 a.m. UTC | #8
On Tue, Sep 13, 2016 at 10:38 AM, Peter Xu <peterx@redhat.com> wrote:

> On Mon, Sep 12, 2016 at 03:45:48PM +0300, David Kiarie wrote:
> > > When we say cache here, we are mostly talking about GSI routes in
> > > the kernel, right? Since we still don't have any other kind of
> > > interrupt cache AFAIK. If so, GSI routes should already have been
> > > set up even if the interrupts have not been triggered a single
> > > time. So we need to invalidate them even if ir_cache == false.
> > >
> >
> > You're right, but I'm not sure how to implement that without
> > triggering the notifier numerous pointless times during boot.
> >
> >
> > > I think the problem is why cache invalidations during boot will bug
> > > the system. Any clue?
> > >
> >
> > The issue is not the invalidations themselves. I don't have a very
> > clear idea of how notifiers work, but I would assume they spawn a
> > thread or somehow use a multithreaded approach, which would mean
> > that triggering the notifier too many times within a very short
> > period may trigger a bunch of issues.
>
> No thread is spawned; we just call the notifier callbacks.
>
> For me it's fairly acceptable that the guest sends lots of
> invalidations during boot. That should not lead to any functional
> issues. If there are, then something might be wrong.
>
> I don't know whether mst would like to merge this series even without
> fixing this. Anyway, I would still prefer to root-cause the issue, or
> at least call this out in the commit message (or in the code
> somewhere) so that we know there is something TBD that might cause
> mysterious trouble...
>

Unfortunately, as of recently I can't reproduce this issue. I am,
however, going to change the current code so that I invalidate the
kernel interrupt cache each time the AMD IOMMU issues an invalidation
command (hence getting rid of the ir_cache variable and its
dependency).
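
Against the hunk above, the planned change would look roughly like
this (illustrative only; the actual v5 may differ):

    @@ static void amdvi_inval_inttable(AMDVIState *s, CMDInvalIntrTable *inval)
             return;
         }

    -    if (s->ir_cache) {
    -        x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), true, 0, 0);
    -    }
    +    /* notify unconditionally: GSI routes are cached in the kernel
    +     * even before the first interrupt fires */
    +    x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), true, 0, 0);

         trace_amdvi_intr_inval();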

Thanks for taking the time to review this patchset; a new version is
coming up soon!


> Thanks,
>
> -- peterx
>

Patch

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 226fea5..76d3816 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -20,6 +20,7 @@ 
  * Cache implementation inspired by hw/i386/intel_iommu.c
  */
 #include "qemu/osdep.h"
+#include "qemu/error-report.h"
 #include "hw/i386/amd_iommu.h"
 #include "trace.h"
 
@@ -255,6 +256,31 @@  typedef struct QEMU_PACKED {
     uint32_t reserved_5:16;
 } CMDCompletePPR;
 
+typedef union IRTE {
+    struct {
+#ifdef HOST_WORDS_BIGENDIAN
+        uint32_t destination:8;
+        uint32_t rsvd_1:1;
+        uint32_t dm:1;
+        uint32_t rq_eoi:1;
+        uint32_t int_type:3;
+        uint32_t no_fault:1;
+        uint32_t valid:1;
+#else
+        uint32_t valid:1;
+        uint32_t no_fault:1;
+        uint32_t int_type:3;
+        uint32_t rq_eoi:1;
+        uint32_t dm:1;
+        uint32_t rsvd_1:1;
+        uint32_t destination:8;
+#endif
+        uint32_t vector:8;
+        uint32_t rsvd_2:8;
+    } bits;
+    uint32_t data;
+} IRTE;
+
 /* configure MMIO registers at startup/reset */
 static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
                            uint64_t romask, uint64_t w1cmask)
@@ -641,6 +667,11 @@  static void amdvi_inval_inttable(AMDVIState *s, CMDInvalIntrTable *inval)
         amdvi_log_illegalcom_error(s, inval->type, s->cmdbuf + s->cmdbuf_head);
         return;
     }
+
+    if (s->ir_cache) {
+        x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), true, 0, 0);
+    }
+
     trace_amdvi_intr_inval();
 }
 
@@ -1203,6 +1234,198 @@  static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr,
     return ret;
 }
 
+static inline int amdvi_ir_handle_non_vectored(MSIMessage *src,
+                                               MSIMessage *dst, uint8_t bitpos,
+                                               uint64_t dte)
+{
+    if ((dte & (1UL << bitpos))) {
+        /* passing interrupt enabled */
+        memcpy(dst, src, sizeof(*dst));
+    } else {
+        /* should be target aborted */
+        return -AMDVI_TARGET_ABORT;
+    }
+    return 0;
+}
+
+static int amdvi_remap_ir_intctl(uint64_t dte, IRTE irte,
+                                 MSIMessage *src, MSIMessage *dst)
+{
+    int ret = 0;
+
+    switch ((dte >> AMDVI_DTE_INTCTL_RSHIFT) & 3UL) {
+    case AMDVI_INTCTL_PASS:
+        /* pass */
+        memcpy(dst, src, sizeof(*dst));
+        break;
+    case AMDVI_INTCTL_REMAP:
+        /* remap */
+        if (irte.bits.valid) {
+            /* LOCAL APIC address */
+            dst->address = AMDVI_LOCAL_APIC_ADDR;
+            /* destination mode */
+            dst->address |= ((uint64_t)irte.bits.dm) <<
+                            AMDVI_MSI_ADDR_DM_RSHIFT;
+            /* RH */
+            dst->address |= ((uint64_t)irte.bits.rq_eoi) <<
+                            AMDVI_MSI_ADDR_RH_RSHIFT;
+            /* Destination ID */
+            dst->address |= ((uint64_t)irte.bits.destination) <<
+                            AMDVI_MSI_ADDR_DEST_RSHIFT;
+            /* construct data - vector */
+            dst->data |= irte.bits.vector;
+            /* Interrupt type */
+            dst->data |= ((uint64_t)irte.bits.int_type) <<
+                         AMDVI_MSI_DATA_DM_RSHIFT;
+        } else  {
+            ret = -AMDVI_TARGET_ABORT;
+        }
+        break;
+    case AMDVI_INTCTL_ABORT:
+    case AMDVI_INTCTL_RSVD:
+        ret = -AMDVI_TARGET_ABORT;
+    }
+    return ret;
+}
+
+static int amdvi_irte_get(AMDVIState *s, MSIMessage *src, IRTE *irte,
+                          uint64_t *dte, uint16_t devid)
+{
+    uint64_t irte_root, offset = devid * AMDVI_DEVTAB_ENTRY_SIZE,
+             ir_table_size;
+
+    irte_root = dte[2] & AMDVI_IRTEROOT_MASK;
+    offset = (src->data & AMDVI_IRTE_INDEX_MASK) << 2;
+    ir_table_size = 1UL << (dte[2] & AMDVI_IR_TABLE_SIZE_MASK);
+    /* enforce IR table size */
+    if (offset > (ir_table_size * AMDVI_DEFAULT_IRTE_SIZE)) {
+        trace_amdvi_invalid_irte_entry(offset, ir_table_size);
+        return -AMDVI_TARGET_ABORT;
+    }
+    /* read IRTE */
+    if (dma_memory_read(&address_space_memory, irte_root + offset,
+        irte, sizeof(*irte))) {
+        trace_amdvi_irte_get_fail(irte_root, offset);
+        return -AMDVI_DEV_TAB_HW;
+    }
+    return 0;
+}
+
+static int amdvi_int_remap(X86IOMMUState *iommu, MSIMessage *src,
+                           MSIMessage *dst, uint16_t sid)
+{
+    trace_amdvi_ir_request(src->data, src->address, sid);
+
+    AMDVIState *s = AMD_IOMMU_DEVICE(iommu);
+    int ret = 0;
+    uint64_t dte[4];
+    uint32_t bitpos;
+    IRTE irte;
+
+    amdvi_get_dte(s, sid, dte);
+
+    /* interrupt remapping disabled */
+    if (!(dte[2] & AMDVI_IR_VALID)) {
+        memcpy(dst, src, sizeof(*src));
+        return ret;
+    }
+
+    ret = amdvi_irte_get(s, src, &irte, dte, sid);
+    if (ret < 0) {
+        goto no_remap;
+    }
+    switch (src->data & AMDVI_IR_TYPE_MASK) {
+    case AMDVI_MT_FIXED:
+    case AMDVI_MT_ARBIT:
+        ret = amdvi_remap_ir_intctl(dte[2], irte, src, dst);
+        if (ret < 0) {
+            goto no_remap;
+        } else {
+            s->ir_cache = true;
+            trace_amdvi_ir_remap(dst->data, dst->address, sid);
+            return ret;
+        }
+    /* not handling SMI currently */
+    case AMDVI_MT_SMI:
+        error_report("SMI interrupts not currently handled");
+        goto no_remap;
+    case AMDVI_MT_NMI:
+        bitpos = AMDVI_DTE_NMIPASS_LSHIFT;
+        break;
+    case AMDVI_MT_INIT:
+        bitpos = AMDVI_DTE_INTPASS_LSHIFT;
+        break;
+    case AMDVI_MT_EXTINT:
+        bitpos = AMDVI_DTE_EINTPASS_LSHIFT;
+        break;
+    case AMDVI_MT_LINT1:
+        bitpos = AMDVI_DTE_LINT1PASS_LSHIFT;
+        break;
+    case AMDVI_MT_LINT0:
+        bitpos = AMDVI_DTE_LINT0PASS_LSHIFT;
+        break;
+    default:
+        goto no_remap;
+    }
+
+    ret = amdvi_ir_handle_non_vectored(src, dst, bitpos, dte[2]);
+    if (ret < 0) {
+        goto no_remap;
+    }
+    s->ir_cache = true;
+    trace_amdvi_ir_remap(dst->data, dst->address, sid);
+    return ret;
+no_remap:
+    memcpy(dst, src, sizeof(*src));
+    trace_amdvi_ir_target_abort(dst->data, dst->address, sid);
+    return ret;
+}
+
+static MemTxResult amdvi_ir_read(void *opaque, hwaddr addr,
+                                 uint64_t *data, unsigned size,
+                                 MemTxAttrs attrs)
+{
+    return MEMTX_OK;
+}
+
+static MemTxResult amdvi_ir_write(void *opaque, hwaddr addr, uint64_t val,
+                                  unsigned size, MemTxAttrs attrs)
+{
+    AMDVIAddressSpace *as = opaque;
+    MSIMessage from = { addr + AMDVI_INT_ADDR_FIRST, val }, to = { 0, 0};
+    int ret = 0;
+
+    ret = amdvi_int_remap(X86_IOMMU_DEVICE(as->iommu_state), &from, &to,
+                           attrs.requester_id);
+
+    if (ret < 0) {
+        trace_amdvi_ir_target_abort(from.data, from.address,
+                                    attrs.requester_id);
+        return MEMTX_ERROR;
+    }
+
+    if (dma_memory_write(&address_space_memory, to.address, &to.data, size)) {
+        trace_amdvi_ir_write_fail(to.address, to.data);
+        return MEMTX_ERROR;
+    }
+
+    return MEMTX_OK;
+}
+
+static const MemoryRegionOps amdvi_ir_ops = {
+    .read_with_attrs = amdvi_ir_read,
+    .write_with_attrs = amdvi_ir_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    }
+};
+
 static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
 {
     AMDVIState *s = opaque;
@@ -1226,6 +1449,12 @@  static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
 
         memory_region_init_iommu(&iommu_as[devfn]->iommu, OBJECT(s),
                                  &s->iommu_ops, "amd-iommu", UINT64_MAX);
+        memory_region_init_io(&iommu_as[devfn]->iommu_ir, OBJECT(s),
+                              &amdvi_ir_ops, iommu_as[devfn], "amd-iommu-ir",
+                              AMDVI_INT_ADDR_SIZE);
+        memory_region_add_subregion(&iommu_as[devfn]->iommu,
+                                    AMDVI_INT_ADDR_FIRST,
+                                    &iommu_as[devfn]->iommu_ir);
         address_space_init(&iommu_as[devfn]->as, &iommu_as[devfn]->iommu,
                            "amd-iommu");
     }
@@ -1274,6 +1503,7 @@  static void amdvi_init(AMDVIState *s)
     s->enabled = false;
     s->ats_enabled = false;
     s->cmdbuf_enabled = false;
+    s->ir_cache = false;
 
     /* reset MMIO */
     memset(s->mmior, 0, AMDVI_MMIO_SIZE);
@@ -1313,11 +1543,15 @@  static void amdvi_realize(DeviceState *dev, Error **err)
     AMDVIState *s = AMD_IOMMU_DEVICE(dev);
     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
     PCIBus *bus = PC_MACHINE(qdev_get_machine())->bus;
+    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
     s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
                                      amdvi_uint64_equal, g_free, g_free);
 
-    /* This device should take care of IOMMU PCI properties */
+    /* AMD IOMMU has Interrupt Remapping on by default */
+    x86_iommu->intr_supported = true;
     x86_iommu->type = TYPE_AMD;
+
+    /* This device should take care of IOMMU PCI properties */
     qdev_set_parent_bus(DEVICE(&s->pci), &bus->qbus);
     object_property_set_bool(OBJECT(&s->pci), true, "realized", err);
     s->capab_offset = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0,
@@ -1329,9 +1563,13 @@  static void amdvi_realize(DeviceState *dev, Error **err)
     memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio",
                           AMDVI_MMIO_SIZE);
 
+    x86_iommu->ioapic_bdf = PCI_BUILD_BDF(AMDVI_BUS_NUM,
+             AMDVI_SB_IOAPIC_ID);
+
     sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
     sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
     pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
+    pcms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_SB_IOAPIC_ID);
     s->devid = object_property_get_int(OBJECT(&s->pci), "addr", err);
     msi_init(&s->pci.dev, 0, 1, true, false, err);
     amdvi_init(s);
@@ -1358,6 +1596,7 @@  static void amdvi_class_init(ObjectClass *klass, void* data)
     dc->vmsd = &vmstate_amdvi;
     dc->hotpluggable = false;
     dc_class->realize = amdvi_realize;
+    dc_class->int_remap = amdvi_int_remap;
 }
 
 static const TypeInfo amdvi = {
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index 5c4a13b..28b9603 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -235,7 +235,7 @@ 
 
 #define AMDVI_BUS_NUM                  0x0
 /* AMD-Vi specific IOAPIC Device function */
-#define AMDVI_DEVFN_IOAPIC             0xa0
+#define AMDVI_SB_IOAPIC_ID            0xa0
 
 #define AMDVI_LOCAL_APIC_ADDR     0xfee00000
 
@@ -338,6 +338,8 @@  typedef struct AMDVIState {
     uint32_t evtlog_len;         /* event log length             */
     uint32_t evtlog_head;        /* current IOMMU write position */
     uint32_t evtlog_tail;        /* current Software read position */
+    /* whether we have remapped any interrupts and hence have an IR cache */
+    bool ir_cache;
 
     /* unused for now */
     hwaddr excl_base;            /* base DVA - IOMMU exclusion range */
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index f2d4c15..3fefe4a 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -412,7 +412,6 @@  static void ioapic_machine_done_notify(Notifier *notifier, void *data)
             }
             kvm_irqchip_commit_routes(kvm_state);
         }
-
     }
 #endif
 }