diff mbox series

[RFC,v1,2/3] intel_iommu: add 256 bits qi_desc support

Message ID 1548824953-23413-3-git-send-email-yi.y.sun@linux.intel.com
State New
Headers show
Series intel_iommu: support scalable mode | expand

Commit Message

Yi Sun Jan. 30, 2019, 5:09 a.m. UTC
From: "Liu, Yi L" <yi.l.liu@intel.com>

Per Intel(R) VT-d 3.0, the qi_desc is 256 bits in Scalable
Mode. This patch adds emulation of 256bits qi_desc.

[Yi Sun is co-developer to rebase and refine the patch.]
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Liu, Yi L <yi.l.liu@intel.com>
---
 hw/i386/intel_iommu.c          | 182 +++++++++++++++++++++++++----------------
 hw/i386/intel_iommu_internal.h |   8 +-
 include/hw/i386/intel_iommu.h  |   1 +
 3 files changed, 116 insertions(+), 75 deletions(-)

Comments

Peter Xu Feb. 12, 2019, 6:27 a.m. UTC | #1
On Wed, Jan 30, 2019 at 01:09:12PM +0800, Yi Sun wrote:
> From: "Liu, Yi L" <yi.l.liu@intel.com>
> 
> Per Intel(R) VT-d 3.0, the qi_desc is 256 bits in Scalable
> Mode. This patch adds emulation of 256bits qi_desc.
> 
> [Yi Sun is co-developer to rebase and refine the patch.]
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Liu, Yi L <yi.l.liu@intel.com>

Same here; I think you should have your s-o-b last. ;)

> ---
>  hw/i386/intel_iommu.c          | 182 +++++++++++++++++++++++++----------------
>  hw/i386/intel_iommu_internal.h |   8 +-
>  include/hw/i386/intel_iommu.h  |   1 +
>  3 files changed, 116 insertions(+), 75 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 396ac8e..3664a00 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -38,6 +38,7 @@
>  #include "trace.h"
>  
>  #define vtd_devfn_check(devfn) ((devfn & VTD_DEVFN_CHECK_MASK) ? true : false)
> +#define vtd_ecap_smts(s) ((s)->ecap & VTD_ECAP_SMTS)

I'd prefer capital letters for macros.  Your call.

>  
>  /* context entry operations */
>  #define vtd_get_ce_size(s, ce) \
> @@ -65,6 +66,9 @@
>  #define vtd_pe_get_slpt_base(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_SLPTPTR)
>  #define vtd_pe_get_domain_id(pe) VTD_SM_PASID_ENTRY_DID((pe)->val[1])
>  
> +/* invalidation desc */
> +#define vtd_get_inv_desc_width(s) ((s)->iq_dw ? 32 : 16)

Nit: I'll prefer dropping all the "get" wordings in these macros to be
"vtd_inv_desc_width" since that "get" doesn't help much on
understanding its meanings.  But it's personal preference too.

And since you already have the iq_dw variable - why not store the
width directly in it?  A uint8_t would suffice.

> +
>  static void vtd_address_space_refresh_all(IntelIOMMUState *s);
>  static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
>  
> @@ -1759,6 +1763,11 @@ static void vtd_root_table_setup(IntelIOMMUState *s)
>      s->root_scalable = s->root & VTD_RTADDR_SMT;
>      s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
>  
> +    /* if Scalable mode is not enabled, enforce iq_dw to be 16 byte */
> +    if (!s->root_scalable) {
> +        s->iq_dw = 0;

This is ok but I feel it a bit awkward to change IQ setup specifically
in root table setup.  You can simply do this:

- in vtd_init(), always set iq_dw=16.  This will cover system resets
  or IOMMU device reset, then

- only update iq_dw to 32 in vtd_mem_write() where guest specified

> +    }
> +
>      trace_vtd_reg_dmar_root(s->root, s->root_extended);
>  }
>  
> @@ -2052,7 +2061,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
>      if (en) {
>          s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
>          /* 2^(x+8) entries */
> -        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
> +        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
>          s->qi_enabled = true;
>          trace_vtd_inv_qi_setup(s->iq, s->iq_size);
>          /* Ok - report back to driver */
> @@ -2219,54 +2228,66 @@ static void vtd_handle_iotlb_write(IntelIOMMUState *s)
>  }
>  
>  /* Fetch an Invalidation Descriptor from the Invalidation Queue */
> -static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
> +static bool vtd_get_inv_desc(IntelIOMMUState *s,
>                               VTDInvDesc *inv_desc)
>  {
> -    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
> -    if (dma_memory_read(&address_space_memory, addr, inv_desc,
> -        sizeof(*inv_desc))) {
> -        error_report_once("Read INV DESC failed");
> -        inv_desc->lo = 0;
> -        inv_desc->hi = 0;
> +    dma_addr_t base_addr = s->iq;
> +    uint32_t offset = s->iq_head;
> +    uint32_t dw = vtd_get_inv_desc_width(s);
> +    dma_addr_t addr = base_addr + offset * dw;
> +
> +    /* init */
> +    inv_desc->val[0] = 0;
> +    inv_desc->val[1] = 0;
> +    inv_desc->val[2] = 0;
> +    inv_desc->val[3] = 0;

No need?

> +
> +    if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) {
> +        error_report_once("Read INV DESC failed.");
>          return false;
>      }
> -    inv_desc->lo = le64_to_cpu(inv_desc->lo);
> -    inv_desc->hi = le64_to_cpu(inv_desc->hi);
> +    inv_desc->val[0] = le64_to_cpu(inv_desc->val[0]);
> +    inv_desc->val[1] = le64_to_cpu(inv_desc->val[1]);
> +    inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
> +    inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);

Similar comments here on VTDInvDesc - you can keep the hi/lo fields,
but instead you can define the val[4] into the union too.  Then:

  if (dw == 16) {
    inv_desc->lo = ...
    inv_desc->hi = ...
  } else {
    inv_desc->val[0] = ...
    inv_desc->val[1] = ...
    inv_desc->val[2] = ...
    inv_desc->val[3] = ...
  }

Then this patch can be greatly simplified too since you don't need to
switch VTDInvDesc.{lo|hi} to val[0|1].

[...]

> @@ -2525,7 +2558,7 @@ static void vtd_handle_iqt_write(IntelIOMMUState *s)
>  {
>      uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
>  
> -    s->iq_tail = VTD_IQT_QT(val);
> +    s->iq_tail = VTD_IQT_QT(s->iq_dw, val);

You can check against bit 4 when iq_dw==32 and report error otherwise.
That's explicitly mentioned by the spec (chap 10.4.22).

>      trace_vtd_inv_qi_tail(s->iq_tail);
>  
>      if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {

[...]


Regards,
Yi Sun Feb. 13, 2019, 9 a.m. UTC | #2
On 19-02-12 14:27:28, Peter Xu wrote:
> On Wed, Jan 30, 2019 at 01:09:12PM +0800, Yi Sun wrote:
> > From: "Liu, Yi L" <yi.l.liu@intel.com>
> > 
> > Per Intel(R) VT-d 3.0, the qi_desc is 256 bits in Scalable
> > Mode. This patch adds emulation of 256bits qi_desc.
> > 
> > [Yi Sun is co-developer to rebase and refine the patch.]
> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > Signed-off-by: Liu, Yi L <yi.l.liu@intel.com>
> 
> Same here; I think you should have your s-o-b last. ;)
> 
Sure.

> > ---
> >  hw/i386/intel_iommu.c          | 182 +++++++++++++++++++++++++----------------
> >  hw/i386/intel_iommu_internal.h |   8 +-
> >  include/hw/i386/intel_iommu.h  |   1 +
> >  3 files changed, 116 insertions(+), 75 deletions(-)
> > 
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 396ac8e..3664a00 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -38,6 +38,7 @@
> >  #include "trace.h"
> >  
> >  #define vtd_devfn_check(devfn) ((devfn & VTD_DEVFN_CHECK_MASK) ? true : false)
> > +#define vtd_ecap_smts(s) ((s)->ecap & VTD_ECAP_SMTS)
> 
> I'd prefer capital letters for macros.  Your call.
> 
Will change them.

> >  
> >  /* context entry operations */
> >  #define vtd_get_ce_size(s, ce) \
> > @@ -65,6 +66,9 @@
> >  #define vtd_pe_get_slpt_base(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_SLPTPTR)
> >  #define vtd_pe_get_domain_id(pe) VTD_SM_PASID_ENTRY_DID((pe)->val[1])
> >  
> > +/* invalidation desc */
> > +#define vtd_get_inv_desc_width(s) ((s)->iq_dw ? 32 : 16)
> 
> Nit: I'll prefer dropping all the "get" wordings in these macros to be
> "vtd_inv_desc_width" since that "get" doesn't help much on
> understanding its meanings.  But it's personal preference too.
> 
That is fine.

> And since you've already have the iq_dw variable - why not store the
> width directly into it?  An uint8_t would suffice.
> 
iq_dw corresponds to VTD_IQA_DW_MASK (Descriptor Width defined in IQA
register). 1 means 256-bit descriptor, 0 means 128-bit descriptor.

It is also used in vtd_handle_gcmd_qie() and VTD_IQT_QT() by checking if
its value is 1.

So, I would prefer to keep the original design.

> > +
> >  static void vtd_address_space_refresh_all(IntelIOMMUState *s);
> >  static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
> >  
> > @@ -1759,6 +1763,11 @@ static void vtd_root_table_setup(IntelIOMMUState *s)
> >      s->root_scalable = s->root & VTD_RTADDR_SMT;
> >      s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
> >  
> > +    /* if Scalable mode is not enabled, enforce iq_dw to be 16 byte */
> > +    if (!s->root_scalable) {
> > +        s->iq_dw = 0;
> 
> This is ok but I feel it a bit awkward to change IQ setup specifically
> in root table setup.  You can simply do this:
> 
> - in vtd_init(), always set iq_dw=16.  This will cover system resets
>   or IOMMU device reset, then
> 
Per above explanation, I can make iq_dw=0 in vtd_init().

> - only update iq_dw to 32 in vtd_mem_write() where guest specified
> 
May add check of s->root_scalable in vtd_mem_write().

> > +    }
> > +
> >      trace_vtd_reg_dmar_root(s->root, s->root_extended);
> >  }
> >  
> > @@ -2052,7 +2061,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
> >      if (en) {
> >          s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
> >          /* 2^(x+8) entries */
> > -        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
> > +        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
> >          s->qi_enabled = true;
> >          trace_vtd_inv_qi_setup(s->iq, s->iq_size);
> >          /* Ok - report back to driver */
> > @@ -2219,54 +2228,66 @@ static void vtd_handle_iotlb_write(IntelIOMMUState *s)
> >  }
> >  
> >  /* Fetch an Invalidation Descriptor from the Invalidation Queue */
> > -static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
> > +static bool vtd_get_inv_desc(IntelIOMMUState *s,
> >                               VTDInvDesc *inv_desc)
> >  {
> > -    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
> > -    if (dma_memory_read(&address_space_memory, addr, inv_desc,
> > -        sizeof(*inv_desc))) {
> > -        error_report_once("Read INV DESC failed");
> > -        inv_desc->lo = 0;
> > -        inv_desc->hi = 0;
> > +    dma_addr_t base_addr = s->iq;
> > +    uint32_t offset = s->iq_head;
> > +    uint32_t dw = vtd_get_inv_desc_width(s);
> > +    dma_addr_t addr = base_addr + offset * dw;
> > +
> > +    /* init */
> > +    inv_desc->val[0] = 0;
> > +    inv_desc->val[1] = 0;
> > +    inv_desc->val[2] = 0;
> > +    inv_desc->val[3] = 0;
> 
> No need?
> 
This is necessary. Per my test, the val[] are not 0 by default. That
causes a bug.

> > +
> > +    if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) {
> > +        error_report_once("Read INV DESC failed.");
> >          return false;
> >      }
> > -    inv_desc->lo = le64_to_cpu(inv_desc->lo);
> > -    inv_desc->hi = le64_to_cpu(inv_desc->hi);
> > +    inv_desc->val[0] = le64_to_cpu(inv_desc->val[0]);
> > +    inv_desc->val[1] = le64_to_cpu(inv_desc->val[1]);
> > +    inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
> > +    inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
> 
> Similar comments here on VTDInvDesc - you can keep the hi/lo fields,
> but instead you can define the val[4] into the union too.  Then:
> 
>   if (dw == 16) {
>     inv_desc->lo = ...
>     inv_desc->hi = ...
>   } else {
>     inv_desc->val[0] = ...
>     inv_desc->val[1] = ...
>     inv_desc->val[2] = ...
>     inv_desc->val[3] = ...
>   }
> 
> Then this patch can be greatly simplified too since you don't need to
> switch VTDInvDesc.{lo|hi} to val[0|1].
> 
Ok, I will consider it.

> [...]
> 
> > @@ -2525,7 +2558,7 @@ static void vtd_handle_iqt_write(IntelIOMMUState *s)
> >  {
> >      uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
> >  
> > -    s->iq_tail = VTD_IQT_QT(val);
> > +    s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
> 
> You can check against bit 4 when iq_dw==32 and report error otherwise.
> That's explicitly mentioned by the spec (chap 10.4.22).
> 
OK, will add this check. Thanks!

> >      trace_vtd_inv_qi_tail(s->iq_tail);
> >  
> >      if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
> 
> [...]
> 
> 
> Regards,
> 
> -- 
> Peter Xu
Peter Xu Feb. 13, 2019, 10:42 a.m. UTC | #3
On Wed, Feb 13, 2019 at 05:00:41PM +0800, Yi Sun wrote:

[...]

> > >  
> > >  /* context entry operations */
> > >  #define vtd_get_ce_size(s, ce) \
> > > @@ -65,6 +66,9 @@
> > >  #define vtd_pe_get_slpt_base(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_SLPTPTR)
> > >  #define vtd_pe_get_domain_id(pe) VTD_SM_PASID_ENTRY_DID((pe)->val[1])
> > >  
> > > +/* invalidation desc */
> > > +#define vtd_get_inv_desc_width(s) ((s)->iq_dw ? 32 : 16)
> > 
> > Nit: I'll prefer dropping all the "get" wordings in these macros to be
> > "vtd_inv_desc_width" since that "get" doesn't help much on
> > understanding its meanings.  But it's personal preference too.
> > 
> That is fine.
> 
> > And since you've already have the iq_dw variable - why not store the
> > width directly into it?  An uint8_t would suffice.
> > 
> iq_dw corresponds to VTD_IQA_DW_MASK (Descriptor Width defined in IQA
> register). 1 means 256-bit descriptor, 0 means 128-bit descriptor.
> 
> It is also used in vtd_handle_gcmd_qie() and VTD_IQT_QT() by checking if
> its value is 1.
> 
> So, I would prefer to keep the original design.

It's ok.   But please make it a boolean.  Now iq_dw can be 0x800.

[...]

> > >  /* Fetch an Invalidation Descriptor from the Invalidation Queue */
> > > -static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
> > > +static bool vtd_get_inv_desc(IntelIOMMUState *s,
> > >                               VTDInvDesc *inv_desc)
> > >  {
> > > -    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
> > > -    if (dma_memory_read(&address_space_memory, addr, inv_desc,
> > > -        sizeof(*inv_desc))) {
> > > -        error_report_once("Read INV DESC failed");
> > > -        inv_desc->lo = 0;
> > > -        inv_desc->hi = 0;
> > > +    dma_addr_t base_addr = s->iq;
> > > +    uint32_t offset = s->iq_head;
> > > +    uint32_t dw = vtd_get_inv_desc_width(s);
> > > +    dma_addr_t addr = base_addr + offset * dw;
> > > +
> > > +    /* init */
> > > +    inv_desc->val[0] = 0;
> > > +    inv_desc->val[1] = 0;
> > > +    inv_desc->val[2] = 0;
> > > +    inv_desc->val[3] = 0;
> > 
> > No need?
> > 
> This is necessary. Per my test, the val[] are not 0 by default.

I agree, it's a stack variable. However...

> That makes bug happen.

... could you explain the bug?

Regards,
Yi Sun Feb. 14, 2019, 1:52 a.m. UTC | #4
On 19-02-13 18:42:24, Peter Xu wrote:
> On Wed, Feb 13, 2019 at 05:00:41PM +0800, Yi Sun wrote:
> 
> [...]
> 
> > > >  
> > > >  /* context entry operations */
> > > >  #define vtd_get_ce_size(s, ce) \
> > > > @@ -65,6 +66,9 @@
> > > >  #define vtd_pe_get_slpt_base(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_SLPTPTR)
> > > >  #define vtd_pe_get_domain_id(pe) VTD_SM_PASID_ENTRY_DID((pe)->val[1])
> > > >  
> > > > +/* invalidation desc */
> > > > +#define vtd_get_inv_desc_width(s) ((s)->iq_dw ? 32 : 16)
> > > 
> > > Nit: I'll prefer dropping all the "get" wordings in these macros to be
> > > "vtd_inv_desc_width" since that "get" doesn't help much on
> > > understanding its meanings.  But it's personal preference too.
> > > 
> > That is fine.
> > 
> > > And since you've already have the iq_dw variable - why not store the
> > > width directly into it?  An uint8_t would suffice.
> > > 
> > iq_dw corresponds to VTD_IQA_DW_MASK (Descriptor Width defined in IQA
> > register). 1 means 256-bit descriptor, 0 means 128-bit descriptor.
> > 
> > It is also used in vtd_handle_gcmd_qie() and VTD_IQT_QT() by checking if
> > its value is 1.
> > 
> > So, I would prefer to keep the original design.
> 
> It's ok.   But please make it a boolean.  Now iq_dw can be 0x800.
> 
Sure.

> [...]
> 
> > > >  /* Fetch an Invalidation Descriptor from the Invalidation Queue */
> > > > -static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
> > > > +static bool vtd_get_inv_desc(IntelIOMMUState *s,
> > > >                               VTDInvDesc *inv_desc)
> > > >  {
> > > > -    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
> > > > -    if (dma_memory_read(&address_space_memory, addr, inv_desc,
> > > > -        sizeof(*inv_desc))) {
> > > > -        error_report_once("Read INV DESC failed");
> > > > -        inv_desc->lo = 0;
> > > > -        inv_desc->hi = 0;
> > > > +    dma_addr_t base_addr = s->iq;
> > > > +    uint32_t offset = s->iq_head;
> > > > +    uint32_t dw = vtd_get_inv_desc_width(s);
> > > > +    dma_addr_t addr = base_addr + offset * dw;
> > > > +
> > > > +    /* init */
> > > > +    inv_desc->val[0] = 0;
> > > > +    inv_desc->val[1] = 0;
> > > > +    inv_desc->val[2] = 0;
> > > > +    inv_desc->val[3] = 0;
> > > 
> > > No need?
> > > 
> > This is necessary. Per my test, the val[] are not 0 by default.
> 
> I agree, it's a stack variable. However...
> 
> > That makes bug happen.
> 
> ... could you explain the bug?
> 
Below error can be observed.

qemu-system-x86_64: vtd_process_inv_desc: invalid inv desc: val[3]=10, val[2]=0 (detect reserve non-zero)

> Regards,
> 
> -- 
> Peter Xu
Peter Xu Feb. 14, 2019, 3:24 a.m. UTC | #5
On Thu, Feb 14, 2019 at 09:52:04AM +0800, Yi Sun wrote:

[...]

> > > > >  /* Fetch an Invalidation Descriptor from the Invalidation Queue */
> > > > > -static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
> > > > > +static bool vtd_get_inv_desc(IntelIOMMUState *s,
> > > > >                               VTDInvDesc *inv_desc)
> > > > >  {
> > > > > -    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
> > > > > -    if (dma_memory_read(&address_space_memory, addr, inv_desc,
> > > > > -        sizeof(*inv_desc))) {
> > > > > -        error_report_once("Read INV DESC failed");
> > > > > -        inv_desc->lo = 0;
> > > > > -        inv_desc->hi = 0;
> > > > > +    dma_addr_t base_addr = s->iq;
> > > > > +    uint32_t offset = s->iq_head;
> > > > > +    uint32_t dw = vtd_get_inv_desc_width(s);
> > > > > +    dma_addr_t addr = base_addr + offset * dw;
> > > > > +
> > > > > +    /* init */
> > > > > +    inv_desc->val[0] = 0;
> > > > > +    inv_desc->val[1] = 0;
> > > > > +    inv_desc->val[2] = 0;
> > > > > +    inv_desc->val[3] = 0;
> > > > 
> > > > No need?
> > > > 
> > > This is necessary. Per my test, the val[] are not 0 by default.
> > 
> > I agree, it's a stack variable. However...
> > 
> > > That makes bug happen.
> > 
> > ... could you explain the bug?
> > 
> Below error can be observed.
> 
> qemu-system-x86_64: vtd_process_inv_desc: invalid inv desc: val[3]=10, val[2]=0 (detect reserve non-zero)

Ok so you're checking val[2] & val[3] unconditionally:

    if (inv_desc.val[3] || inv_desc.val[2]) {
        error_report_once("%s: invalid inv desc: val[3]=%"PRIx64
                          ", val[2]=%"PRIx64
                          " (detect reserve non-zero)", __func__,
                          inv_desc.val[3],
                          inv_desc.val[2]);
        return false;
    }

Why?  Shouldn't they invalid if inv desc width is 128bits?

When 256 bits invalidation descriptor is used, the guest driver
should be responsible to fill in zeros into reserved fields.

Another question: is val[2] & val[3] used in any place even with
256bits mode?  From what I see from the spec (chap 6.5.2), all of them
seems to be reserved as zeros, then I don't understand why bother
extending this to 256bits...  Did I miss something?

Regards,
Yi Sun Feb. 14, 2019, 6:27 a.m. UTC | #6
On 19-02-14 11:24:35, Peter Xu wrote:
> On Thu, Feb 14, 2019 at 09:52:04AM +0800, Yi Sun wrote:
> 
> [...]
> 
> > > > > >  /* Fetch an Invalidation Descriptor from the Invalidation Queue */
> > > > > > -static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
> > > > > > +static bool vtd_get_inv_desc(IntelIOMMUState *s,
> > > > > >                               VTDInvDesc *inv_desc)
> > > > > >  {
> > > > > > -    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
> > > > > > -    if (dma_memory_read(&address_space_memory, addr, inv_desc,
> > > > > > -        sizeof(*inv_desc))) {
> > > > > > -        error_report_once("Read INV DESC failed");
> > > > > > -        inv_desc->lo = 0;
> > > > > > -        inv_desc->hi = 0;
> > > > > > +    dma_addr_t base_addr = s->iq;
> > > > > > +    uint32_t offset = s->iq_head;
> > > > > > +    uint32_t dw = vtd_get_inv_desc_width(s);
> > > > > > +    dma_addr_t addr = base_addr + offset * dw;
> > > > > > +
> > > > > > +    /* init */
> > > > > > +    inv_desc->val[0] = 0;
> > > > > > +    inv_desc->val[1] = 0;
> > > > > > +    inv_desc->val[2] = 0;
> > > > > > +    inv_desc->val[3] = 0;
> > > > > 
> > > > > No need?
> > > > > 
> > > > This is necessary. Per my test, the val[] are not 0 by default.
> > > 
> > > I agree, it's a stack variable. However...
> > > 
> > > > That makes bug happen.
> > > 
> > > ... could you explain the bug?
> > > 
> > Below error can be observed.
> > 
> > qemu-system-x86_64: vtd_process_inv_desc: invalid inv desc: val[3]=10, val[2]=0 (detect reserve non-zero)
> 
> Ok so you're checking val[2] & val[3] unconditionally:
> 
>     if (inv_desc.val[3] || inv_desc.val[2]) {
>         error_report_once("%s: invalid inv desc: val[3]=%"PRIx64
>                           ", val[2]=%"PRIx64
>                           " (detect reserve non-zero)", __func__,
>                           inv_desc.val[3],
>                           inv_desc.val[2]);
>         return false;
>     }
> 
> Why?  Shouldn't they invalid if inv desc width is 128bits?
> 
Here, my intention is to simplify the code. If inv_desc.val[] has been
initialized to 0, the error will not happen whether the width is 128 bits or 256 bits.

> When 256 bits invalidation descriptor is used, the guest driver
> should be responsible to fill in zeros into reserved fields.
> 
> Another question: is val[2] & val[3] used in any place even with
> 256bits mode?  From what I see from the spec (chap 6.5.2), all of them
> seems to be reserved as zeros, then I don't understand why bother
> extending this to 256bits...  Did I miss something?
> 
Although the high 128 bits are not used now, they are defined by the spec,
so we should handle them. At least we can detect an error by checking
that the high 128 bits are 0. Furthermore, handling them in the code
now can avoid some changes in the future.

> Regards,
> 
> -- 
> Peter Xu
Peter Xu Feb. 14, 2019, 7:13 a.m. UTC | #7
On Thu, Feb 14, 2019 at 02:27:30PM +0800, Yi Sun wrote:
> On 19-02-14 11:24:35, Peter Xu wrote:
> > On Thu, Feb 14, 2019 at 09:52:04AM +0800, Yi Sun wrote:
> > 
> > [...]
> > 
> > > > > > >  /* Fetch an Invalidation Descriptor from the Invalidation Queue */
> > > > > > > -static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
> > > > > > > +static bool vtd_get_inv_desc(IntelIOMMUState *s,
> > > > > > >                               VTDInvDesc *inv_desc)
> > > > > > >  {
> > > > > > > -    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
> > > > > > > -    if (dma_memory_read(&address_space_memory, addr, inv_desc,
> > > > > > > -        sizeof(*inv_desc))) {
> > > > > > > -        error_report_once("Read INV DESC failed");
> > > > > > > -        inv_desc->lo = 0;
> > > > > > > -        inv_desc->hi = 0;
> > > > > > > +    dma_addr_t base_addr = s->iq;
> > > > > > > +    uint32_t offset = s->iq_head;
> > > > > > > +    uint32_t dw = vtd_get_inv_desc_width(s);
> > > > > > > +    dma_addr_t addr = base_addr + offset * dw;
> > > > > > > +
> > > > > > > +    /* init */
> > > > > > > +    inv_desc->val[0] = 0;
> > > > > > > +    inv_desc->val[1] = 0;
> > > > > > > +    inv_desc->val[2] = 0;
> > > > > > > +    inv_desc->val[3] = 0;
> > > > > > 
> > > > > > No need?
> > > > > > 
> > > > > This is necessary. Per my test, the val[] are not 0 by default.
> > > > 
> > > > I agree, it's a stack variable. However...
> > > > 
> > > > > That makes bug happen.
> > > > 
> > > > ... could you explain the bug?
> > > > 
> > > Below error can be observed.
> > > 
> > > qemu-system-x86_64: vtd_process_inv_desc: invalid inv desc: val[3]=10, val[2]=0 (detect reserve non-zero)
> > 
> > Ok so you're checking val[2] & val[3] unconditionally:
> > 
> >     if (inv_desc.val[3] || inv_desc.val[2]) {
> >         error_report_once("%s: invalid inv desc: val[3]=%"PRIx64
> >                           ", val[2]=%"PRIx64
> >                           " (detect reserve non-zero)", __func__,
> >                           inv_desc.val[3],
> >                           inv_desc.val[2]);
> >         return false;
> >     }
> > 
> > Why?  Shouldn't they invalid if inv desc width is 128bits?
> > 
> Here, my intention is to simplify the codes. If inv_desc.val[] has been
> initialized to 0, the error will not happen no matter 128bits or 256bits.

You set something to zero and then you check it against zero... IMHO
it doesn't simplify the code but instead it is more confusing.  I
would prefer to only check when necessary, then drop the inits of the
stack variables.

> 
> > When 256 bits invalidation descriptor is used, the guest driver
> > should be responsible to fill in zeros into reserved fields.
> > 
> > Another question: is val[2] & val[3] used in any place even with
> > 256bits mode?  From what I see from the spec (chap 6.5.2), all of them
> > seems to be reserved as zeros, then I don't understand why bother
> > extending this to 256bits...  Did I miss something?
> > 
> Although the high 128bits are not used now, they are defined by spec
> so that we should handle them. At least we can know if there is error
> by checking if high 128bits are 0. Furthermore, we handle them in codes
> now can avoid some changes in the future.

Yes I understand you should implement the code by following the spec.
My question was to the spec not the code.  Please feel free to skip
this question if you don't know the answer.

Regards,
Tian, Kevin Feb. 14, 2019, 7:35 a.m. UTC | #8
> From: Peter Xu [mailto:peterx@redhat.com]
> Sent: Thursday, February 14, 2019 3:14 PM
> 
> >
> > > When 256 bits invalidation descriptor is used, the guest driver
> > > should be responsible to fill in zeros into reserved fields.
> > >
> > > Another question: is val[2] & val[3] used in any place even with
> > > 256bits mode?  From what I see from the spec (chap 6.5.2), all of them
> > > seems to be reserved as zeros, then I don't understand why bother
> > > extending this to 256bits...  Did I miss something?
> > >

PRQ is extended to carry larger private data which requires 256bits
mode. You can take a look at 7.5.1.1 Page Request Descriptor.

Thanks
Kevin
Peter Xu Feb. 14, 2019, 8:13 a.m. UTC | #9
On Thu, Feb 14, 2019 at 07:35:20AM +0000, Tian, Kevin wrote:
> > From: Peter Xu [mailto:peterx@redhat.com]
> > Sent: Thursday, February 14, 2019 3:14 PM
> > 
> > >
> > > > When 256 bits invalidation descriptor is used, the guest driver
> > > > should be responsible to fill in zeros into reserved fields.
> > > >
> > > > Another question: is val[2] & val[3] used in any place even with
> > > > 256bits mode?  From what I see from the spec (chap 6.5.2), all of them
> > > > seems to be reserved as zeros, then I don't understand why bother
> > > > extending this to 256bits...  Did I miss something?
> > > >
> 
> PRQ is extended to carry larger private data which requires 256bits
> mode. You can take a look at 7.5.1.1 Page Request Descriptor.

But we are talking about IQ (Invalidation Queue), not PRQ, right?

I see that Page Request Queue seems to always have 256bits descriptors
(chap 10.4.32, there is no Descriptor Width field, and I think it
means DW==1 always), however the Invalidation Queue seems to support
both modes (chap 10.4.23, there is Descriptor Width field, DW==0
should be the legacy mode, and DW==1 should be the new mode).  While,
none of the invalidate descriptors described in chap 6.5.2 is using
the upper 128bits even if 256bits mode is supported.

Regards,
Tian, Kevin Feb. 14, 2019, 8:22 a.m. UTC | #10
> From: Peter Xu [mailto:peterx@redhat.com]
> Sent: Thursday, February 14, 2019 4:13 PM
> 
> On Thu, Feb 14, 2019 at 07:35:20AM +0000, Tian, Kevin wrote:
> > > From: Peter Xu [mailto:peterx@redhat.com]
> > > Sent: Thursday, February 14, 2019 3:14 PM
> > >
> > > >
> > > > > When 256 bits invalidation descriptor is used, the guest driver
> > > > > should be responsible to fill in zeros into reserved fields.
> > > > >
> > > > > Another question: is val[2] & val[3] used in any place even with
> > > > > 256bits mode?  From what I see from the spec (chap 6.5.2), all of
> them
> > > > > seems to be reserved as zeros, then I don't understand why bother
> > > > > extending this to 256bits...  Did I miss something?
> > > > >
> >
> > PRQ is extended to carry larger private data which requires 256bits
> > mode. You can take a look at 7.5.1.1 Page Request Descriptor.
> 
> But we are talking about IQ (Invalidation Queue), not PRQ, right?
> 
> I see that Page Request Queue seems to always have 256bits descriptors
> (chap 10.4.32, there is no Descriptor Width field, and I think it
> means DW==1 always), however the Invalidation Queue seems to support
> both modes (chap 10.4.23, there is Descriptor Width field, DW==0
> should be the legacy mode, and DW==1 should be the new mode).  While,
> none of the invalidate descriptors described in chap 6.5.2 is using
> the upper 128bits even if 256bits mode is supported.
> 

Page Group Response descriptor is composed by software through
invalidation queue, which needs to carry same private data back
(as in page request descriptor). it's described in 7.7.1.

Thanks
Kevin
Peter Xu Feb. 14, 2019, 8:43 a.m. UTC | #11
On Thu, Feb 14, 2019 at 08:22:05AM +0000, Tian, Kevin wrote:
> > From: Peter Xu [mailto:peterx@redhat.com]
> > Sent: Thursday, February 14, 2019 4:13 PM
> > 
> > On Thu, Feb 14, 2019 at 07:35:20AM +0000, Tian, Kevin wrote:
> > > > From: Peter Xu [mailto:peterx@redhat.com]
> > > > Sent: Thursday, February 14, 2019 3:14 PM
> > > >
> > > > >
> > > > > > When 256 bits invalidation descriptor is used, the guest driver
> > > > > > should be responsible to fill in zeros into reserved fields.
> > > > > >
> > > > > > Another question: is val[2] & val[3] used in any place even with
> > > > > > 256bits mode?  From what I see from the spec (chap 6.5.2), all of
> > them
> > > > > > seems to be reserved as zeros, then I don't understand why bother
> > > > > > extending this to 256bits...  Did I miss something?
> > > > > >
> > >
> > > PRQ is extended to carry larger private data which requires 256bits
> > > mode. You can take a look at 7.5.1.1 Page Request Descriptor.
> > 
> > But we are talking about IQ (Invalidation Queue), not PRQ, right?
> > 
> > I see that Page Request Queue seems to always have 256bits descriptors
> > (chap 10.4.32, there is no Descriptor Width field, and I think it
> > means DW==1 always), however the Invalidation Queue seems to support
> > both modes (chap 10.4.23, there is Descriptor Width field, DW==0
> > should be the legacy mode, and DW==1 should be the new mode).  While,
> > none of the invalidate descriptors described in chap 6.5.2 is using
> > the upper 128bits even if 256bits mode is supported.
> > 
> 
> Page Group Response descriptor is composed by software through
> invalidation queue, which needs to carry same private data back
> (as in page request descriptor). it's described in 7.7.1.

I see the point.  Thanks, Kevin.

Regards,
diff mbox series

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 396ac8e..3664a00 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -38,6 +38,7 @@ 
 #include "trace.h"
 
 #define vtd_devfn_check(devfn) ((devfn & VTD_DEVFN_CHECK_MASK) ? true : false)
+#define vtd_ecap_smts(s) ((s)->ecap & VTD_ECAP_SMTS)
 
 /* context entry operations */
 #define vtd_get_ce_size(s, ce) \
@@ -65,6 +66,9 @@ 
 #define vtd_pe_get_slpt_base(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_SLPTPTR)
 #define vtd_pe_get_domain_id(pe) VTD_SM_PASID_ENTRY_DID((pe)->val[1])
 
+/* invalidation desc */
+#define vtd_get_inv_desc_width(s) ((s)->iq_dw ? 32 : 16)
+
 static void vtd_address_space_refresh_all(IntelIOMMUState *s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
@@ -1759,6 +1763,11 @@  static void vtd_root_table_setup(IntelIOMMUState *s)
     s->root_scalable = s->root & VTD_RTADDR_SMT;
     s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
 
+    /* if Scalable mode is not enabled, enforce iq_dw to be 16 byte */
+    if (!s->root_scalable) {
+        s->iq_dw = 0;
+    }
+
     trace_vtd_reg_dmar_root(s->root, s->root_extended);
 }
 
@@ -2052,7 +2061,7 @@  static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
     if (en) {
         s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
         /* 2^(x+8) entries */
-        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
+        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
         s->qi_enabled = true;
         trace_vtd_inv_qi_setup(s->iq, s->iq_size);
         /* Ok - report back to driver */
@@ -2219,54 +2228,66 @@  static void vtd_handle_iotlb_write(IntelIOMMUState *s)
 }
 
 /* Fetch an Invalidation Descriptor from the Invalidation Queue */
-static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
+static bool vtd_get_inv_desc(IntelIOMMUState *s,
                              VTDInvDesc *inv_desc)
 {
-    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
-    if (dma_memory_read(&address_space_memory, addr, inv_desc,
-        sizeof(*inv_desc))) {
-        error_report_once("Read INV DESC failed");
-        inv_desc->lo = 0;
-        inv_desc->hi = 0;
+    dma_addr_t base_addr = s->iq;
+    uint32_t offset = s->iq_head;
+    uint32_t dw = vtd_get_inv_desc_width(s);
+    dma_addr_t addr = base_addr + offset * dw;
+
+    /* init */
+    inv_desc->val[0] = 0;
+    inv_desc->val[1] = 0;
+    inv_desc->val[2] = 0;
+    inv_desc->val[3] = 0;
+
+    if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) {
+        error_report_once("Read INV DESC failed.");
         return false;
     }
-    inv_desc->lo = le64_to_cpu(inv_desc->lo);
-    inv_desc->hi = le64_to_cpu(inv_desc->hi);
+    inv_desc->val[0] = le64_to_cpu(inv_desc->val[0]);
+    inv_desc->val[1] = le64_to_cpu(inv_desc->val[1]);
+    inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
+    inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
     return true;
 }
 
 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
 {
-    if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
-        (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
-        error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
-                          " (reserved nonzero)", __func__, inv_desc->hi,
-                          inv_desc->lo);
+    if ((inv_desc->val[1] & VTD_INV_DESC_WAIT_RSVD_HI) ||
+        (inv_desc->val[0] & VTD_INV_DESC_WAIT_RSVD_LO)) {
+        error_report_once("%s: invalid wait desc: val[1]=%"PRIx64
+                          ", val[0]=%"PRIx64
+                          " (reserved nonzero)", __func__, inv_desc->val[1],
+                          inv_desc->val[0]);
         return false;
     }
-    if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
+    if (inv_desc->val[0] & VTD_INV_DESC_WAIT_SW) {
         /* Status Write */
-        uint32_t status_data = (uint32_t)(inv_desc->lo >>
+        uint32_t status_data = (uint32_t)(inv_desc->val[0] >>
                                VTD_INV_DESC_WAIT_DATA_SHIFT);
 
-        assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
+        assert(!(inv_desc->val[0] & VTD_INV_DESC_WAIT_IF));
 
         /* FIXME: need to be masked with HAW? */
-        dma_addr_t status_addr = inv_desc->hi;
+        dma_addr_t status_addr = inv_desc->val[1];
         trace_vtd_inv_desc_wait_sw(status_addr, status_data);
         status_data = cpu_to_le32(status_data);
         if (dma_memory_write(&address_space_memory, status_addr, &status_data,
                              sizeof(status_data))) {
-            trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
+            trace_vtd_inv_desc_wait_write_fail(inv_desc->val[1],
+                                               inv_desc->val[0]);
             return false;
         }
-    } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
+    } else if (inv_desc->val[0] & VTD_INV_DESC_WAIT_IF) {
         /* Interrupt flag */
         vtd_generate_completion_event(s);
     } else {
-        error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
-                          " (unknown type)", __func__, inv_desc->hi,
-                          inv_desc->lo);
+        error_report_once("%s: invalid wait desc: val[1]=%"PRIx64
+                          ", val[0]=%"PRIx64
+                          " (unknown type)", __func__, inv_desc->val[1],
+                          inv_desc->val[0]);
         return false;
     }
     return true;
@@ -2277,31 +2298,33 @@  static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
 {
     uint16_t sid, fmask;
 
-    if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
-        error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
-                          " (reserved nonzero)", __func__, inv_desc->hi,
-                          inv_desc->lo);
+    if ((inv_desc->val[0] & VTD_INV_DESC_CC_RSVD) || inv_desc->val[1]) {
+        error_report_once("%s: invalid cc inv desc: val[1]=%"PRIx64
+                          ", val[0]=%"PRIx64
+                          " (reserved nonzero)", __func__, inv_desc->val[1],
+                          inv_desc->val[0]);
         return false;
     }
-    switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
+    switch (inv_desc->val[0] & VTD_INV_DESC_CC_G) {
     case VTD_INV_DESC_CC_DOMAIN:
         trace_vtd_inv_desc_cc_domain(
-            (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
+            (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->val[0]));
         /* Fall through */
     case VTD_INV_DESC_CC_GLOBAL:
         vtd_context_global_invalidate(s);
         break;
 
     case VTD_INV_DESC_CC_DEVICE:
-        sid = VTD_INV_DESC_CC_SID(inv_desc->lo);
-        fmask = VTD_INV_DESC_CC_FM(inv_desc->lo);
+        sid = VTD_INV_DESC_CC_SID(inv_desc->val[0]);
+        fmask = VTD_INV_DESC_CC_FM(inv_desc->val[0]);
         vtd_context_device_invalidate(s, sid, fmask);
         break;
 
     default:
-        error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
-                          " (invalid type)", __func__, inv_desc->hi,
-                          inv_desc->lo);
+        error_report_once("%s: invalid cc inv desc: val[1]=%"PRIx64
+                          ", val[0]=%"PRIx64
+                          " (invalid type)", __func__, inv_desc->val[1],
+                          inv_desc->val[0]);
         return false;
     }
     return true;
@@ -2313,32 +2336,32 @@  static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
     uint8_t am;
     hwaddr addr;
 
-    if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
-        (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
-        error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
-                          ", lo=0x%"PRIx64" (reserved bits unzero)\n",
-                          __func__, inv_desc->hi, inv_desc->lo);
+    if ((inv_desc->val[0] & VTD_INV_DESC_IOTLB_RSVD_LO) ||
+        (inv_desc->val[1] & VTD_INV_DESC_IOTLB_RSVD_HI)) {
+        error_report_once("%s: invalid iotlb inv desc: val[1]=0x%"PRIx64
+                          ", val[0]=0x%"PRIx64" (reserved bits unzero)\n",
+                          __func__, inv_desc->val[1], inv_desc->val[0]);
         return false;
     }
 
-    switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
+    switch (inv_desc->val[0] & VTD_INV_DESC_IOTLB_G) {
     case VTD_INV_DESC_IOTLB_GLOBAL:
         vtd_iotlb_global_invalidate(s);
         break;
 
     case VTD_INV_DESC_IOTLB_DOMAIN:
-        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
+        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->val[0]);
         vtd_iotlb_domain_invalidate(s, domain_id);
         break;
 
     case VTD_INV_DESC_IOTLB_PAGE:
-        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
-        addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
-        am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
+        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->val[0]);
+        addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->val[1]);
+        am = VTD_INV_DESC_IOTLB_AM(inv_desc->val[1]);
         if (am > VTD_MAMV) {
-            error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
-                              ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)\n",
-                              __func__, inv_desc->hi, inv_desc->lo,
+            error_report_once("%s: invalid iotlb inv desc: val[1]=0x%"PRIx64
+                              ", val[0]=0x%"PRIx64" (am=%u > VTD_MAMV=%u)\n",
+                              __func__, inv_desc->val[1], inv_desc->val[0],
                               am, (unsigned)VTD_MAMV);
             return false;
         }
@@ -2346,10 +2369,10 @@  static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
         break;
 
     default:
-        error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
-                          ", lo=0x%"PRIx64" (type mismatch: 0x%llx)\n",
-                          __func__, inv_desc->hi, inv_desc->lo,
-                          inv_desc->lo & VTD_INV_DESC_IOTLB_G);
+        error_report_once("%s: invalid iotlb inv desc: val[1]=0x%"PRIx64
+                          ", val[0]=0x%"PRIx64" (type mismatch: 0x%llx)\n",
+                          __func__, inv_desc->val[1], inv_desc->val[0],
+                          inv_desc->val[0] & VTD_INV_DESC_IOTLB_G);
         return false;
     }
     return true;
@@ -2381,17 +2404,17 @@  static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
     bool size;
     uint8_t bus_num;
 
-    addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
-    sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
+    addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->val[1]);
+    sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->val[0]);
     devfn = sid & 0xff;
     bus_num = sid >> 8;
-    size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
+    size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->val[1]);
 
-    if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
-        (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
-        error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64
-                          ", lo=%"PRIx64" (reserved nonzero)", __func__,
-                          inv_desc->hi, inv_desc->lo);
+    if ((inv_desc->val[0] & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
+        (inv_desc->val[1] & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
+        error_report_once("%s: invalid dev-iotlb inv desc: val[1]=%"PRIx64
+                          ", val[0]=%"PRIx64" (reserved nonzero)", __func__,
+                          inv_desc->val[1], inv_desc->val[0]);
         return false;
     }
 
@@ -2437,54 +2460,64 @@  static bool vtd_process_inv_desc(IntelIOMMUState *s)
     uint8_t desc_type;
 
     trace_vtd_inv_qi_head(s->iq_head);
-    if (!vtd_get_inv_desc(s->iq, s->iq_head, &inv_desc)) {
+    if (!vtd_get_inv_desc(s, &inv_desc)) {
         s->iq_last_desc_type = VTD_INV_DESC_NONE;
         return false;
     }
-    desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
+    if (inv_desc.val[3] || inv_desc.val[2]) {
+        error_report_once("%s: invalid inv desc: val[3]=%"PRIx64
+                          ", val[2]=%"PRIx64
+                          " (detect reserve non-zero)", __func__,
+                          inv_desc.val[3],
+                          inv_desc.val[2]);
+        return false;
+    }
+
+    desc_type = inv_desc.val[0] & VTD_INV_DESC_TYPE;
     /* FIXME: should update at first or at last? */
     s->iq_last_desc_type = desc_type;
 
     switch (desc_type) {
     case VTD_INV_DESC_CC:
-        trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo);
+        trace_vtd_inv_desc("context-cache", inv_desc.val[1], inv_desc.val[0]);
         if (!vtd_process_context_cache_desc(s, &inv_desc)) {
             return false;
         }
         break;
 
     case VTD_INV_DESC_IOTLB:
-        trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo);
+        trace_vtd_inv_desc("iotlb", inv_desc.val[1], inv_desc.val[0]);
         if (!vtd_process_iotlb_desc(s, &inv_desc)) {
             return false;
         }
         break;
 
     case VTD_INV_DESC_WAIT:
-        trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
+        trace_vtd_inv_desc("wait", inv_desc.val[1], inv_desc.val[0]);
         if (!vtd_process_wait_desc(s, &inv_desc)) {
             return false;
         }
         break;
 
     case VTD_INV_DESC_IEC:
-        trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo);
+        trace_vtd_inv_desc("iec", inv_desc.val[1], inv_desc.val[0]);
         if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
             return false;
         }
         break;
 
     case VTD_INV_DESC_DEVICE:
-        trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
+        trace_vtd_inv_desc("device", inv_desc.val[1], inv_desc.val[0]);
         if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
             return false;
         }
         break;
 
     default:
-        error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64
-                          " (unknown type)", __func__, inv_desc.hi,
-                          inv_desc.lo);
+        error_report_once("%s: invalid inv desc: val[1]=%"PRIx64
+                          ", val[0]=%"PRIx64
+                          " (unknown type)", __func__, inv_desc.val[1],
+                          inv_desc.val[0]);
         return false;
     }
     s->iq_head++;
@@ -2525,7 +2558,7 @@  static void vtd_handle_iqt_write(IntelIOMMUState *s)
 {
     uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
 
-    s->iq_tail = VTD_IQT_QT(val);
+    s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
     trace_vtd_inv_qi_tail(s->iq_tail);
 
     if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
@@ -2794,6 +2827,11 @@  static void vtd_mem_write(void *opaque, hwaddr addr,
         } else {
             vtd_set_quad(s, addr, val);
         }
+        if (vtd_ecap_smts(s)) {
+            s->iq_dw = val & VTD_IQA_DW_MASK;
+        } else {
+            s->iq_dw = 0;
+        }
         break;
 
     case DMAR_IQA_REG_HI:
@@ -3577,7 +3615,7 @@  static void vtd_init(IntelIOMMUState *s)
 
     vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
     vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
-    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff007ULL, 0);
+    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
     vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
     vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
     vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 02674f9..2a753c5 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -190,6 +190,7 @@ 
 #define VTD_ECAP_EIM                (1ULL << 4)
 #define VTD_ECAP_PT                 (1ULL << 6)
 #define VTD_ECAP_MHMV               (15ULL << 20)
+#define VTD_ECAP_SMTS               (1ULL << 43)
 
 /* CAP_REG */
 /* (offset >> 4) << 24 */
@@ -218,11 +219,13 @@ 
 #define VTD_CAP_SAGAW_48bit         (0x4ULL << VTD_CAP_SAGAW_SHIFT)
 
 /* IQT_REG */
-#define VTD_IQT_QT(val)             (((val) >> 4) & 0x7fffULL)
+#define VTD_IQT_QT(dw_bit, val)     (dw_bit ? (((val) >> 5) & 0x3fffULL) : \
+                                     (((val) >> 4) & 0x7fffULL))
 
 /* IQA_REG */
 #define VTD_IQA_IQA_MASK(aw)        (VTD_HAW_MASK(aw) ^ 0xfffULL)
 #define VTD_IQA_QS                  0x7ULL
+#define VTD_IQA_DW_MASK             0x800
 
 /* IQH_REG */
 #define VTD_IQH_QH_SHIFT            4
@@ -321,8 +324,7 @@  typedef struct VTDInvDescIEC VTDInvDescIEC;
 /* Queued Invalidation Descriptor */
 union VTDInvDesc {
     struct {
-        uint64_t lo;
-        uint64_t hi;
+        uint64_t val[4];
     };
     union {
         VTDInvDescIEC iec;
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index ff13ff27..a5da139 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -230,6 +230,7 @@  struct IntelIOMMUState {
     uint16_t iq_tail;               /* Current invalidation queue tail */
     dma_addr_t iq;                  /* Current invalidation queue pointer */
     uint16_t iq_size;               /* IQ Size in number of entries */
+    uint16_t iq_dw;                 /* IQ descriptor width */
     bool qi_enabled;                /* Set if the QI is enabled */
     uint8_t iq_last_desc_type;      /* The type of last completed descriptor */