diff mbox series

[RFC,v1,3/3] intel_iommu: add scalable-mode option to make scalable mode work

Message ID 1548824953-23413-4-git-send-email-yi.y.sun@linux.intel.com
State New
Headers show
Series intel_iommu: support scalable mode | expand

Commit Message

Yi Sun Jan. 30, 2019, 5:09 a.m. UTC
This patch adds an option to provide flexibility for user to expose
Scalable Mode to guest. User could expose Scalable Mode to guest by
the config as below:

"-device intel-iommu,caching-mode=on,scalable-mode=on"

The Linux iommu driver has supported scalable mode. Please refer below
patch set:

    https://www.spinics.net/lists/kernel/msg2985279.html

Signed-off-by: Liu, Yi L <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
---
 hw/i386/intel_iommu.c          | 22 ++++++++++++++++++++++
 hw/i386/intel_iommu_internal.h |  6 ++++++
 include/hw/i386/intel_iommu.h  |  3 ++-
 3 files changed, 30 insertions(+), 1 deletion(-)

Comments

Peter Xu Feb. 12, 2019, 6:46 a.m. UTC | #1
On Wed, Jan 30, 2019 at 01:09:13PM +0800, Yi Sun wrote:
> This patch adds an option to provide flexibility for user to expose
> Scalable Mode to guest. User could expose Scalable Mode to guest by
> the config as below:
> 
> "-device intel-iommu,caching-mode=on,scalable-mode=on"
> 
> The Linux iommu driver has supported scalable mode. Please refer below
> patch set:
> 
>     https://www.spinics.net/lists/kernel/msg2985279.html
> 
> Signed-off-by: Liu, Yi L <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> ---
>  hw/i386/intel_iommu.c          | 22 ++++++++++++++++++++++
>  hw/i386/intel_iommu_internal.h |  6 ++++++
>  include/hw/i386/intel_iommu.h  |  3 ++-
>  3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 3664a00..447fdf3 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -2492,6 +2492,17 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
>          }
>          break;
>  
> +    /*
> +     * TODO: the entity of below two cases will be implemented in future series.
> +     * To make guest (which integrates scalable mode support patch set in
> +     * iommu driver) work, just return true is enough so far.

When you repost, could you mention about what tests have you done?  I
can think of some general usages:

Device matrix:

- virtio-net, vhost=on|off
- device assignment

Configuration matrix:

- legacy/scalable
- passthrough=on|off

I believe smoke test (like netperf a few seconds, or even ping) would
be enough, cover some (or all, which would be very nice to have :) of
above scenarios.

> +     */
> +    case VTD_INV_DESC_PC:
> +        break;
> +
> +    case VTD_INV_DESC_PIOTLB:
> +        break;
> +
>      case VTD_INV_DESC_WAIT:
>          trace_vtd_inv_desc("wait", inv_desc.val[1], inv_desc.val[0]);
>          if (!vtd_process_wait_desc(s, &inv_desc)) {
> @@ -3051,6 +3062,7 @@ static Property vtd_properties[] = {
>      DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
>                        VTD_HOST_ADDRESS_WIDTH),
>      DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
> +    DEFINE_PROP_BOOL("scalable-mode", IntelIOMMUState, scalable_mode, FALSE),

Let's start with x-scalable-mode?  Less burden for all.

>      DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
> @@ -3583,6 +3595,16 @@ static void vtd_init(IntelIOMMUState *s)
>          s->cap |= VTD_CAP_CM;
>      }
>  
> +    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
> +    if (s->scalable_mode) {
> +        if (!s->caching_mode) {
> +            error_report("Need to set caching-mode for scalable mode");

Could I ask why?

> +            exit(1);
> +        }
> +        s->cap |= VTD_CAP_DWD | VTD_CAP_DRD;

Draining is supported now so this line is not needed.

> +        s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
> +    }
> +
>      vtd_reset_caches(s);
>  
>      /* Define registers with default values and bit semantics */
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 2a753c5..b01953a 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -190,7 +190,9 @@
>  #define VTD_ECAP_EIM                (1ULL << 4)
>  #define VTD_ECAP_PT                 (1ULL << 6)
>  #define VTD_ECAP_MHMV               (15ULL << 20)
> +#define VTD_ECAP_SRS                (1ULL << 31)
>  #define VTD_ECAP_SMTS               (1ULL << 43)
> +#define VTD_ECAP_SLTS               (1ULL << 46)
>  
>  /* CAP_REG */
>  /* (offset >> 4) << 24 */
> @@ -209,6 +211,8 @@
>  #define VTD_CAP_DRAIN_READ          (1ULL << 55)
>  #define VTD_CAP_DRAIN               (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
>  #define VTD_CAP_CM                  (1ULL << 7)
> +#define VTD_CAP_DWD                 (1ULL << 54)
> +#define VTD_CAP_DRD                 (1ULL << 55)

These can be dropped too (see VTD_CAP_DRAIN above).

>  
>  /* Supported Adjusted Guest Address Widths */
>  #define VTD_CAP_SAGAW_SHIFT         8
> @@ -340,6 +344,8 @@ typedef union VTDInvDesc VTDInvDesc;
>  #define VTD_INV_DESC_IEC                0x4 /* Interrupt Entry Cache
>                                                 Invalidate Descriptor */
>  #define VTD_INV_DESC_WAIT               0x5 /* Invalidation Wait Descriptor */
> +#define VTD_INV_DESC_PIOTLB             0x6 /* PASID-IOTLB Invalidate Desc */
> +#define VTD_INV_DESC_PC                 0x7 /* PASID-cache Invalidate Desc */
>  #define VTD_INV_DESC_NONE               0   /* Not an Invalidate Descriptor */
>  
>  /* Masks for Invalidation Wait Descriptor*/
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index a5da139..a04fad6 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -219,7 +219,8 @@ struct IntelIOMMUState {
>      uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
>      uint32_t version;
>  
> -    bool caching_mode;          /* RO - is cap CM enabled? */
> +    bool caching_mode;              /* RO - is cap CM enabled? */
> +    bool scalable_mode;             /* RO - is Scalable Mode supported? */
>  
>      dma_addr_t root;                /* Current root table pointer */
>      bool root_extended;             /* Type of root table (extended or not) */
> -- 
> 1.9.1
> 

Regards,
Yi Sun Feb. 15, 2019, 5:22 a.m. UTC | #2
On 19-02-12 14:46:29, Peter Xu wrote:

[...]

> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 3664a00..447fdf3 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -2492,6 +2492,17 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
> >          }
> >          break;
> >  
> > +    /*
> > +     * TODO: the entity of below two cases will be implemented in future series.
> > +     * To make guest (which integrates scalable mode support patch set in
> > +     * iommu driver) work, just return true is enough so far.
> 
> When you repost, could you mention about what tests have you done?  I
> can think of some general usages:
> 
> Device matrix:
> 
> - virtio-net, vhost=on|off
> - device assignment
> 
> Configuration matrix:
> 
> - legacy/scalable
> - passthrough=on|off
> 
> I believe smoke test (like netperf a few seconds, or even ping) would
> be enough, cover some (or all, which would be very nice to have :) of
> above scenarios.
> 
Thanks for the test cases! I have covered some of them but I think I
can do more tests.

> > +     */
> > +    case VTD_INV_DESC_PC:
> > +        break;
> > +
> > +    case VTD_INV_DESC_PIOTLB:
> > +        break;
> > +
> >      case VTD_INV_DESC_WAIT:
> >          trace_vtd_inv_desc("wait", inv_desc.val[1], inv_desc.val[0]);
> >          if (!vtd_process_wait_desc(s, &inv_desc)) {
> > @@ -3051,6 +3062,7 @@ static Property vtd_properties[] = {
> >      DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
> >                        VTD_HOST_ADDRESS_WIDTH),
> >      DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
> > +    DEFINE_PROP_BOOL("scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
> 
> Let's start with x-scalable-mode?  Less burden for all.
> 
Sure.

> >      DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> > @@ -3583,6 +3595,16 @@ static void vtd_init(IntelIOMMUState *s)
> >          s->cap |= VTD_CAP_CM;
> >      }
> >  
> > +    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
> > +    if (s->scalable_mode) {
> > +        if (!s->caching_mode) {
> > +            error_report("Need to set caching-mode for scalable mode");
> 
> Could I ask why?
> 
My intention is to make guest explicitly send to make sure SLT shadowing
correctly.

For this point, I also have question. Why does legacy mode not check CM?
If CM is not set, may the DMA remapping be wrong because SLT cannot
match guest's latest change?

> > +            exit(1);
> > +        }
> > +        s->cap |= VTD_CAP_DWD | VTD_CAP_DRD;
> 
> Draining is supported now so this line is not needed.
> 
Got it, thanks!

> > +        s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
> > +    }
> > +
> >      vtd_reset_caches(s);
> >  
> >      /* Define registers with default values and bit semantics */
> > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> > index 2a753c5..b01953a 100644
> > --- a/hw/i386/intel_iommu_internal.h
> > +++ b/hw/i386/intel_iommu_internal.h
> > @@ -190,7 +190,9 @@
> >  #define VTD_ECAP_EIM                (1ULL << 4)
> >  #define VTD_ECAP_PT                 (1ULL << 6)
> >  #define VTD_ECAP_MHMV               (15ULL << 20)
> > +#define VTD_ECAP_SRS                (1ULL << 31)
> >  #define VTD_ECAP_SMTS               (1ULL << 43)
> > +#define VTD_ECAP_SLTS               (1ULL << 46)
> >  
> >  /* CAP_REG */
> >  /* (offset >> 4) << 24 */
> > @@ -209,6 +211,8 @@
> >  #define VTD_CAP_DRAIN_READ          (1ULL << 55)
> >  #define VTD_CAP_DRAIN               (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
> >  #define VTD_CAP_CM                  (1ULL << 7)
> > +#define VTD_CAP_DWD                 (1ULL << 54)
> > +#define VTD_CAP_DRD                 (1ULL << 55)
> 
> These can be dropped too (see VTD_CAP_DRAIN above).
> 
Thanks!

[...]

> > -- 
> > 1.9.1
> > 
> 
> Regards,
> 
> -- 
> Peter Xu
Peter Xu Feb. 15, 2019, 5:39 a.m. UTC | #3
On Fri, Feb 15, 2019 at 01:22:34PM +0800, Yi Sun wrote:

[...]

> > > +    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
> > > +    if (s->scalable_mode) {
> > > +        if (!s->caching_mode) {
> > > +            error_report("Need to set caching-mode for scalable mode");
> > 
> > Could I ask why?
> > 
> My intention is to make guest explicitly send to make sure SLT shadowing
> correctly.
> 
> For this point, I also have question. Why does legacy mode not check CM?
> If CM is not set, may the DMA remapping be wrong because SLT cannot
> match guest's latest change?

Because CM is currently only required for device assignment.  For
example, if you only have an emulated device like virtio-net-pci in
the guest, then you don't need the CM capability to let it work with
the vIOMMU.  That's because we'll walk the 2nd level IOMMU page table
only when the virtio-net-pci device does DMA, and QEMU can easily do
that (QEMU is emulating the DMA of the virtio-net-pci device, and QEMU
has the knowledge of the guest 2nd level IOMMU page tables).  Assigned
devices are special because the host hardware knows nothing about
guest 2nd level page tables, so QEMU needs to shadow them before DMA
starts.

That's also why I listed "device assignment" as a special case in the
test device matrix, because that's the only case where we can torture
the IOMMU page shadowing code a bit.

For the scalable mode I would suppose you will still allow it to work
without caching mode.  The example is the same as above - when there's
only emulated devices in the guest.

Regards,
Yi Sun Feb. 15, 2019, 7:44 a.m. UTC | #4
On 19-02-15 13:39:05, Peter Xu wrote:
> On Fri, Feb 15, 2019 at 01:22:34PM +0800, Yi Sun wrote:
> 
> [...]
> 
> > > > +    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
> > > > +    if (s->scalable_mode) {
> > > > +        if (!s->caching_mode) {
> > > > +            error_report("Need to set caching-mode for scalable mode");
> > > 
> > > Could I ask why?
> > > 
> > My intention is to make guest explicitly send to make sure SLT shadowing
> > correctly.
> > 
> > For this point, I also have question. Why does legacy mode not check CM?
> > If CM is not set, may the DMA remapping be wrong because SLT cannot
> > match guest's latest change?
> 
> Because CM is currently only required for device assignment.  For
> example, if you only have an emulated device like virtio-net-pci in
> the guest, then you don't need the CM capability to let it work with
> the vIOMMU.  That's because we'll walk the 2nd level IOMMU page table
> only when the virtio-net-pci device does DMA, and QEMU can easily do
> that (QEMU is emulating the DMA of the virtio-net-pci device, and QEMU
> has the knowledge of the guest 2nd level IOMMU page tables).  Assigned
> devices are special because the host hardware knows nothing about
> guest 2nd level page tables, so QEMU needs to shadow them before DMA
> starts.
> 
Thanks for the explanation! As CM is must for device assignment, I may
remove the check in current codes. In fact, CM is optional for SM.

> That's also why I listed "device assignment" as a special case in the
> test device matrix, because that's the only case where we can torture
> the IOMMU page shadowing code a bit.
> 
> For the scalable mode I would suppose you will still allow it to work
> without caching mode.  The example is the same as above - when there's
> only emulated devices in the guest.
> 
Yes, I verified it by using virtio-net-pci without device assignment.
The netperf simple case runs well.

> Regards,
> 
> -- 
> Peter Xu
Jason Wang Feb. 15, 2019, 8:22 a.m. UTC | #5
On 2019/2/15 下午1:39, Peter Xu wrote:
> On Fri, Feb 15, 2019 at 01:22:34PM +0800, Yi Sun wrote:
>
> [...]
>
>>>> +    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
>>>> +    if (s->scalable_mode) {
>>>> +        if (!s->caching_mode) {
>>>> +            error_report("Need to set caching-mode for scalable mode");
>>> Could I ask why?
>>>
>> My intention is to make guest explicitly send to make sure SLT shadowing
>> correctly.
>>
>> For this point, I also have question. Why does legacy mode not check CM?
>> If CM is not set, may the DMA remapping be wrong because SLT cannot
>> match guest's latest change?
> Because CM is currently only required for device assignment.  For
> example, if you only have an emulated device like virtio-net-pci in
> the guest, then you don't need the CM capability to let it work with
> the vIOMMU.  That's because we'll walk the 2nd level IOMMU page table
> only when the virtio-net-pci device does DMA, and QEMU can easily do
> that (QEMU is emulating the DMA of the virtio-net-pci device, and QEMU
> has the knowledge of the guest 2nd level IOMMU page tables).  Assigned
> devices are special because the host hardware knows nothing about
> guest 2nd level page tables, so QEMU needs to shadow them before DMA
> starts.


+1

And there's dataplane (vhost) which implements device IOTLB that doesn't 
require shadow page table. The can ask for translation for some address 
themselves.


>
> That's also why I listed "device assignment" as a special case in the
> test device matrix, because that's the only case where we can torture
> the IOMMU page shadowing code a bit.
>
> For the scalable mode I would suppose you will still allow it to work
> without caching mode.  The example is the same as above - when there's
> only emulated devices in the guest.


Agree.

Thanks


>
> Regards,
>
diff mbox series

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 3664a00..447fdf3 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2492,6 +2492,17 @@  static bool vtd_process_inv_desc(IntelIOMMUState *s)
         }
         break;
 
+    /*
+     * TODO: the entity of below two cases will be implemented in future series.
+     * To make guest (which integrates scalable mode support patch set in
+     * iommu driver) work, just return true is enough so far.
+     */
+    case VTD_INV_DESC_PC:
+        break;
+
+    case VTD_INV_DESC_PIOTLB:
+        break;
+
     case VTD_INV_DESC_WAIT:
         trace_vtd_inv_desc("wait", inv_desc.val[1], inv_desc.val[0]);
         if (!vtd_process_wait_desc(s, &inv_desc)) {
@@ -3051,6 +3062,7 @@  static Property vtd_properties[] = {
     DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
                       VTD_HOST_ADDRESS_WIDTH),
     DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
+    DEFINE_PROP_BOOL("scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
     DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
     DEFINE_PROP_END_OF_LIST(),
 };
@@ -3583,6 +3595,16 @@  static void vtd_init(IntelIOMMUState *s)
         s->cap |= VTD_CAP_CM;
     }
 
+    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
+    if (s->scalable_mode) {
+        if (!s->caching_mode) {
+            error_report("Need to set caching-mode for scalable mode");
+            exit(1);
+        }
+        s->cap |= VTD_CAP_DWD | VTD_CAP_DRD;
+        s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
+    }
+
     vtd_reset_caches(s);
 
     /* Define registers with default values and bit semantics */
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 2a753c5..b01953a 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -190,7 +190,9 @@ 
 #define VTD_ECAP_EIM                (1ULL << 4)
 #define VTD_ECAP_PT                 (1ULL << 6)
 #define VTD_ECAP_MHMV               (15ULL << 20)
+#define VTD_ECAP_SRS                (1ULL << 31)
 #define VTD_ECAP_SMTS               (1ULL << 43)
+#define VTD_ECAP_SLTS               (1ULL << 46)
 
 /* CAP_REG */
 /* (offset >> 4) << 24 */
@@ -209,6 +211,8 @@ 
 #define VTD_CAP_DRAIN_READ          (1ULL << 55)
 #define VTD_CAP_DRAIN               (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
 #define VTD_CAP_CM                  (1ULL << 7)
+#define VTD_CAP_DWD                 (1ULL << 54)
+#define VTD_CAP_DRD                 (1ULL << 55)
 
 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT         8
@@ -340,6 +344,8 @@  typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_IEC                0x4 /* Interrupt Entry Cache
                                                Invalidate Descriptor */
 #define VTD_INV_DESC_WAIT               0x5 /* Invalidation Wait Descriptor */
+#define VTD_INV_DESC_PIOTLB             0x6 /* PASID-IOTLB Invalidate Desc */
+#define VTD_INV_DESC_PC                 0x7 /* PASID-cache Invalidate Desc */
 #define VTD_INV_DESC_NONE               0   /* Not an Invalidate Descriptor */
 
 /* Masks for Invalidation Wait Descriptor*/
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index a5da139..a04fad6 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -219,7 +219,8 @@  struct IntelIOMMUState {
     uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
     uint32_t version;
 
-    bool caching_mode;          /* RO - is cap CM enabled? */
+    bool caching_mode;              /* RO - is cap CM enabled? */
+    bool scalable_mode;             /* RO - is Scalable Mode supported? */
 
     dma_addr_t root;                /* Current root table pointer */
     bool root_extended;             /* Type of root table (extended or not) */