Message ID | 1548824953-23413-4-git-send-email-yi.y.sun@linux.intel.com |
---|---|
State | New |
Headers | show |
Series | intel_iommu: support scalable mode | expand |
On Wed, Jan 30, 2019 at 01:09:13PM +0800, Yi Sun wrote: > This patch adds an option to provide flexibility for user to expose > Scalable Mode to guest. User could expose Scalable Mode to guest by > the config as below: > > "-device intel-iommu,caching-mode=on,scalable-mode=on" > > The Linux iommu driver has supported scalable mode. Please refer below > patch set: > > https://www.spinics.net/lists/kernel/msg2985279.html > > Signed-off-by: Liu, Yi L <yi.l.liu@intel.com> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> > --- > hw/i386/intel_iommu.c | 22 ++++++++++++++++++++++ > hw/i386/intel_iommu_internal.h | 6 ++++++ > include/hw/i386/intel_iommu.h | 3 ++- > 3 files changed, 30 insertions(+), 1 deletion(-) > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 3664a00..447fdf3 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -2492,6 +2492,17 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) > } > break; > > + /* > + * TODO: the entity of below two cases will be implemented in future series. > + * To make guest (which integrates scalable mode support patch set in > + * iommu driver) work, just return true is enough so far. When you repost, could you mention about what tests have you done? I can think of some general usages: Device matrix: - virtio-net, vhost=on|off - device assignment Configuration matrix: - legacy/scalable - passthrough=on|off I believe smoke test (like netperf a few seconds, or even ping) would be enough, cover some (or all, which would be very nice to have :) of above scenarios. > + */ > + case VTD_INV_DESC_PC: > + break; > + > + case VTD_INV_DESC_PIOTLB: > + break; > + > case VTD_INV_DESC_WAIT: > trace_vtd_inv_desc("wait", inv_desc.val[1], inv_desc.val[0]); > if (!vtd_process_wait_desc(s, &inv_desc)) { > @@ -3051,6 +3062,7 @@ static Property vtd_properties[] = { > DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, > VTD_HOST_ADDRESS_WIDTH), > DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), > + DEFINE_PROP_BOOL("scalable-mode", IntelIOMMUState, scalable_mode, FALSE), Let's start with x-scalable-mode? Less burden for all. > DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), > DEFINE_PROP_END_OF_LIST(), > }; > @@ -3583,6 +3595,16 @@ static void vtd_init(IntelIOMMUState *s) > s->cap |= VTD_CAP_CM; > } > > + /* TODO: read cap/ecap from host to decide which cap to be exposed. */ > + if (s->scalable_mode) { > + if (!s->caching_mode) { > + error_report("Need to set caching-mode for scalable mode"); Could I ask why? > + exit(1); > + } > + s->cap |= VTD_CAP_DWD | VTD_CAP_DRD; Draining is supported now so this line is not needed. > + s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; > + } > + > vtd_reset_caches(s); > > /* Define registers with default values and bit semantics */ > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h > index 2a753c5..b01953a 100644 > --- a/hw/i386/intel_iommu_internal.h > +++ b/hw/i386/intel_iommu_internal.h > @@ -190,7 +190,9 @@ > #define VTD_ECAP_EIM (1ULL << 4) > #define VTD_ECAP_PT (1ULL << 6) > #define VTD_ECAP_MHMV (15ULL << 20) > +#define VTD_ECAP_SRS (1ULL << 31) > #define VTD_ECAP_SMTS (1ULL << 43) > +#define VTD_ECAP_SLTS (1ULL << 46) > > /* CAP_REG */ > /* (offset >> 4) << 24 */ > @@ -209,6 +211,8 @@ > #define VTD_CAP_DRAIN_READ (1ULL << 55) > #define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) > #define VTD_CAP_CM (1ULL << 7) > +#define VTD_CAP_DWD (1ULL << 54) > +#define VTD_CAP_DRD (1ULL << 55) These can be dropped too (see VTD_CAP_DRAIN above). > > /* Supported Adjusted Guest Address Widths */ > #define VTD_CAP_SAGAW_SHIFT 8 > @@ -340,6 +344,8 @@ typedef union VTDInvDesc VTDInvDesc; > #define VTD_INV_DESC_IEC 0x4 /* Interrupt Entry Cache > Invalidate Descriptor */ > #define VTD_INV_DESC_WAIT 0x5 /* Invalidation Wait Descriptor */ > +#define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ > +#define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ > #define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ > > /* Masks for Invalidation Wait Descriptor*/ > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h > index a5da139..a04fad6 100644 > --- a/include/hw/i386/intel_iommu.h > +++ b/include/hw/i386/intel_iommu.h > @@ -219,7 +219,8 @@ struct IntelIOMMUState { > uint8_t womask[DMAR_REG_SIZE]; /* WO (write only - read returns 0) */ > uint32_t version; > > - bool caching_mode; /* RO - is cap CM enabled? */ > + bool caching_mode; /* RO - is cap CM enabled? */ > + bool scalable_mode; /* RO - is Scalable Mode supported? */ > > dma_addr_t root; /* Current root table pointer */ > bool root_extended; /* Type of root table (extended or not) */ > -- > 1.9.1 > Regards,
On 19-02-12 14:46:29, Peter Xu wrote: [...] > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > > index 3664a00..447fdf3 100644 > > --- a/hw/i386/intel_iommu.c > > +++ b/hw/i386/intel_iommu.c > > @@ -2492,6 +2492,17 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) > > } > > break; > > > > + /* > > + * TODO: the entity of below two cases will be implemented in future series. > > + * To make guest (which integrates scalable mode support patch set in > > + * iommu driver) work, just return true is enough so far. > > When you repost, could you mention about what tests have you done? I > can think of some general usages: > > Device matrix: > > - virtio-net, vhost=on|off > - device assignment > > Configuration matrix: > > - legacy/scalable > - passthrough=on|off > > I believe smoke test (like netperf a few seconds, or even ping) would > be enough, cover some (or all, which would be very nice to have :) of > above scenarios. > Thanks for the test cases! I have covered some of them but I think I can do more tests. > > + */ > > + case VTD_INV_DESC_PC: > > + break; > > + > > + case VTD_INV_DESC_PIOTLB: > > + break; > > + > > case VTD_INV_DESC_WAIT: > > trace_vtd_inv_desc("wait", inv_desc.val[1], inv_desc.val[0]); > > if (!vtd_process_wait_desc(s, &inv_desc)) { > > @@ -3051,6 +3062,7 @@ static Property vtd_properties[] = { > > DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, > > VTD_HOST_ADDRESS_WIDTH), > > DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), > > + DEFINE_PROP_BOOL("scalable-mode", IntelIOMMUState, scalable_mode, FALSE), > > Let's start with x-scalable-mode? Less burden for all. > Sure. > > DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), > > DEFINE_PROP_END_OF_LIST(), > > }; > > @@ -3583,6 +3595,16 @@ static void vtd_init(IntelIOMMUState *s) > > s->cap |= VTD_CAP_CM; > > } > > > > + /* TODO: read cap/ecap from host to decide which cap to be exposed. */ > > + if (s->scalable_mode) { > > + if (!s->caching_mode) { > > + error_report("Need to set caching-mode for scalable mode"); > > Could I ask why? > My intention is to make guest explicitly send to make sure SLT shadowing correctly. For this point, I also have question. Why does legacy mode not check CM? If CM is not set, may the DMA remapping be wrong because SLT cannot match guest's latest change? > > + exit(1); > > + } > > + s->cap |= VTD_CAP_DWD | VTD_CAP_DRD; > > Draining is supported now so this line is not needed. > Got it, thanks! > > + s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; > > + } > > + > > vtd_reset_caches(s); > > > > /* Define registers with default values and bit semantics */ > > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h > > index 2a753c5..b01953a 100644 > > --- a/hw/i386/intel_iommu_internal.h > > +++ b/hw/i386/intel_iommu_internal.h > > @@ -190,7 +190,9 @@ > > #define VTD_ECAP_EIM (1ULL << 4) > > #define VTD_ECAP_PT (1ULL << 6) > > #define VTD_ECAP_MHMV (15ULL << 20) > > +#define VTD_ECAP_SRS (1ULL << 31) > > #define VTD_ECAP_SMTS (1ULL << 43) > > +#define VTD_ECAP_SLTS (1ULL << 46) > > > > /* CAP_REG */ > > /* (offset >> 4) << 24 */ > > @@ -209,6 +211,8 @@ > > #define VTD_CAP_DRAIN_READ (1ULL << 55) > > #define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) > > #define VTD_CAP_CM (1ULL << 7) > > +#define VTD_CAP_DWD (1ULL << 54) > > +#define VTD_CAP_DRD (1ULL << 55) > > These can be dropped too (see VTD_CAP_DRAIN above). > Thanks! [...] > > -- > > 1.9.1 > > > > Regards, > > -- > Peter Xu
On Fri, Feb 15, 2019 at 01:22:34PM +0800, Yi Sun wrote: [...] > > > + /* TODO: read cap/ecap from host to decide which cap to be exposed. */ > > > + if (s->scalable_mode) { > > > + if (!s->caching_mode) { > > > + error_report("Need to set caching-mode for scalable mode"); > > > > Could I ask why? > > > My intention is to make guest explicitly send to make sure SLT shadowing > correctly. > > For this point, I also have question. Why does legacy mode not check CM? > If CM is not set, may the DMA remapping be wrong because SLT cannot > match guest's latest change? Because CM is currently only required for device assignment. For example, if you only have an emulated device like virtio-net-pci in the guest, then you don't need the CM capability to let it work with the vIOMMU. That's because we'll walk the 2nd level IOMMU page table only when the virtio-net-pci device does DMA, and QEMU can easily do that (QEMU is emulating the DMA of the virtio-net-pci device, and QEMU has the knowledge of the guest 2nd level IOMMU page tables). Assigned devices are special because the host hardware knows nothing about guest 2nd level page tables, so QEMU needs to shadow them before DMA starts. That's also why I listed "device assignment" as a special case in the test device matrix, because that's the only case where we can torture the IOMMU page shadowing code a bit. For the scalable mode I would suppose you will still allow it to work without caching mode. The example is the same as above - when there's only emulated devices in the guest. Regards,
On 19-02-15 13:39:05, Peter Xu wrote: > On Fri, Feb 15, 2019 at 01:22:34PM +0800, Yi Sun wrote: > > [...] > > > > > + /* TODO: read cap/ecap from host to decide which cap to be exposed. */ > > > > + if (s->scalable_mode) { > > > > + if (!s->caching_mode) { > > > > + error_report("Need to set caching-mode for scalable mode"); > > > > > > Could I ask why? > > > > > My intention is to make guest explicitly send to make sure SLT shadowing > > correctly. > > > > For this point, I also have question. Why does legacy mode not check CM? > > If CM is not set, may the DMA remapping be wrong because SLT cannot > > match guest's latest change? > > Because CM is currently only required for device assignment. For > example, if you only have an emulated device like virtio-net-pci in > the guest, then you don't need the CM capability to let it work with > the vIOMMU. That's because we'll walk the 2nd level IOMMU page table > only when the virtio-net-pci device does DMA, and QEMU can easily do > that (QEMU is emulating the DMA of the virtio-net-pci device, and QEMU > has the knowledge of the guest 2nd level IOMMU page tables). Assigned > devices are special because the host hardware knows nothing about > guest 2nd level page tables, so QEMU needs to shadow them before DMA > starts. > Thanks for the explanation! As CM is must for device assignment, I may remove the check in current codes. In fact, CM is optional for SM. > That's also why I listed "device assignment" as a special case in the > test device matrix, because that's the only case where we can torture > the IOMMU page shadowing code a bit. > > For the scalable mode I would suppose you will still allow it to work > without caching mode. The example is the same as above - when there's > only emulated devices in the guest. > Yes, I verified it by using virtio-net-pci without device assignment. The netperf simple case runs well. > Regards, > > -- > Peter Xu
On 2019/2/15 下午1:39, Peter Xu wrote: > On Fri, Feb 15, 2019 at 01:22:34PM +0800, Yi Sun wrote: > > [...] > >>>> + /* TODO: read cap/ecap from host to decide which cap to be exposed. */ >>>> + if (s->scalable_mode) { >>>> + if (!s->caching_mode) { >>>> + error_report("Need to set caching-mode for scalable mode"); >>> Could I ask why? >>> >> My intention is to make guest explicitly send to make sure SLT shadowing >> correctly. >> >> For this point, I also have question. Why does legacy mode not check CM? >> If CM is not set, may the DMA remapping be wrong because SLT cannot >> match guest's latest change? > Because CM is currently only required for device assignment. For > example, if you only have an emulated device like virtio-net-pci in > the guest, then you don't need the CM capability to let it work with > the vIOMMU. That's because we'll walk the 2nd level IOMMU page table > only when the virtio-net-pci device does DMA, and QEMU can easily do > that (QEMU is emulating the DMA of the virtio-net-pci device, and QEMU > has the knowledge of the guest 2nd level IOMMU page tables). Assigned > devices are special because the host hardware knows nothing about > guest 2nd level page tables, so QEMU needs to shadow them before DMA > starts. +1 And there's dataplane (vhost) which implements device IOTLB that doesn't require shadow page table. The can ask for translation for some address themselves. > > That's also why I listed "device assignment" as a special case in the > test device matrix, because that's the only case where we can torture > the IOMMU page shadowing code a bit. > > For the scalable mode I would suppose you will still allow it to work > without caching mode. The example is the same as above - when there's > only emulated devices in the guest. Agree. Thanks > > Regards, >
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3664a00..447fdf3 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2492,6 +2492,17 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) } break; + /* + * TODO: the entity of below two cases will be implemented in future series. + * To make guest (which integrates scalable mode support patch set in + * iommu driver) work, just return true is enough so far. + */ + case VTD_INV_DESC_PC: + break; + + case VTD_INV_DESC_PIOTLB: + break; + case VTD_INV_DESC_WAIT: trace_vtd_inv_desc("wait", inv_desc.val[1], inv_desc.val[0]); if (!vtd_process_wait_desc(s, &inv_desc)) { @@ -3051,6 +3062,7 @@ static Property vtd_properties[] = { DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, VTD_HOST_ADDRESS_WIDTH), DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), + DEFINE_PROP_BOOL("scalable-mode", IntelIOMMUState, scalable_mode, FALSE), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), DEFINE_PROP_END_OF_LIST(), }; @@ -3583,6 +3595,16 @@ static void vtd_init(IntelIOMMUState *s) s->cap |= VTD_CAP_CM; } + /* TODO: read cap/ecap from host to decide which cap to be exposed. */ + if (s->scalable_mode) { + if (!s->caching_mode) { + error_report("Need to set caching-mode for scalable mode"); + exit(1); + } + s->cap |= VTD_CAP_DWD | VTD_CAP_DRD; + s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; + } + vtd_reset_caches(s); /* Define registers with default values and bit semantics */ diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 2a753c5..b01953a 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -190,7 +190,9 @@ #define VTD_ECAP_EIM (1ULL << 4) #define VTD_ECAP_PT (1ULL << 6) #define VTD_ECAP_MHMV (15ULL << 20) +#define VTD_ECAP_SRS (1ULL << 31) #define VTD_ECAP_SMTS (1ULL << 43) +#define VTD_ECAP_SLTS (1ULL << 46) /* CAP_REG */ /* (offset >> 4) << 24 */ @@ -209,6 +211,8 @@ #define VTD_CAP_DRAIN_READ (1ULL << 55) #define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) #define VTD_CAP_CM (1ULL << 7) +#define VTD_CAP_DWD (1ULL << 54) +#define VTD_CAP_DRD (1ULL << 55) /* Supported Adjusted Guest Address Widths */ #define VTD_CAP_SAGAW_SHIFT 8 @@ -340,6 +344,8 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_IEC 0x4 /* Interrupt Entry Cache Invalidate Descriptor */ #define VTD_INV_DESC_WAIT 0x5 /* Invalidation Wait Descriptor */ +#define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ +#define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ #define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ /* Masks for Invalidation Wait Descriptor*/ diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index a5da139..a04fad6 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -219,7 +219,8 @@ struct IntelIOMMUState { uint8_t womask[DMAR_REG_SIZE]; /* WO (write only - read returns 0) */ uint32_t version; - bool caching_mode; /* RO - is cap CM enabled? */ + bool caching_mode; /* RO - is cap CM enabled? */ + bool scalable_mode; /* RO - is Scalable Mode supported? */ dma_addr_t root; /* Current root table pointer */ bool root_extended; /* Type of root table (extended or not) */