[1/2] virtio-iommu: Fix 64kB host page size VFIO device assignment

Message ID 20230704111527.3424992-2-eric.auger@redhat.com
State New
Series VIRTIO-IOMMU/VFIO page size related fixes

Commit Message

Eric Auger July 4, 2023, 11:15 a.m. UTC
When running on a 64kB page size host and protecting a VFIO device
with the virtio-iommu, qemu crashes with this kind of message:

qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is incompatible
with mask 0x20010000
qemu: hardware error: vfio: DMA mapping failed, unable to continue

This is due to the fact that the IOMMU MR corresponding to the VFIO device
is only enabled very late, on domain attach, after machine init.
The device reports a minimum 64kB page size but it is too late to be
applied: virtio_iommu_set_page_size_mask() fails and this causes
vfio_listener_region_add() to end up calling hw_error().
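
For illustration, this is roughly what trips in
virtio_iommu_set_page_size_mask() (a simplified sketch using the values from
the error above, not the exact code):

    uint64_t cur_mask = 0xfffffffffffff000; /* default mask: 4kB granule     */
    uint64_t new_mask = 0x20010000;         /* host VFIO mask: 64kB | 512MB  */
    int cur_granule = ctz64(cur_mask);      /* 12 -> 4kB                     */
    int new_granule = ctz64(new_mask);      /* 16 -> 64kB                    */
    /* machine is already ready and the granules differ -> error, hw_error() */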

To work around this issue, we transiently enable the IOMMU MR on
machine init to collect the page size requirements and then restore
the bypass state.

Fixes: 90519b9053 ("virtio-iommu: Add bypass mode support to assigned device")
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 include/hw/virtio/virtio-iommu.h |  2 ++
 hw/virtio/virtio-iommu.c         | 30 ++++++++++++++++++++++++++++--
 hw/virtio/trace-events           |  1 +
 3 files changed, 31 insertions(+), 2 deletions(-)

Comments

Duan, Zhenzhong July 5, 2023, 4:52 a.m. UTC | #1
Hi Eric,

>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Sent: Tuesday, July 4, 2023 7:15 PM
>Subject: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
>assignment
>
>When running on a 64kB page size host and protecting a VFIO device
>with the virtio-iommu, qemu crashes with this kind of message:
>
>qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is incompatible
>with mask 0x20010000

Does 0x20010000 mean that only 512MB and 64KB (super page) mappings are
supported by the host IOMMU hw, and that 4KB mappings are not supported?

There is a check on the guest kernel side hinting that only 4KB is
supported. With this patch we force viommu->pgsize_bitmap to 0x20010000,
so doesn't the check below fail? Does this device still work in the guest?
Please correct me if I understand it wrong.

static int viommu_domain_finalise(struct viommu_endpoint *vdev,
                                  struct iommu_domain *domain)
{
...
        viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
        if (viommu_page_size > PAGE_SIZE) {
                dev_err(vdev->dev,
                        "granule 0x%lx larger than system page size 0x%lx\n",
                        viommu_page_size, PAGE_SIZE);
                return -ENODEV;
        }

Another question: presuming 0x20010000 does mean that only 512MB and 64KB
are supported, is the host hva->hpa mapping guaranteed to be compatible with
at least 64KB mappings? If the host MMU only supports 4KB mappings, or
something else incompatible with 0x20010000, will vfio_dma_map() fail?

>qemu: hardware error: vfio: DMA mapping failed, unable to continue
>
>This is due to the fact the IOMMU MR corresponding to the VFIO device
>is enabled very late on domain attach, after the machine init.
>The device reports a minimal 64kB page size but it is too late to be
>applied. virtio_iommu_set_page_size_mask() fails and this causes
>vfio_listener_region_add() to end up with hw_error();
>
>To work around this issue, we transiently enable the IOMMU MR on
>machine init to collect the page size requirements and then restore
>the bypass state.
>
>Fixes: 90519b9053 ("virtio-iommu: Add bypass mode support to assigned
>device")
>Signed-off-by: Eric Auger <eric.auger@redhat.com>
>---
> include/hw/virtio/virtio-iommu.h |  2 ++
> hw/virtio/virtio-iommu.c         | 30 ++++++++++++++++++++++++++++--
> hw/virtio/trace-events           |  1 +
> 3 files changed, 31 insertions(+), 2 deletions(-)
>
>diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-
>iommu.h
>index 2ad5ee320b..a93fc5383e 100644
>--- a/include/hw/virtio/virtio-iommu.h
>+++ b/include/hw/virtio/virtio-iommu.h
>@@ -61,6 +61,8 @@ struct VirtIOIOMMU {
>     QemuRecMutex mutex;
>     GTree *endpoints;
>     bool boot_bypass;
>+    Notifier machine_done;
>+    bool granule_frozen;
> };
>
> #endif
>diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
>index 1cd258135d..1eaf81bab5 100644
>--- a/hw/virtio/virtio-iommu.c
>+++ b/hw/virtio/virtio-iommu.c
>@@ -24,6 +24,7 @@
> #include "hw/virtio/virtio.h"
> #include "sysemu/kvm.h"
> #include "sysemu/reset.h"
>+#include "sysemu/sysemu.h"
> #include "qapi/error.h"
> #include "qemu/error-report.h"
> #include "trace.h"
>@@ -1106,12 +1107,12 @@ static int
>virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
>     }
>
>     /*
>-     * After the machine is finalized, we can't change the mask anymore. If by
>+     * Once the granule is frozen we can't change the mask anymore. If by
>      * chance the hotplugged device supports the same granule, we can still
>      * accept it. Having a different masks is possible but the guest will use
>      * sub-optimal block sizes, so warn about it.
>      */
>-    if (phase_check(PHASE_MACHINE_READY)) {
>+    if (s->granule_frozen) {
>         int new_granule = ctz64(new_mask);
>         int cur_granule = ctz64(cur_mask);
>
>@@ -1146,6 +1147,27 @@ static void virtio_iommu_system_reset(void
>*opaque)
>
> }
>
>+static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
>+{
>+    VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done);
>+    bool boot_bypass = s->config.bypass;
>+    int granule;
>+
>+    /*
>+     * Transient IOMMU MR enable to collect page_size_mask requirement
>+     * through memory_region_iommu_set_page_size_mask() called by
>+     * VFIO region_add() callback
>+     */
>+    s->config.bypass = 0;
>+    virtio_iommu_switch_address_space_all(s);
>+    /* restore default */
>+    s->config.bypass = boot_bypass;
>+    virtio_iommu_switch_address_space_all(s);
>+    s->granule_frozen = true;
>+    granule = ctz64(s->config.page_size_mask);
>+    trace_virtio_iommu_freeze_granule(BIT(granule));
>+}

It looks a bit heavyweight just to get the page_size_mask from the host side,
but maybe this is the only way with the current interface.

Thanks
Zhenzhong

>+
> static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
> {
>     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>@@ -1189,6 +1211,9 @@ static void virtio_iommu_device_realize(DeviceState
>*dev, Error **errp)
>         error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
>     }
>
>+    s->machine_done.notify = virtio_iommu_freeze_granule;
>+    qemu_add_machine_init_done_notifier(&s->machine_done);
>+
>     qemu_register_reset(virtio_iommu_system_reset, s);
> }
>
>@@ -1198,6 +1223,7 @@ static void
>virtio_iommu_device_unrealize(DeviceState *dev)
>     VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
>
>     qemu_unregister_reset(virtio_iommu_system_reset, s);
>+    qemu_remove_machine_init_done_notifier(&s->machine_done);
>
>     g_hash_table_destroy(s->as_by_busptr);
>     if (s->domains) {
>diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
>index 8f8d05cf9b..68b752e304 100644
>--- a/hw/virtio/trace-events
>+++ b/hw/virtio/trace-events
>@@ -131,6 +131,7 @@ virtio_iommu_set_page_size_mask(const char *name,
>uint64_t old, uint64_t new) "m
> virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s"
> virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
> virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn,
>bool on) "Device %02x:%02x.%x switching address space (iommu
>enabled=%d)"
>+virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to
>0x%"PRIx64
>
> # virtio-mem.c
> virtio_mem_send_response(uint16_t type) "type=%" PRIu16
>--
>2.38.1
Eric Auger July 5, 2023, 6:19 a.m. UTC | #2
Hi Zhenzhong,
On 7/5/23 06:52, Duan, Zhenzhong wrote:
> Hi Eric,
>
>> -----Original Message-----
>> From: Eric Auger <eric.auger@redhat.com>
>> Sent: Tuesday, July 4, 2023 7:15 PM
>> Subject: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
>> assignment
>>
>> When running on a 64kB page size host and protecting a VFIO device
>> with the virtio-iommu, qemu crashes with this kind of message:
>>
>> qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is incompatible
>> with mask 0x20010000
> Does 0x20010000 mean only  512MB and 64KB super page mapping is
> supported for host iommu hw? 4KB mapping not supported?
Yes, that's correct. In that case the host has 64kB pages and 4kB is not
supported.
>
> There is a check in guest kernel side hint only 4KB is supported, with
> this patch we force viommu->pgsize_bitmap to 0x20010000
> and fail below check? Does this device work in guest?
> Please correct me if I understand wrong.
My guest also uses 64kB pages, so the check below succeeds. Indeed, if your
host has a larger page size than your guest, it fails with:
[    2.147031] virtio-pci 0000:00:01.0: granule 0x10000 larger than system page size 0x1000
[    7.231128] ixgbevf 0000:00:02.0: granule 0x10000 larger than system page size 0x1000

>
> static int viommu_domain_finalise(struct viommu_endpoint *vdev,
>                                   struct iommu_domain *domain)
> {
> ...
>         viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
>         if (viommu_page_size > PAGE_SIZE) {
>                 dev_err(vdev->dev,
>                         "granule 0x%lx larger than system page size 0x%lx\n",
>                         viommu_page_size, PAGE_SIZE);
>                 return -ENODEV;
>         }
>
> Another question is: Presume 0x20010000 does mean only 512MB and 64KB
> is supported. Is host hva->hpa mapping ensured to be compatible with at least
> 64KB mapping? If host mmu only support 4KB mapping or other non-compatible
> with 0x20010000, will vfio_dma_map() fail?
The page size mask is retrieved with the VFIO_IOMMU_GET_INFO uapi, which
returns the host vfio_iommu_type1 vfio_iommu->pgsize_bitmap. The latter is
initialized to the host PAGE_MASK and later restricted in
vfio_iommu_type1_attach_group() through the vfio_update_pgsize_bitmap() calls.

So yes, both the IOMMU and CPU page sizes are guaranteed to be compatible.
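
For reference, a minimal userspace sketch of how that bitmap is read
(container_fd is assumed to be an already configured VFIO type1 container;
error handling trimmed):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static uint64_t get_host_iova_pgsizes(int container_fd)
    {
        struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };

        /* e.g. 0x20010000 on a 64kB Arm host: 64kB granule + 512MB blocks */
        if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &info) ||
            !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
            return 0;
        }
        return info.iova_pgsizes;
    }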

>
>> qemu: hardware error: vfio: DMA mapping failed, unable to continue
>>
>> This is due to the fact the IOMMU MR corresponding to the VFIO device
>> is enabled very late on domain attach, after the machine init.
>> The device reports a minimal 64kB page size but it is too late to be
>> applied. virtio_iommu_set_page_size_mask() fails and this causes
>> vfio_listener_region_add() to end up with hw_error();
>>
>> To work around this issue, we transiently enable the IOMMU MR on
>> machine init to collect the page size requirements and then restore
>> the bypass state.
>>
>> Fixes: 90519b9053 ("virtio-iommu: Add bypass mode support to assigned
>> device")
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>> ---
>> include/hw/virtio/virtio-iommu.h |  2 ++
>> hw/virtio/virtio-iommu.c         | 30 ++++++++++++++++++++++++++++--
>> hw/virtio/trace-events           |  1 +
>> 3 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-
>> iommu.h
>> index 2ad5ee320b..a93fc5383e 100644
>> --- a/include/hw/virtio/virtio-iommu.h
>> +++ b/include/hw/virtio/virtio-iommu.h
>> @@ -61,6 +61,8 @@ struct VirtIOIOMMU {
>>     QemuRecMutex mutex;
>>     GTree *endpoints;
>>     bool boot_bypass;
>> +    Notifier machine_done;
>> +    bool granule_frozen;
>> };
>>
>> #endif
>> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
>> index 1cd258135d..1eaf81bab5 100644
>> --- a/hw/virtio/virtio-iommu.c
>> +++ b/hw/virtio/virtio-iommu.c
>> @@ -24,6 +24,7 @@
>> #include "hw/virtio/virtio.h"
>> #include "sysemu/kvm.h"
>> #include "sysemu/reset.h"
>> +#include "sysemu/sysemu.h"
>> #include "qapi/error.h"
>> #include "qemu/error-report.h"
>> #include "trace.h"
>> @@ -1106,12 +1107,12 @@ static int
>> virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
>>     }
>>
>>     /*
>> -     * After the machine is finalized, we can't change the mask anymore. If by
>> +     * Once the granule is frozen we can't change the mask anymore. If by
>>      * chance the hotplugged device supports the same granule, we can still
>>      * accept it. Having a different masks is possible but the guest will use
>>      * sub-optimal block sizes, so warn about it.
>>      */
>> -    if (phase_check(PHASE_MACHINE_READY)) {
>> +    if (s->granule_frozen) {
>>         int new_granule = ctz64(new_mask);
>>         int cur_granule = ctz64(cur_mask);
>>
>> @@ -1146,6 +1147,27 @@ static void virtio_iommu_system_reset(void
>> *opaque)
>>
>> }
>>
>> +static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
>> +{
>> +    VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done);
>> +    bool boot_bypass = s->config.bypass;
>> +    int granule;
>> +
>> +    /*
>> +     * Transient IOMMU MR enable to collect page_size_mask requirement
>> +     * through memory_region_iommu_set_page_size_mask() called by
>> +     * VFIO region_add() callback
>> +     */
>> +    s->config.bypass = 0;
>> +    virtio_iommu_switch_address_space_all(s);
>> +    /* restore default */
>> +    s->config.bypass = boot_bypass;
>> +    virtio_iommu_switch_address_space_all(s);
>> +    s->granule_frozen = true;
>> +    granule = ctz64(s->config.page_size_mask);
>> +    trace_virtio_iommu_freeze_granule(BIT(granule));
>> +}
> It looks a bit heavy here just in order to get page_size_mask from host side.
> But maybe this is the only way with current interface.

The problem comes from the fact that the regions are aliased due to the
bypass, so vfio_listener_region_add() does not get a chance to be called
until the actual domain attach. I therefore do not see any other way than
transiently enabling the region.

At least I could check whether boot bypass is set before doing that dance. I
will respin with that.
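
Something along these lines (hypothetical sketch of the respin, not the
final code):

    static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
    {
        VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done);
        int granule;

        if (s->config.bypass) {
            /*
             * Transiently enable the IOMMU MRs to collect the page size
             * requirements, then restore boot bypass. Without bypass the
             * MRs are already enabled, so there is nothing to do.
             */
            s->config.bypass = false;
            virtio_iommu_switch_address_space_all(s);
            s->config.bypass = true;
            virtio_iommu_switch_address_space_all(s);
        }
        s->granule_frozen = true;
        granule = ctz64(s->config.page_size_mask);
        trace_virtio_iommu_freeze_granule(BIT(granule));
    }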

Thanks

Eric
>
> Thanks
> Zhenzhong
>
>> +
>> static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
>> {
>>     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>> @@ -1189,6 +1211,9 @@ static void virtio_iommu_device_realize(DeviceState
>> *dev, Error **errp)
>>         error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
>>     }
>>
>> +    s->machine_done.notify = virtio_iommu_freeze_granule;
>> +    qemu_add_machine_init_done_notifier(&s->machine_done);
>> +
>>     qemu_register_reset(virtio_iommu_system_reset, s);
>> }
>>
>> @@ -1198,6 +1223,7 @@ static void
>> virtio_iommu_device_unrealize(DeviceState *dev)
>>     VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
>>
>>     qemu_unregister_reset(virtio_iommu_system_reset, s);
>> +    qemu_remove_machine_init_done_notifier(&s->machine_done);
>>
>>     g_hash_table_destroy(s->as_by_busptr);
>>     if (s->domains) {
>> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
>> index 8f8d05cf9b..68b752e304 100644
>> --- a/hw/virtio/trace-events
>> +++ b/hw/virtio/trace-events
>> @@ -131,6 +131,7 @@ virtio_iommu_set_page_size_mask(const char *name,
>> uint64_t old, uint64_t new) "m
>> virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s"
>> virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
>> virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn,
>> bool on) "Device %02x:%02x.%x switching address space (iommu
>> enabled=%d)"
>> +virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to
>> 0x%"PRIx64
>>
>> # virtio-mem.c
>> virtio_mem_send_response(uint16_t type) "type=%" PRIu16
>> --
>> 2.38.1
Duan, Zhenzhong July 5, 2023, 8:11 a.m. UTC | #3
>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Sent: Wednesday, July 5, 2023 2:20 PM
>Subject: Re: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
>assignment
>
>Hi Zhenzhong,
>On 7/5/23 06:52, Duan, Zhenzhong wrote:
>> Hi Eric,
>>
>>> -----Original Message-----
>>> From: Eric Auger <eric.auger@redhat.com>
>>> Sent: Tuesday, July 4, 2023 7:15 PM
>>> Subject: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
>>> assignment
>>>
>>> When running on a 64kB page size host and protecting a VFIO device
>>> with the virtio-iommu, qemu crashes with this kind of message:
>>>
>>> qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is incompatible
>>> with mask 0x20010000
>> Does 0x20010000 mean only  512MB and 64KB super page mapping is
>> supported for host iommu hw? 4KB mapping not supported?
>Yes that's correct. In that case the host has 64kB page and 4kB is not
>supported.
>>
>> There is a check in guest kernel side hint only 4KB is supported, with
>> this patch we force viommu->pgsize_bitmap to 0x20010000
>> and fail below check? Does this device work in guest?
>> Please correct me if I understand wrong.
>my guest also has 64kB so the check hereafter succeeds. effectively in
>case your host has a larger page size than your guest it fails with
>[    2.147031] virtio-pci 0000:00:01.0: granule 0x10000 larger than
>system page size 0x1000
>[    7.231128] ixgbevf 0000:00:02.0: granule 0x10000 larger than system
>page size 0x1000

Oh, I see. I had taken it for granted that PAGE_SIZE was 4KB.

>
>>
>> static int viommu_domain_finalise(struct viommu_endpoint *vdev,
>>                                   struct iommu_domain *domain)
>> {
>> ...
>>         viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
>>         if (viommu_page_size > PAGE_SIZE) {
>>                 dev_err(vdev->dev,
>>                         "granule 0x%lx larger than system page size 0x%lx\n",
>>                         viommu_page_size, PAGE_SIZE);
>>                 return -ENODEV;
>>         }
>>
>> Another question is: Presume 0x20010000 does mean only 512MB and 64KB
>> is supported. Is host hva->hpa mapping ensured to be compatible with at least
>> 64KB mapping? If host mmu only support 4KB mapping or other non-compatible
>> with 0x20010000, will vfio_dma_map() fail?
>the page size mask is retrieved with VFIO_IOMMU_GET_INFO uapi
>which returns host vfio_iommu_type1 vfio_iommu->pgsize_bitmap. This
>latter is initialized to host PAGE_MASK and later restricted on
>vfio_iommu_type1_attach_group though the vfio_update_pgsize_bitmap()
>calls

Understood, thanks for your analysis.

>
>so yes both IOMMU and CPU page size are garanteed to be compatible.
>
>>
>>> qemu: hardware error: vfio: DMA mapping failed, unable to continue
>>>
>>> This is due to the fact the IOMMU MR corresponding to the VFIO device
>>> is enabled very late on domain attach, after the machine init.
>>> The device reports a minimal 64kB page size but it is too late to be
>>> applied. virtio_iommu_set_page_size_mask() fails and this causes
>>> vfio_listener_region_add() to end up with hw_error();
>>>
>>> To work around this issue, we transiently enable the IOMMU MR on
>>> machine init to collect the page size requirements and then restore
>>> the bypass state.
>>>
>>> Fixes: 90519b9053 ("virtio-iommu: Add bypass mode support to assigned
>>> device")
>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>> ---
>>> include/hw/virtio/virtio-iommu.h |  2 ++
>>> hw/virtio/virtio-iommu.c         | 30 ++++++++++++++++++++++++++++--
>>> hw/virtio/trace-events           |  1 +
>>> 3 files changed, 31 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-
>>> iommu.h
>>> index 2ad5ee320b..a93fc5383e 100644
>>> --- a/include/hw/virtio/virtio-iommu.h
>>> +++ b/include/hw/virtio/virtio-iommu.h
>>> @@ -61,6 +61,8 @@ struct VirtIOIOMMU {
>>>     QemuRecMutex mutex;
>>>     GTree *endpoints;
>>>     bool boot_bypass;
>>> +    Notifier machine_done;
>>> +    bool granule_frozen;
>>> };
>>>
>>> #endif
>>> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
>>> index 1cd258135d..1eaf81bab5 100644
>>> --- a/hw/virtio/virtio-iommu.c
>>> +++ b/hw/virtio/virtio-iommu.c
>>> @@ -24,6 +24,7 @@
>>> #include "hw/virtio/virtio.h"
>>> #include "sysemu/kvm.h"
>>> #include "sysemu/reset.h"
>>> +#include "sysemu/sysemu.h"
>>> #include "qapi/error.h"
>>> #include "qemu/error-report.h"
>>> #include "trace.h"
>>> @@ -1106,12 +1107,12 @@ static int
>>> virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
>>>     }
>>>
>>>     /*
>>> -     * After the machine is finalized, we can't change the mask anymore. If by
>>> +     * Once the granule is frozen we can't change the mask anymore. If by
>>>      * chance the hotplugged device supports the same granule, we can still
>>>      * accept it. Having a different masks is possible but the guest will use
>>>      * sub-optimal block sizes, so warn about it.
>>>      */
>>> -    if (phase_check(PHASE_MACHINE_READY)) {
>>> +    if (s->granule_frozen) {
>>>         int new_granule = ctz64(new_mask);
>>>         int cur_granule = ctz64(cur_mask);
>>>
>>> @@ -1146,6 +1147,27 @@ static void virtio_iommu_system_reset(void
>>> *opaque)
>>>
>>> }
>>>
>>> +static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
>>> +{
>>> +    VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done);
>>> +    bool boot_bypass = s->config.bypass;
>>> +    int granule;
>>> +
>>> +    /*
>>> +     * Transient IOMMU MR enable to collect page_size_mask requirement
>>> +     * through memory_region_iommu_set_page_size_mask() called by
>>> +     * VFIO region_add() callback
>>> +     */
>>> +    s->config.bypass = 0;
>>> +    virtio_iommu_switch_address_space_all(s);
>>> +    /* restore default */
>>> +    s->config.bypass = boot_bypass;
>>> +    virtio_iommu_switch_address_space_all(s);
>>> +    s->granule_frozen = true;
>>> +    granule = ctz64(s->config.page_size_mask);
>>> +    trace_virtio_iommu_freeze_granule(BIT(granule));
>>> +}
>> It looks a bit heavy here just in order to get page_size_mask from host side.
>> But maybe this is the only way with current interface.
>
>the problem comes from the fact the regions are aliased due to the
>bypass and vfio_listener_region_add() does not get a chance to be called
>until the actual domain attach. So I do not see any other way to
>transiently enable the region.
Agree.

>
>At least I could check if boot bypass is set before doing that dance. I
>will respin with that.
Makes sense.

Thanks
Zhenzhong
Jean-Philippe Brucker July 5, 2023, 8:29 a.m. UTC | #4
On Wed, Jul 05, 2023 at 04:52:09AM +0000, Duan, Zhenzhong wrote:
> Hi Eric,
> 
> >-----Original Message-----
> >From: Eric Auger <eric.auger@redhat.com>
> >Sent: Tuesday, July 4, 2023 7:15 PM
> >Subject: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
> >assignment
> >
> >When running on a 64kB page size host and protecting a VFIO device
> >with the virtio-iommu, qemu crashes with this kind of message:
> >
> >qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is incompatible
> >with mask 0x20010000
> 
> Does 0x20010000 mean only  512MB and 64KB super page mapping is
> supported for host iommu hw? 4KB mapping not supported?

It's not a restriction of the HW IOMMU but of the host kernel. An Arm SMMU
can implement 4KB, 16KB and/or 64KB granules, but the host kernel only
advertises through VFIO the granule corresponding to the host PAGE_SIZE. This
restriction is done by arm_lpae_restrict_pgsizes() in order to choose a
page size when a device is driven by the host.
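
Conceptually the restriction boils down to something like this (illustrative
sketch only, not the actual kernel code):

    /* keep the granule matching PAGE_SIZE plus the block sizes it can form */
    static unsigned long restricted_pgsizes(unsigned long page_size)
    {
        switch (page_size) {
        case SZ_4K:  return SZ_4K  | SZ_2M  | SZ_1G;  /* 0x40201000 */
        case SZ_16K: return SZ_16K | SZ_32M;          /* 0x02004000 */
        case SZ_64K: return SZ_64K | SZ_512M;         /* 0x20010000 */
        default:     return 0;
        }
    }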

> 
> There is a check in guest kernel side hint only 4KB is supported, with
> this patch we force viommu->pgsize_bitmap to 0x20010000
> and fail below check? Does this device work in guest?
> Please correct me if I understand wrong.

Right, a guest using 4KB pages under a host that uses 64KB is not
supported, because if the guest attempts to dma_map a 4K page, the IOMMU
cannot create a mapping small enough; the mapping would have to spill over
into neighbouring guest memory.
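
To illustrate with numbers (hypothetical addresses, 64kB host granule vs 4kB
guest pages):

    #define HOST_GRANULE 0x10000UL  /* 64kB */
    #define GUEST_PAGE   0x1000UL   /*  4kB */

    unsigned long iova  = 0x8000a000;                 /* one 4kB guest page */
    unsigned long start = iova & ~(HOST_GRANULE - 1); /* 0x80000000         */
    unsigned long end   = start + HOST_GRANULE;       /* 0x80010000         */
    /* the smallest possible mapping covers 16 guest pages, 15 of them unrelated */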

One possible solution would be supporting multiple page granules. If we
added a page granule negotiation through VFIO and virtio-iommu, then the
guest could pick the page size it wants. But this requires changes to the
Linux UAPI, so it isn't a priority at the moment, because we're trying to
enable nesting, which would support 64K-host/4K-guest as well.

See also the discussion on the patch that introduced the guest check
https://lore.kernel.org/linux-iommu/20200318114047.1518048-1-jean-philippe@linaro.org/

Thanks,
Jean
Duan, Zhenzhong July 5, 2023, 10:13 a.m. UTC | #5
>-----Original Message-----
>From: Jean-Philippe Brucker <jean-philippe@linaro.org>
>Sent: Wednesday, July 5, 2023 4:29 PM
>Subject: Re: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
>assignment
>
>On Wed, Jul 05, 2023 at 04:52:09AM +0000, Duan, Zhenzhong wrote:
>> Hi Eric,
>>
>> >-----Original Message-----
>> >From: Eric Auger <eric.auger@redhat.com>
>> >Sent: Tuesday, July 4, 2023 7:15 PM
>> >Subject: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO
>> >device assignment
>> >
>> >When running on a 64kB page size host and protecting a VFIO device
>> >with the virtio-iommu, qemu crashes with this kind of message:
>> >
>> >qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is incompatible
>> >with mask 0x20010000
>>
>> Does 0x20010000 mean only  512MB and 64KB super page mapping is
>> supported for host iommu hw? 4KB mapping not supported?
>
>It's not a restriction by the HW IOMMU, but the host kernel. An Arm SMMU
>can implement 4KB, 16KB and/or 64KB granules, but the host kernel only
>advertises through VFIO the granule corresponding to host PAGE_SIZE. This
>restriction is done by arm_lpae_restrict_pgsizes() in order to choose a page
>size when a device is driven by the host.

Just curious: why not advertise to VFIO the granules the Arm SMMU actually
implements, e.g. 4KB, 16KB or 64KB, rather than the ones restricted by
arm_lpae_restrict_pgsizes(), e.g. (SZ_4K | SZ_2M | SZ_1G) for SZ_4K?
(SZ_4K | SZ_2M | SZ_1G) does not look like the real hardware granules of an
Arm SMMU.

>
>>
>> There is a check in guest kernel side hint only 4KB is supported, with
>> this patch we force viommu->pgsize_bitmap to 0x20010000 and fail below
>> check? Does this device work in guest?
>> Please correct me if I understand wrong.
>
>Right, a guest using 4KB pages under a host that uses 64KB is not supported,
>because if the guest attempts to dma_map a 4K page, the IOMMU cannot
>create a mapping small enough, the mapping would have to spill over
>neighbouring guest memory.
>
>One possible solution would be supporting multiple page granules. If we
>added a page granule negotiation through VFIO and virtio-iommu then the
>guest could pick the page size it wants. But this requires changes to Linux UAPI
>so isn't a priority at the moment, because we're trying to enable nesting which
>would support 64K-host/4K-guest as well.
>
>See also the discussion on the patch that introduced the guest check
>https://lore.kernel.org/linux-iommu/20200318114047.1518048-1-jean-
>philippe@linaro.org/

Clear, thanks for sharing the history.

Regards
Zhenzhong
Jean-Philippe Brucker July 5, 2023, 11:33 a.m. UTC | #6
On Wed, Jul 05, 2023 at 10:13:11AM +0000, Duan, Zhenzhong wrote:
> >-----Original Message-----
> >From: Jean-Philippe Brucker <jean-philippe@linaro.org>
> >Sent: Wednesday, July 5, 2023 4:29 PM
> >Subject: Re: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
> >assignment
> >
> >On Wed, Jul 05, 2023 at 04:52:09AM +0000, Duan, Zhenzhong wrote:
> >> Hi Eric,
> >>
> >> >-----Original Message-----
> >> >From: Eric Auger <eric.auger@redhat.com>
> >> >Sent: Tuesday, July 4, 2023 7:15 PM
> >> >Subject: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO
> >> >device assignment
> >> >
> >> >When running on a 64kB page size host and protecting a VFIO device
> >> >with the virtio-iommu, qemu crashes with this kind of message:
> >> >
> >> >qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is incompatible
> >> >with mask 0x20010000
> >>
> >> Does 0x20010000 mean only  512MB and 64KB super page mapping is
> >> supported for host iommu hw? 4KB mapping not supported?
> >
> >It's not a restriction by the HW IOMMU, but the host kernel. An Arm SMMU
> >can implement 4KB, 16KB and/or 64KB granules, but the host kernel only
> >advertises through VFIO the granule corresponding to host PAGE_SIZE. This
> >restriction is done by arm_lpae_restrict_pgsizes() in order to choose a page
> >size when a device is driven by the host.
> 
> Just curious why not advertises the Arm SMMU implemented granules to VFIO
> Eg:4KB, 16KB or 64KB granules?

That's possible, but the difficulty is setting up the page table
configuration afterwards. At the moment the host installs the HW page
tables early, when QEMU sets up the VFIO container. That initializes the
page size bitmap because configuring the HW page tables requires picking
one of the supported granules (setting TG0 in the SMMU Context
Descriptor).

If the guest could pick a granule via an ATTACH request, then QEMU would
need to tell the host kernel to install page tables with the desired
granule at that point. That would require a new interface in VFIO to
reconfigure a live container and replace the existing HW page tables
configuration (before ATTACH, the container must already be configured
with working page tables in order to implement boot-bypass, I think).

> But arm_lpae_restrict_pgsizes() restricted ones,
> Eg: for SZ_4K, (SZ_4K | SZ_2M | SZ_1G).
> (SZ_4K | SZ_2M | SZ_1G) looks not real hardware granules of Arm SMMU.

Yes, the granule here is 4K, and the other bits only indicate huge page
sizes, so the user can try to optimize large mappings to use fewer TLB
entries where possible.
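
In other words, decoding such a bitmap gives (sketch, reusing the ctz64()
helper the QEMU patch already relies on):

    uint64_t mask    = 0x40201000ULL;        /* SZ_4K | SZ_2M | SZ_1G    */
    uint64_t granule = 1ULL << ctz64(mask);  /* 0x1000: the granule      */
    uint64_t blocks  = mask & ~granule;      /* 0x40200000: block sizes  */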

Thanks,
Jean
Duan, Zhenzhong July 6, 2023, 9 a.m. UTC | #7
>-----Original Message-----
>From: Jean-Philippe Brucker <jean-philippe@linaro.org>
>Sent: Wednesday, July 5, 2023 7:33 PM
>Subject: Re: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
>assignment
>
>On Wed, Jul 05, 2023 at 10:13:11AM +0000, Duan, Zhenzhong wrote:
>> >-----Original Message-----
>> >From: Jean-Philippe Brucker <jean-philippe@linaro.org>
>> >Sent: Wednesday, July 5, 2023 4:29 PM
>> >Subject: Re: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO
>> >device assignment
>> >
>> >On Wed, Jul 05, 2023 at 04:52:09AM +0000, Duan, Zhenzhong wrote:
>> >> Hi Eric,
>> >>
>> >> >-----Original Message-----
>> >> >From: Eric Auger <eric.auger@redhat.com>
>> >> >Sent: Tuesday, July 4, 2023 7:15 PM
>> >> >Subject: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO
>> >> >device assignment
>> >> >
>> >> >When running on a 64kB page size host and protecting a VFIO device
>> >> >with the virtio-iommu, qemu crashes with this kind of message:
>> >> >
>> >> >qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is
>> >> >incompatible with mask 0x20010000
>> >>
>> >> Does 0x20010000 mean only  512MB and 64KB super page mapping is
>> >> supported for host iommu hw? 4KB mapping not supported?
>> >
>> >It's not a restriction by the HW IOMMU, but the host kernel. An Arm
>> >SMMU can implement 4KB, 16KB and/or 64KB granules, but the host
>> >kernel only advertises through VFIO the granule corresponding to host
>> >PAGE_SIZE. This restriction is done by arm_lpae_restrict_pgsizes() in
>> >order to choose a page size when a device is driven by the host.
>>
>> Just curious why not advertises the Arm SMMU implemented granules to
>> VFIO Eg:4KB, 16KB or 64KB granules?
>
>That's possible, but the difficulty is setting up the page table configuration
>afterwards. At the moment the host installs the HW page tables early, when
>QEMU sets up the VFIO container. That initializes the page size bitmap
>because configuring the HW page tables requires picking one of the supported
>granules (setting TG0 in the SMMU Context Descriptor).
>
>If the guest could pick a granule via an ATTACH request, then QEMU would
>need to tell the host kernel to install page tables with the desired granule at
>that point. That would require a new interface in VFIO to reconfigure a live
>container and replace the existing HW page tables configuration (before
>ATTACH, the container must already be configured with working page tables
>in order to implement boot-bypass, I think).

Thanks, that's clear. It looks different from x86: for an x86 guest, a 1GB
mapping on the guest side may be shadowed with 4KB mappings on the host side,
in my understanding.

Regards
Zhenzhong

Patch

diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
index 2ad5ee320b..a93fc5383e 100644
--- a/include/hw/virtio/virtio-iommu.h
+++ b/include/hw/virtio/virtio-iommu.h
@@ -61,6 +61,8 @@  struct VirtIOIOMMU {
     QemuRecMutex mutex;
     GTree *endpoints;
     bool boot_bypass;
+    Notifier machine_done;
+    bool granule_frozen;
 };
 
 #endif
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 1cd258135d..1eaf81bab5 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -24,6 +24,7 @@ 
 #include "hw/virtio/virtio.h"
 #include "sysemu/kvm.h"
 #include "sysemu/reset.h"
+#include "sysemu/sysemu.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "trace.h"
@@ -1106,12 +1107,12 @@  static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
     }
 
     /*
-     * After the machine is finalized, we can't change the mask anymore. If by
+     * Once the granule is frozen we can't change the mask anymore. If by
      * chance the hotplugged device supports the same granule, we can still
      * accept it. Having a different masks is possible but the guest will use
      * sub-optimal block sizes, so warn about it.
      */
-    if (phase_check(PHASE_MACHINE_READY)) {
+    if (s->granule_frozen) {
         int new_granule = ctz64(new_mask);
         int cur_granule = ctz64(cur_mask);
 
@@ -1146,6 +1147,27 @@  static void virtio_iommu_system_reset(void *opaque)
 
 }
 
+static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
+{
+    VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done);
+    bool boot_bypass = s->config.bypass;
+    int granule;
+
+    /*
+     * Transient IOMMU MR enable to collect page_size_mask requirement
+     * through memory_region_iommu_set_page_size_mask() called by
+     * VFIO region_add() callback
+     */
+    s->config.bypass = 0;
+    virtio_iommu_switch_address_space_all(s);
+    /* restore default */
+    s->config.bypass = boot_bypass;
+    virtio_iommu_switch_address_space_all(s);
+    s->granule_frozen = true;
+    granule = ctz64(s->config.page_size_mask);
+    trace_virtio_iommu_freeze_granule(BIT(granule));
+}
+
 static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -1189,6 +1211,9 @@  static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
         error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
     }
 
+    s->machine_done.notify = virtio_iommu_freeze_granule;
+    qemu_add_machine_init_done_notifier(&s->machine_done);
+
     qemu_register_reset(virtio_iommu_system_reset, s);
 }
 
@@ -1198,6 +1223,7 @@  static void virtio_iommu_device_unrealize(DeviceState *dev)
     VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
 
     qemu_unregister_reset(virtio_iommu_system_reset, s);
+    qemu_remove_machine_init_done_notifier(&s->machine_done);
 
     g_hash_table_destroy(s->as_by_busptr);
     if (s->domains) {
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 8f8d05cf9b..68b752e304 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -131,6 +131,7 @@  virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "m
 virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s"
 virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
 virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
+virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to 0x%"PRIx64
 
 # virtio-mem.c
 virtio_mem_send_response(uint16_t type) "type=%" PRIu16