[for-4.2,v10,08/15] virtio-iommu: Implement map/unmap
diff mbox series

Message ID 20190730172137.23114-9-eric.auger@redhat.com
State New
Headers show
Series
  • VIRTIO-IOMMU device
Related show

Commit Message

Eric Auger July 30, 2019, 5:21 p.m. UTC
This patch implements virtio_iommu_map/unmap.

Signed-off-by: Eric Auger <eric.auger@redhat.com>

---

v5 -> v6:
- use new v0.6 fields
- replace error_report by qemu_log_mask

v3 -> v4:
- implement unmap semantics as specified in v0.4
---
 hw/virtio/trace-events   |  3 ++
 hw/virtio/virtio-iommu.c | 94 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 95 insertions(+), 2 deletions(-)

Comments

Peter Xu Aug. 19, 2019, 8:11 a.m. UTC | #1
On Tue, Jul 30, 2019 at 07:21:30PM +0200, Eric Auger wrote:

[...]

> +    mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
> +
> +    while (mapping) {
> +        viommu_interval current;
> +        uint64_t low  = mapping->virt_addr;
> +        uint64_t high = mapping->virt_addr + mapping->size - 1;
> +
> +        current.low = low;
> +        current.high = high;
> +
> +        if (low == interval.low && size >= mapping->size) {
> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> +            interval.low = high + 1;
> +            trace_virtio_iommu_unmap_left_interval(current.low, current.high,
> +                interval.low, interval.high);
> +        } else if (high == interval.high && size >= mapping->size) {
> +            trace_virtio_iommu_unmap_right_interval(current.low, current.high,
> +                interval.low, interval.high);
> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> +            interval.high = low - 1;
> +        } else if (low > interval.low && high < interval.high) {
> +            trace_virtio_iommu_unmap_inc_interval(current.low, current.high);
> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> +        } else {
> +            break;
> +        }
> +        if (interval.low >= interval.high) {
> +            return VIRTIO_IOMMU_S_OK;
> +        } else {
> +            mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
> +        }
> +    }
> +
> +    if (mapping) {
> +        qemu_log_mask(LOG_GUEST_ERROR,
> +                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
> +                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
> +                     __func__, interval.low, size,
> +                     mapping->virt_addr, mapping->size);
> +    } else {
> +        return VIRTIO_IOMMU_S_OK;
> +    }
> +
> +    return VIRTIO_IOMMU_S_INVAL;

Could the above chunk be simplified as something like below?

  while ((mapping = g_tree_lookup(domain->mappings, &interval))) {
    g_tree_remove(domain->mappings, mapping);
  }

Thanks,
Eric Auger Sept. 3, 2019, 11:37 a.m. UTC | #2
Hi Peter,

On 8/19/19 10:11 AM, Peter Xu wrote:
> On Tue, Jul 30, 2019 at 07:21:30PM +0200, Eric Auger wrote:
> 
> [...]
> 
>> +    mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
>> +
>> +    while (mapping) {
>> +        viommu_interval current;
>> +        uint64_t low  = mapping->virt_addr;
>> +        uint64_t high = mapping->virt_addr + mapping->size - 1;
>> +
>> +        current.low = low;
>> +        current.high = high;
>> +
>> +        if (low == interval.low && size >= mapping->size) {
>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
>> +            interval.low = high + 1;
>> +            trace_virtio_iommu_unmap_left_interval(current.low, current.high,
>> +                interval.low, interval.high);
>> +        } else if (high == interval.high && size >= mapping->size) {
>> +            trace_virtio_iommu_unmap_right_interval(current.low, current.high,
>> +                interval.low, interval.high);
>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
>> +            interval.high = low - 1;
>> +        } else if (low > interval.low && high < interval.high) {
>> +            trace_virtio_iommu_unmap_inc_interval(current.low, current.high);
>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
>> +        } else {
>> +            break;
>> +        }
>> +        if (interval.low >= interval.high) {
>> +            return VIRTIO_IOMMU_S_OK;
>> +        } else {
>> +            mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
>> +        }
>> +    }
>> +
>> +    if (mapping) {
>> +        qemu_log_mask(LOG_GUEST_ERROR,
>> +                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
>> +                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
>> +                     __func__, interval.low, size,
>> +                     mapping->virt_addr, mapping->size);
>> +    } else {
>> +        return VIRTIO_IOMMU_S_OK;
>> +    }
>> +
>> +    return VIRTIO_IOMMU_S_INVAL;
> 
> Could the above chunk be simplified as something like below?
> 
>   while ((mapping = g_tree_lookup(domain->mappings, &interval))) {
>     g_tree_remove(domain->mappings, mapping);
>   }
Indeed the code could be simplified. I only need to make sure I don't
split an existing mapping.

Also I needed to use g_tree_lookup_extended to retrieve the actual key
to remove. The usage of g_tree_lookup_extended() allows me to remove the
virt_addr and size fields from the mapping value value struct as those
info can be retrieved from the key.

Thanks!

Eric
> 
> Thanks,
>
Peter Xu Sept. 4, 2019, 1:44 a.m. UTC | #3
On Tue, Sep 03, 2019 at 01:37:11PM +0200, Auger Eric wrote:
> Hi Peter,
> 
> On 8/19/19 10:11 AM, Peter Xu wrote:
> > On Tue, Jul 30, 2019 at 07:21:30PM +0200, Eric Auger wrote:
> > 
> > [...]
> > 
> >> +    mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
> >> +
> >> +    while (mapping) {
> >> +        viommu_interval current;
> >> +        uint64_t low  = mapping->virt_addr;
> >> +        uint64_t high = mapping->virt_addr + mapping->size - 1;
> >> +
> >> +        current.low = low;
> >> +        current.high = high;
> >> +
> >> +        if (low == interval.low && size >= mapping->size) {
> >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> >> +            interval.low = high + 1;
> >> +            trace_virtio_iommu_unmap_left_interval(current.low, current.high,
> >> +                interval.low, interval.high);
> >> +        } else if (high == interval.high && size >= mapping->size) {
> >> +            trace_virtio_iommu_unmap_right_interval(current.low, current.high,
> >> +                interval.low, interval.high);
> >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> >> +            interval.high = low - 1;
> >> +        } else if (low > interval.low && high < interval.high) {
> >> +            trace_virtio_iommu_unmap_inc_interval(current.low, current.high);
> >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> >> +        } else {
> >> +            break;
> >> +        }
> >> +        if (interval.low >= interval.high) {
> >> +            return VIRTIO_IOMMU_S_OK;
> >> +        } else {
> >> +            mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
> >> +        }
> >> +    }
> >> +
> >> +    if (mapping) {
> >> +        qemu_log_mask(LOG_GUEST_ERROR,
> >> +                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
> >> +                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
> >> +                     __func__, interval.low, size,
> >> +                     mapping->virt_addr, mapping->size);
> >> +    } else {
> >> +        return VIRTIO_IOMMU_S_OK;
> >> +    }
> >> +
> >> +    return VIRTIO_IOMMU_S_INVAL;
> > 
> > Could the above chunk be simplified as something like below?
> > 
> >   while ((mapping = g_tree_lookup(domain->mappings, &interval))) {
> >     g_tree_remove(domain->mappings, mapping);
> >   }
> Indeed the code could be simplified. I only need to make sure I don't
> split an existing mapping.

Hmm... Do we need to still split an existing mapping if necessary?
For example when with this mapping:

  iova=0x1000, size=0x2000, phys=ADDR1, flags=FLAGS1

And if we want to unmap the range (iova=0, size=0x2000), then we
should split the existing mappping and leave this one:

  iova=0x2000, size=0x1000, phys=(ADDR1+0x1000), flags=FLAGS1

Right?

> 
> Also I needed to use g_tree_lookup_extended to retrieve the actual key
> to remove. The usage of g_tree_lookup_extended() allows me to remove the
> virt_addr and size fields from the mapping value value struct as those
> info can be retrieved from the key.

True.  Thanks,
Tian, Kevin Sept. 4, 2019, 4:23 a.m. UTC | #4
> From: Peter Xu [mailto:peterx@redhat.com]
> Sent: Wednesday, September 4, 2019 9:44 AM
> 
> On Tue, Sep 03, 2019 at 01:37:11PM +0200, Auger Eric wrote:
> > Hi Peter,
> >
> > On 8/19/19 10:11 AM, Peter Xu wrote:
> > > On Tue, Jul 30, 2019 at 07:21:30PM +0200, Eric Auger wrote:
> > >
> > > [...]
> > >
> > >> +    mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
> > >> +
> > >> +    while (mapping) {
> > >> +        viommu_interval current;
> > >> +        uint64_t low  = mapping->virt_addr;
> > >> +        uint64_t high = mapping->virt_addr + mapping->size - 1;
> > >> +
> > >> +        current.low = low;
> > >> +        current.high = high;
> > >> +
> > >> +        if (low == interval.low && size >= mapping->size) {
> > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > >> +            interval.low = high + 1;
> > >> +            trace_virtio_iommu_unmap_left_interval(current.low,
> current.high,
> > >> +                interval.low, interval.high);
> > >> +        } else if (high == interval.high && size >= mapping->size) {
> > >> +            trace_virtio_iommu_unmap_right_interval(current.low,
> current.high,
> > >> +                interval.low, interval.high);
> > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > >> +            interval.high = low - 1;
> > >> +        } else if (low > interval.low && high < interval.high) {
> > >> +            trace_virtio_iommu_unmap_inc_interval(current.low,
> current.high);
> > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > >> +        } else {
> > >> +            break;
> > >> +        }
> > >> +        if (interval.low >= interval.high) {
> > >> +            return VIRTIO_IOMMU_S_OK;
> > >> +        } else {
> > >> +            mapping = g_tree_lookup(domain->mappings,
> (gpointer)(&interval));
> > >> +        }
> > >> +    }
> > >> +
> > >> +    if (mapping) {
> > >> +        qemu_log_mask(LOG_GUEST_ERROR,
> > >> +                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
> > >> +                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
> > >> +                     __func__, interval.low, size,
> > >> +                     mapping->virt_addr, mapping->size);
> > >> +    } else {
> > >> +        return VIRTIO_IOMMU_S_OK;
> > >> +    }
> > >> +
> > >> +    return VIRTIO_IOMMU_S_INVAL;
> > >
> > > Could the above chunk be simplified as something like below?
> > >
> > >   while ((mapping = g_tree_lookup(domain->mappings, &interval))) {
> > >     g_tree_remove(domain->mappings, mapping);
> > >   }
> > Indeed the code could be simplified. I only need to make sure I don't
> > split an existing mapping.
> 
> Hmm... Do we need to still split an existing mapping if necessary?
> For example when with this mapping:
> 
>   iova=0x1000, size=0x2000, phys=ADDR1, flags=FLAGS1
> 
> And if we want to unmap the range (iova=0, size=0x2000), then we
> should split the existing mappping and leave this one:
> 
>   iova=0x2000, size=0x1000, phys=(ADDR1+0x1000), flags=FLAGS1
> 
> Right?
> 

virtio-iommu spec explicitly disallows partial unmap.

5.11.6.6.1 Driver Requirements: UNMAP request

The first address of a range MUST either be the first address of a 
mapping or be outside any mapping. The last address of a range 
MUST either be the last address of a mapping or be outside any 
mapping.

5.11.6.6.2 Device Requirements: UNMAP request

If a mapping affected by the range is not covered in its entirety 
by the range (the UNMAP request would split the mapping), 
then the device SHOULD set the request status to VIRTIO_IOMMU
_S_RANGE, and SHOULD NOT remove any mapping.

Thanks
Kevin
Peter Xu Sept. 4, 2019, 5:37 a.m. UTC | #5
On Wed, Sep 04, 2019 at 04:23:50AM +0000, Tian, Kevin wrote:
> > From: Peter Xu [mailto:peterx@redhat.com]
> > Sent: Wednesday, September 4, 2019 9:44 AM
> > 
> > On Tue, Sep 03, 2019 at 01:37:11PM +0200, Auger Eric wrote:
> > > Hi Peter,
> > >
> > > On 8/19/19 10:11 AM, Peter Xu wrote:
> > > > On Tue, Jul 30, 2019 at 07:21:30PM +0200, Eric Auger wrote:
> > > >
> > > > [...]
> > > >
> > > >> +    mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
> > > >> +
> > > >> +    while (mapping) {
> > > >> +        viommu_interval current;
> > > >> +        uint64_t low  = mapping->virt_addr;
> > > >> +        uint64_t high = mapping->virt_addr + mapping->size - 1;
> > > >> +
> > > >> +        current.low = low;
> > > >> +        current.high = high;
> > > >> +
> > > >> +        if (low == interval.low && size >= mapping->size) {
> > > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > > >> +            interval.low = high + 1;
> > > >> +            trace_virtio_iommu_unmap_left_interval(current.low,
> > current.high,
> > > >> +                interval.low, interval.high);
> > > >> +        } else if (high == interval.high && size >= mapping->size) {
> > > >> +            trace_virtio_iommu_unmap_right_interval(current.low,
> > current.high,
> > > >> +                interval.low, interval.high);
> > > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > > >> +            interval.high = low - 1;
> > > >> +        } else if (low > interval.low && high < interval.high) {
> > > >> +            trace_virtio_iommu_unmap_inc_interval(current.low,
> > current.high);
> > > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > > >> +        } else {
> > > >> +            break;
> > > >> +        }
> > > >> +        if (interval.low >= interval.high) {
> > > >> +            return VIRTIO_IOMMU_S_OK;
> > > >> +        } else {
> > > >> +            mapping = g_tree_lookup(domain->mappings,
> > (gpointer)(&interval));
> > > >> +        }
> > > >> +    }
> > > >> +
> > > >> +    if (mapping) {
> > > >> +        qemu_log_mask(LOG_GUEST_ERROR,
> > > >> +                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
> > > >> +                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
> > > >> +                     __func__, interval.low, size,
> > > >> +                     mapping->virt_addr, mapping->size);
> > > >> +    } else {
> > > >> +        return VIRTIO_IOMMU_S_OK;
> > > >> +    }
> > > >> +
> > > >> +    return VIRTIO_IOMMU_S_INVAL;
> > > >
> > > > Could the above chunk be simplified as something like below?
> > > >
> > > >   while ((mapping = g_tree_lookup(domain->mappings, &interval))) {
> > > >     g_tree_remove(domain->mappings, mapping);
> > > >   }
> > > Indeed the code could be simplified. I only need to make sure I don't
> > > split an existing mapping.
> > 
> > Hmm... Do we need to still split an existing mapping if necessary?
> > For example when with this mapping:
> > 
> >   iova=0x1000, size=0x2000, phys=ADDR1, flags=FLAGS1
> > 
> > And if we want to unmap the range (iova=0, size=0x2000), then we
> > should split the existing mappping and leave this one:
> > 
> >   iova=0x2000, size=0x1000, phys=(ADDR1+0x1000), flags=FLAGS1
> > 
> > Right?
> > 
> 
> virtio-iommu spec explicitly disallows partial unmap.
> 
> 5.11.6.6.1 Driver Requirements: UNMAP request
> 
> The first address of a range MUST either be the first address of a 
> mapping or be outside any mapping. The last address of a range 
> MUST either be the last address of a mapping or be outside any 
> mapping.
> 
> 5.11.6.6.2 Device Requirements: UNMAP request
> 
> If a mapping affected by the range is not covered in its entirety 
> by the range (the UNMAP request would split the mapping), 
> then the device SHOULD set the request status to VIRTIO_IOMMU
> _S_RANGE, and SHOULD NOT remove any mapping.

I see, thanks Kevin.

Though why so strict?  (Sorry if I missed some discussions
... pointers welcomed...)

What I'm thinking is when we want to allocate a bunch of buffers
(e.g., 1M) while we will also need to be able to free them with
smaller chunks (e.g., 4K), then it would be even better that we allow
to allocate a whole 1M buffer within the guest and map it as a whole,
then we can selectively unmap the pages after used.  If with the
strict rule, we'll need to map one by one, that can be a total of
1M/4K roundtrips.

Regards,
Tian, Kevin Sept. 4, 2019, 5:46 a.m. UTC | #6
> From: Peter Xu [mailto:peterx@redhat.com]
> Sent: Wednesday, September 4, 2019 1:37 PM
> 
> On Wed, Sep 04, 2019 at 04:23:50AM +0000, Tian, Kevin wrote:
> > > From: Peter Xu [mailto:peterx@redhat.com]
> > > Sent: Wednesday, September 4, 2019 9:44 AM
> > >
> > > On Tue, Sep 03, 2019 at 01:37:11PM +0200, Auger Eric wrote:
> > > > Hi Peter,
> > > >
> > > > On 8/19/19 10:11 AM, Peter Xu wrote:
> > > > > On Tue, Jul 30, 2019 at 07:21:30PM +0200, Eric Auger wrote:
> > > > >
> > > > > [...]
> > > > >
> > > > >> +    mapping = g_tree_lookup(domain->mappings,
> (gpointer)(&interval));
> > > > >> +
> > > > >> +    while (mapping) {
> > > > >> +        viommu_interval current;
> > > > >> +        uint64_t low  = mapping->virt_addr;
> > > > >> +        uint64_t high = mapping->virt_addr + mapping->size - 1;
> > > > >> +
> > > > >> +        current.low = low;
> > > > >> +        current.high = high;
> > > > >> +
> > > > >> +        if (low == interval.low && size >= mapping->size) {
> > > > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > > > >> +            interval.low = high + 1;
> > > > >> +            trace_virtio_iommu_unmap_left_interval(current.low,
> > > current.high,
> > > > >> +                interval.low, interval.high);
> > > > >> +        } else if (high == interval.high && size >= mapping->size) {
> > > > >> +            trace_virtio_iommu_unmap_right_interval(current.low,
> > > current.high,
> > > > >> +                interval.low, interval.high);
> > > > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > > > >> +            interval.high = low - 1;
> > > > >> +        } else if (low > interval.low && high < interval.high) {
> > > > >> +            trace_virtio_iommu_unmap_inc_interval(current.low,
> > > current.high);
> > > > >> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> > > > >> +        } else {
> > > > >> +            break;
> > > > >> +        }
> > > > >> +        if (interval.low >= interval.high) {
> > > > >> +            return VIRTIO_IOMMU_S_OK;
> > > > >> +        } else {
> > > > >> +            mapping = g_tree_lookup(domain->mappings,
> > > (gpointer)(&interval));
> > > > >> +        }
> > > > >> +    }
> > > > >> +
> > > > >> +    if (mapping) {
> > > > >> +        qemu_log_mask(LOG_GUEST_ERROR,
> > > > >> +                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
> > > > >> +                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
> > > > >> +                     __func__, interval.low, size,
> > > > >> +                     mapping->virt_addr, mapping->size);
> > > > >> +    } else {
> > > > >> +        return VIRTIO_IOMMU_S_OK;
> > > > >> +    }
> > > > >> +
> > > > >> +    return VIRTIO_IOMMU_S_INVAL;
> > > > >
> > > > > Could the above chunk be simplified as something like below?
> > > > >
> > > > >   while ((mapping = g_tree_lookup(domain->mappings, &interval))) {
> > > > >     g_tree_remove(domain->mappings, mapping);
> > > > >   }
> > > > Indeed the code could be simplified. I only need to make sure I don't
> > > > split an existing mapping.
> > >
> > > Hmm... Do we need to still split an existing mapping if necessary?
> > > For example when with this mapping:
> > >
> > >   iova=0x1000, size=0x2000, phys=ADDR1, flags=FLAGS1
> > >
> > > And if we want to unmap the range (iova=0, size=0x2000), then we
> > > should split the existing mappping and leave this one:
> > >
> > >   iova=0x2000, size=0x1000, phys=(ADDR1+0x1000), flags=FLAGS1
> > >
> > > Right?
> > >
> >
> > virtio-iommu spec explicitly disallows partial unmap.
> >
> > 5.11.6.6.1 Driver Requirements: UNMAP request
> >
> > The first address of a range MUST either be the first address of a
> > mapping or be outside any mapping. The last address of a range
> > MUST either be the last address of a mapping or be outside any
> > mapping.
> >
> > 5.11.6.6.2 Device Requirements: UNMAP request
> >
> > If a mapping affected by the range is not covered in its entirety
> > by the range (the UNMAP request would split the mapping),
> > then the device SHOULD set the request status to VIRTIO_IOMMU
> > _S_RANGE, and SHOULD NOT remove any mapping.
> 
> I see, thanks Kevin.
> 
> Though why so strict?  (Sorry if I missed some discussions
> ... pointers welcomed...)
> 
> What I'm thinking is when we want to allocate a bunch of buffers
> (e.g., 1M) while we will also need to be able to free them with
> smaller chunks (e.g., 4K), then it would be even better that we allow
> to allocate a whole 1M buffer within the guest and map it as a whole,
> then we can selectively unmap the pages after used.  If with the
> strict rule, we'll need to map one by one, that can be a total of
> 1M/4K roundtrips.
> 

Sorry I forgot the original discussion. Need Jean to respond. :-)

A possible reason is that no such usage exists today, thus simplification
was made? 

Thanks
Kevin
Eric Auger Sept. 4, 2019, 7:54 a.m. UTC | #7
Hi,

On 9/4/19 7:46 AM, Tian, Kevin wrote:
>> From: Peter Xu [mailto:peterx@redhat.com]
>> Sent: Wednesday, September 4, 2019 1:37 PM
>>
>> On Wed, Sep 04, 2019 at 04:23:50AM +0000, Tian, Kevin wrote:
>>>> From: Peter Xu [mailto:peterx@redhat.com]
>>>> Sent: Wednesday, September 4, 2019 9:44 AM
>>>>
>>>> On Tue, Sep 03, 2019 at 01:37:11PM +0200, Auger Eric wrote:
>>>>> Hi Peter,
>>>>>
>>>>> On 8/19/19 10:11 AM, Peter Xu wrote:
>>>>>> On Tue, Jul 30, 2019 at 07:21:30PM +0200, Eric Auger wrote:
>>>>>>
>>>>>> [...]
>>>>>>
>>>>>>> +    mapping = g_tree_lookup(domain->mappings,
>> (gpointer)(&interval));
>>>>>>> +
>>>>>>> +    while (mapping) {
>>>>>>> +        viommu_interval current;
>>>>>>> +        uint64_t low  = mapping->virt_addr;
>>>>>>> +        uint64_t high = mapping->virt_addr + mapping->size - 1;
>>>>>>> +
>>>>>>> +        current.low = low;
>>>>>>> +        current.high = high;
>>>>>>> +
>>>>>>> +        if (low == interval.low && size >= mapping->size) {
>>>>>>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
>>>>>>> +            interval.low = high + 1;
>>>>>>> +            trace_virtio_iommu_unmap_left_interval(current.low,
>>>> current.high,
>>>>>>> +                interval.low, interval.high);
>>>>>>> +        } else if (high == interval.high && size >= mapping->size) {
>>>>>>> +            trace_virtio_iommu_unmap_right_interval(current.low,
>>>> current.high,
>>>>>>> +                interval.low, interval.high);
>>>>>>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
>>>>>>> +            interval.high = low - 1;
>>>>>>> +        } else if (low > interval.low && high < interval.high) {
>>>>>>> +            trace_virtio_iommu_unmap_inc_interval(current.low,
>>>> current.high);
>>>>>>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
>>>>>>> +        } else {
>>>>>>> +            break;
>>>>>>> +        }
>>>>>>> +        if (interval.low >= interval.high) {
>>>>>>> +            return VIRTIO_IOMMU_S_OK;
>>>>>>> +        } else {
>>>>>>> +            mapping = g_tree_lookup(domain->mappings,
>>>> (gpointer)(&interval));
>>>>>>> +        }
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    if (mapping) {
>>>>>>> +        qemu_log_mask(LOG_GUEST_ERROR,
>>>>>>> +                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
>>>>>>> +                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
>>>>>>> +                     __func__, interval.low, size,
>>>>>>> +                     mapping->virt_addr, mapping->size);
>>>>>>> +    } else {
>>>>>>> +        return VIRTIO_IOMMU_S_OK;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    return VIRTIO_IOMMU_S_INVAL;
>>>>>>
>>>>>> Could the above chunk be simplified as something like below?
>>>>>>
>>>>>>   while ((mapping = g_tree_lookup(domain->mappings, &interval))) {
>>>>>>     g_tree_remove(domain->mappings, mapping);
>>>>>>   }
>>>>> Indeed the code could be simplified. I only need to make sure I don't
>>>>> split an existing mapping.
>>>>
>>>> Hmm... Do we need to still split an existing mapping if necessary?
>>>> For example when with this mapping:
>>>>
>>>>   iova=0x1000, size=0x2000, phys=ADDR1, flags=FLAGS1
>>>>
>>>> And if we want to unmap the range (iova=0, size=0x2000), then we
>>>> should split the existing mappping and leave this one:
>>>>
>>>>   iova=0x2000, size=0x1000, phys=(ADDR1+0x1000), flags=FLAGS1
>>>>
>>>> Right?
>>>>
>>>
>>> virtio-iommu spec explicitly disallows partial unmap.
>>>
>>> 5.11.6.6.1 Driver Requirements: UNMAP request
>>>
>>> The first address of a range MUST either be the first address of a
>>> mapping or be outside any mapping. The last address of a range
>>> MUST either be the last address of a mapping or be outside any
>>> mapping.
>>>
>>> 5.11.6.6.2 Device Requirements: UNMAP request
>>>
>>> If a mapping affected by the range is not covered in its entirety
>>> by the range (the UNMAP request would split the mapping),
>>> then the device SHOULD set the request status to VIRTIO_IOMMU
>>> _S_RANGE, and SHOULD NOT remove any mapping.
>>
>> I see, thanks Kevin.
>>
>> Though why so strict?  (Sorry if I missed some discussions
>> ... pointers welcomed...)
>>
>> What I'm thinking is when we want to allocate a bunch of buffers
>> (e.g., 1M) while we will also need to be able to free them with
>> smaller chunks (e.g., 4K), then it would be even better that we allow
>> to allocate a whole 1M buffer within the guest and map it as a whole,
>> then we can selectively unmap the pages after used.  If with the
>> strict rule, we'll need to map one by one, that can be a total of
>> 1M/4K roundtrips.
>>
> 
> Sorry I forgot the original discussion. Need Jean to respond. :-)
> 
> A possible reason is that no such usage exists today, thus simplification
> was made? 

In
https://virtualization.linux-foundation.narkive.com/q6XOkO76/rfc-0-3-virtio-iommu-a-paravirtualized-iommu

I found

"
(Note: the semantics of unmap are chosen to be compatible with VFIO's
type1 v2 IOMMU API. This way a device serving as intermediary between
guest and VFIO doesn't have to keep an internal tree of mappings. They are
a bit tighter than VFIO, in that they don't allow unmap spilling outside
mapped regions. Spilling is 'undefined' at the moment, because it should
work in most cases but I don't know if it's worth the added complexity in
devices that are not simply transmitting requests to VFIO. Splitting
mappings won't ever be allowed, but see the relaxed proposal in 3/3 for
more lenient semantics)
"

Thanks

Eric
> 
> Thanks
> Kevin
>
Peter Xu Sept. 4, 2019, 8:32 a.m. UTC | #8
On Wed, Sep 04, 2019 at 09:54:12AM +0200, Auger Eric wrote:
> Hi,
> 
> On 9/4/19 7:46 AM, Tian, Kevin wrote:
> >> From: Peter Xu [mailto:peterx@redhat.com]
> >> Sent: Wednesday, September 4, 2019 1:37 PM
> >>
> >> On Wed, Sep 04, 2019 at 04:23:50AM +0000, Tian, Kevin wrote:
> >>>> From: Peter Xu [mailto:peterx@redhat.com]
> >>>> Sent: Wednesday, September 4, 2019 9:44 AM
> >>>>
> >>>> On Tue, Sep 03, 2019 at 01:37:11PM +0200, Auger Eric wrote:
> >>>>> Hi Peter,
> >>>>>
> >>>>> On 8/19/19 10:11 AM, Peter Xu wrote:
> >>>>>> On Tue, Jul 30, 2019 at 07:21:30PM +0200, Eric Auger wrote:
> >>>>>>
> >>>>>> [...]
> >>>>>>
> >>>>>>> +    mapping = g_tree_lookup(domain->mappings,
> >> (gpointer)(&interval));
> >>>>>>> +
> >>>>>>> +    while (mapping) {
> >>>>>>> +        viommu_interval current;
> >>>>>>> +        uint64_t low  = mapping->virt_addr;
> >>>>>>> +        uint64_t high = mapping->virt_addr + mapping->size - 1;
> >>>>>>> +
> >>>>>>> +        current.low = low;
> >>>>>>> +        current.high = high;
> >>>>>>> +
> >>>>>>> +        if (low == interval.low && size >= mapping->size) {
> >>>>>>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> >>>>>>> +            interval.low = high + 1;
> >>>>>>> +            trace_virtio_iommu_unmap_left_interval(current.low,
> >>>> current.high,
> >>>>>>> +                interval.low, interval.high);
> >>>>>>> +        } else if (high == interval.high && size >= mapping->size) {
> >>>>>>> +            trace_virtio_iommu_unmap_right_interval(current.low,
> >>>> current.high,
> >>>>>>> +                interval.low, interval.high);
> >>>>>>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> >>>>>>> +            interval.high = low - 1;
> >>>>>>> +        } else if (low > interval.low && high < interval.high) {
> >>>>>>> +            trace_virtio_iommu_unmap_inc_interval(current.low,
> >>>> current.high);
> >>>>>>> +            g_tree_remove(domain->mappings, (gpointer)(&current));
> >>>>>>> +        } else {
> >>>>>>> +            break;
> >>>>>>> +        }
> >>>>>>> +        if (interval.low >= interval.high) {
> >>>>>>> +            return VIRTIO_IOMMU_S_OK;
> >>>>>>> +        } else {
> >>>>>>> +            mapping = g_tree_lookup(domain->mappings,
> >>>> (gpointer)(&interval));
> >>>>>>> +        }
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    if (mapping) {
> >>>>>>> +        qemu_log_mask(LOG_GUEST_ERROR,
> >>>>>>> +                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
> >>>>>>> +                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
> >>>>>>> +                     __func__, interval.low, size,
> >>>>>>> +                     mapping->virt_addr, mapping->size);
> >>>>>>> +    } else {
> >>>>>>> +        return VIRTIO_IOMMU_S_OK;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    return VIRTIO_IOMMU_S_INVAL;
> >>>>>>
> >>>>>> Could the above chunk be simplified as something like below?
> >>>>>>
> >>>>>>   while ((mapping = g_tree_lookup(domain->mappings, &interval))) {
> >>>>>>     g_tree_remove(domain->mappings, mapping);
> >>>>>>   }
> >>>>> Indeed the code could be simplified. I only need to make sure I don't
> >>>>> split an existing mapping.
> >>>>
> >>>> Hmm... Do we need to still split an existing mapping if necessary?
> >>>> For example when with this mapping:
> >>>>
> >>>>   iova=0x1000, size=0x2000, phys=ADDR1, flags=FLAGS1
> >>>>
> >>>> And if we want to unmap the range (iova=0, size=0x2000), then we
> >>>> should split the existing mappping and leave this one:
> >>>>
> >>>>   iova=0x2000, size=0x1000, phys=(ADDR1+0x1000), flags=FLAGS1
> >>>>
> >>>> Right?
> >>>>
> >>>
> >>> virtio-iommu spec explicitly disallows partial unmap.
> >>>
> >>> 5.11.6.6.1 Driver Requirements: UNMAP request
> >>>
> >>> The first address of a range MUST either be the first address of a
> >>> mapping or be outside any mapping. The last address of a range
> >>> MUST either be the last address of a mapping or be outside any
> >>> mapping.
> >>>
> >>> 5.11.6.6.2 Device Requirements: UNMAP request
> >>>
> >>> If a mapping affected by the range is not covered in its entirety
> >>> by the range (the UNMAP request would split the mapping),
> >>> then the device SHOULD set the request status to VIRTIO_IOMMU
> >>> _S_RANGE, and SHOULD NOT remove any mapping.
> >>
> >> I see, thanks Kevin.
> >>
> >> Though why so strict?  (Sorry if I missed some discussions
> >> ... pointers welcomed...)
> >>
> >> What I'm thinking is when we want to allocate a bunch of buffers
> >> (e.g., 1M) while we will also need to be able to free them with
> >> smaller chunks (e.g., 4K), then it would be even better that we allow
> >> to allocate a whole 1M buffer within the guest and map it as a whole,
> >> then we can selectively unmap the pages after used.  If with the
> >> strict rule, we'll need to map one by one, that can be a total of
> >> 1M/4K roundtrips.
> >>
> > 
> > Sorry I forgot the original discussion. Need Jean to respond. :-)
> > 
> > A possible reason is that no such usage exists today, thus simplification
> > was made? 
> 
> In
> https://virtualization.linux-foundation.narkive.com/q6XOkO76/rfc-0-3-virtio-iommu-a-paravirtualized-iommu
> 
> I found
> 
> "
> (Note: the semantics of unmap are chosen to be compatible with VFIO's
> type1 v2 IOMMU API. This way a device serving as intermediary between
> guest and VFIO doesn't have to keep an internal tree of mappings. They are
> a bit tighter than VFIO, in that they don't allow unmap spilling outside
> mapped regions. Spilling is 'undefined' at the moment, because it should
> work in most cases but I don't know if it's worth the added complexity in
> devices that are not simply transmitting requests to VFIO. Splitting
> mappings won't ever be allowed, but see the relaxed proposal in 3/3 for
> more lenient semantics)
> "

Yes it makes sense to follow vfio type1v2 here.  Though I'm not sure
whether the maintainance of "an internal tree of mappings" could be
avoided by this, at least if using current QEMU IOMMU notifier
framework.  The problem is currently the IOMMU notifiers cannot fail
(e.g., the VFIO_IOMMU_MAP_DMA ioctl from vfio-pci device will assume
the messages delivered from vIOMMUs are always valid), so AFAICT the
vIOMMU needs to tell which mapping request from the guest driver is
valid before delivering the request to vfio.  It seems impossible to
do that if without the internal tree of mapping.  But that seems to be
another story.

In all cases, I think I'm fine with the approach for this patch.

Thanks!

Patch
diff mbox series

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index a373bdebb3..25a71b0505 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -71,3 +71,6 @@  virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
 virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
 virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
 virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
+virtio_iommu_unmap_left_interval(uint64_t low, uint64_t high, uint64_t next_low, uint64_t next_high) "Unmap left [0x%"PRIx64",0x%"PRIx64"], new interval=[0x%"PRIx64",0x%"PRIx64"]"
+virtio_iommu_unmap_right_interval(uint64_t low, uint64_t high, uint64_t next_low, uint64_t next_high) "Unmap right [0x%"PRIx64",0x%"PRIx64"], new interval=[0x%"PRIx64",0x%"PRIx64"]"
+virtio_iommu_unmap_inc_interval(uint64_t low, uint64_t high) "Unmap inc [0x%"PRIx64",0x%"PRIx64"]"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 5ea0930cc2..4706b9da6e 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -18,6 +18,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "qemu/iov.h"
 #include "qemu-common.h"
 #include "hw/virtio/virtio.h"
@@ -55,6 +56,13 @@  typedef struct viommu_interval {
     uint64_t high;
 } viommu_interval;
 
+typedef struct viommu_mapping {
+    uint64_t virt_addr;
+    uint64_t phys_addr;
+    uint64_t size;
+    uint32_t flags;
+} viommu_mapping;
+
 static inline uint16_t virtio_iommu_get_sid(IOMMUDevice *dev)
 {
     return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
@@ -240,10 +248,37 @@  static int virtio_iommu_map(VirtIOIOMMU *s,
     uint64_t virt_start = le64_to_cpu(req->virt_start);
     uint64_t virt_end = le64_to_cpu(req->virt_end);
     uint32_t flags = le32_to_cpu(req->flags);
+    viommu_domain *domain;
+    viommu_interval *interval;
+    viommu_mapping *mapping;
+
+    interval = g_malloc0(sizeof(*interval));
+
+    interval->low = virt_start;
+    interval->high = virt_end;
+
+    domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+    if (!domain) {
+        return VIRTIO_IOMMU_S_NOENT;
+    }
+
+    mapping = g_tree_lookup(domain->mappings, (gpointer)interval);
+    if (mapping) {
+        g_free(interval);
+        return VIRTIO_IOMMU_S_INVAL;
+    }
 
     trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
 
-    return VIRTIO_IOMMU_S_UNSUPP;
+    mapping = g_malloc0(sizeof(*mapping));
+    mapping->virt_addr = virt_start;
+    mapping->phys_addr = phys_start;
+    mapping->size = virt_end - virt_start + 1;
+    mapping->flags = flags;
+
+    g_tree_insert(domain->mappings, interval, mapping);
+
+    return VIRTIO_IOMMU_S_OK;
 }
 
 static int virtio_iommu_unmap(VirtIOIOMMU *s,
@@ -252,10 +287,65 @@  static int virtio_iommu_unmap(VirtIOIOMMU *s,
     uint32_t domain_id = le32_to_cpu(req->domain);
     uint64_t virt_start = le64_to_cpu(req->virt_start);
     uint64_t virt_end = le64_to_cpu(req->virt_end);
+    uint64_t size = virt_end - virt_start + 1;
+    viommu_mapping *mapping;
+    viommu_interval interval;
+    viommu_domain *domain;
 
     trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
 
-    return VIRTIO_IOMMU_S_UNSUPP;
+    domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+    if (!domain) {
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: no domain\n", __func__);
+        return VIRTIO_IOMMU_S_NOENT;
+    }
+    interval.low = virt_start;
+    interval.high = virt_end;
+
+    mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
+
+    while (mapping) {
+        viommu_interval current;
+        uint64_t low  = mapping->virt_addr;
+        uint64_t high = mapping->virt_addr + mapping->size - 1;
+
+        current.low = low;
+        current.high = high;
+
+        if (low == interval.low && size >= mapping->size) {
+            g_tree_remove(domain->mappings, (gpointer)(&current));
+            interval.low = high + 1;
+            trace_virtio_iommu_unmap_left_interval(current.low, current.high,
+                interval.low, interval.high);
+        } else if (high == interval.high && size >= mapping->size) {
+            trace_virtio_iommu_unmap_right_interval(current.low, current.high,
+                interval.low, interval.high);
+            g_tree_remove(domain->mappings, (gpointer)(&current));
+            interval.high = low - 1;
+        } else if (low > interval.low && high < interval.high) {
+            trace_virtio_iommu_unmap_inc_interval(current.low, current.high);
+            g_tree_remove(domain->mappings, (gpointer)(&current));
+        } else {
+            break;
+        }
+        if (interval.low >= interval.high) {
+            return VIRTIO_IOMMU_S_OK;
+        } else {
+            mapping = g_tree_lookup(domain->mappings, (gpointer)(&interval));
+        }
+    }
+
+    if (mapping) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "****** %s: Unmap 0x%"PRIx64" size=0x%"PRIx64
+                     " from 0x%"PRIx64" size=0x%"PRIx64" is not supported\n",
+                     __func__, interval.low, size,
+                     mapping->virt_addr, mapping->size);
+    } else {
+        return VIRTIO_IOMMU_S_OK;
+    }
+
+    return VIRTIO_IOMMU_S_INVAL;
 }
 
 static int virtio_iommu_iov_to_req(struct iovec *iov,