diff mbox series

[v2,2/2] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions

Message ID 20200929085550.30926-3-eric.auger@redhat.com
State New
Headers show
Series NVMe passthrough: Take into account host IOVA reserved regions | expand

Commit Message

Eric Auger Sept. 29, 2020, 8:55 a.m. UTC
Introduce the qemu_vfio_find_fixed/temp_iova helpers which
respectively allocate IOVAs from the bottom/top parts of the
usable IOVA range, without picking within host IOVA reserved
windows. The allocation remains basic: if the size is too big
for the remaining of the current usable IOVA range, we jump
to the next one, leaving a hole in the address map.

Signed-off-by: Eric Auger <eric.auger@redhat.com>

---

v1 -> v2:
- handle possible wrap
---
 util/vfio-helpers.c | 57 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 4 deletions(-)

Comments

Stefan Hajnoczi Sept. 29, 2020, 3:59 p.m. UTC | #1
On Tue, Sep 29, 2020 at 10:55:50AM +0200, Eric Auger wrote:
> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> index ba0ee6e21c..71145970f3 100644
> --- a/util/vfio-helpers.c
> +++ b/util/vfio-helpers.c
> @@ -667,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
>      return true;
>  }
>  
> +static int
> +qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
> +{
> +    int i;
> +
> +    for (i = 0; i < s->nb_iova_ranges; i++) {
> +        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
> +            continue;
> +        }
> +        s->low_water_mark =
> +            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
> +
> +        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
> +            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {

I don't understand the == 0 case. It seems like we are allocating an
IOVA beyond usable_iova_ranges[i].end?
Eric Auger Sept. 29, 2020, 7:44 p.m. UTC | #2
Hi Stefan,

On 9/29/20 5:59 PM, Stefan Hajnoczi wrote:
> On Tue, Sep 29, 2020 at 10:55:50AM +0200, Eric Auger wrote:
>> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
>> index ba0ee6e21c..71145970f3 100644
>> --- a/util/vfio-helpers.c
>> +++ b/util/vfio-helpers.c
>> @@ -667,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
>>      return true;
>>  }
>>  
>> +static int
>> +qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < s->nb_iova_ranges; i++) {
>> +        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
>> +            continue;
>> +        }
>> +        s->low_water_mark =
>> +            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
>> +
>> +        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
>> +            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
> 
> I don't understand the == 0 case. It seems like we are allocating an
> IOVA beyond usable_iova_ranges[i].end?>
It is meant to handle the case were low_water_mark = 0 and
s->usable_iova_ranges[0].end = ULLONG_MAX (I know it cannot exist at the
moment but may happen in the future) where we get an overflow. Given the
        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
            continue;
        }
I think this prevents us from allocating beyond
usable_iova_ranges[i].end or do I miss something?

Thanks

Eric
Stefan Hajnoczi Sept. 30, 2020, 9:18 a.m. UTC | #3
On Tue, Sep 29, 2020 at 09:44:48PM +0200, Auger Eric wrote:
> Hi Stefan,
> 
> On 9/29/20 5:59 PM, Stefan Hajnoczi wrote:
> > On Tue, Sep 29, 2020 at 10:55:50AM +0200, Eric Auger wrote:
> >> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> >> index ba0ee6e21c..71145970f3 100644
> >> --- a/util/vfio-helpers.c
> >> +++ b/util/vfio-helpers.c
> >> @@ -667,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
> >>      return true;
> >>  }
> >>  
> >> +static int
> >> +qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
> >> +{
> >> +    int i;
> >> +
> >> +    for (i = 0; i < s->nb_iova_ranges; i++) {
> >> +        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
> >> +            continue;
> >> +        }
> >> +        s->low_water_mark =
> >> +            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
> >> +
> >> +        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
> >> +            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
> > 
> > I don't understand the == 0 case. It seems like we are allocating an
> > IOVA beyond usable_iova_ranges[i].end?>
> It is meant to handle the case were low_water_mark = 0 and
> s->usable_iova_ranges[0].end = ULLONG_MAX (I know it cannot exist at the
> moment but may happen in the future) where we get an overflow. Given the
>         if (s->usable_iova_ranges[i].end < s->low_water_mark) {
>             continue;
>         }
> I think this prevents us from allocating beyond
> usable_iova_ranges[i].end or do I miss something?

Yes, you are right. Here are the constraints:

e >= l
j = max(l, s)
e - j + 1 < s
e - j + 1 == 0

Assume l >= s so we can replace j with l:

e >= l
e - l + 1 < s
e - l + 1 == 0

The case I'm worried about is when the iova range cannot fit s bytes.
The last condition is only true when e = l - 1, but this violates the
first condition e >= l.

So the problem scenario cannot occur.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
diff mbox series

Patch

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index ba0ee6e21c..71145970f3 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -667,6 +667,50 @@  static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
     return true;
 }
 
+static int
+qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+    int i;
+
+    for (i = 0; i < s->nb_iova_ranges; i++) {
+        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
+            continue;
+        }
+        s->low_water_mark =
+            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
+
+        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
+            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
+            *iova = s->low_water_mark;
+            s->low_water_mark += size;
+            return 0;
+        }
+    }
+    return -ENOMEM;
+}
+
+static int
+qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+    int i;
+
+    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
+        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
+            continue;
+        }
+        s->high_water_mark =
+            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
+
+        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
+            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
+            *iova = s->high_water_mark - size;
+            s->high_water_mark = *iova;
+            return 0;
+        }
+    }
+    return -ENOMEM;
+}
+
 /* Map [host, host + size) area into a contiguous IOVA address space, and store
  * the result in @iova if not NULL. The caller need to make sure the area is
  * aligned to page size, and mustn't overlap with existing mapping areas (split
@@ -693,7 +737,11 @@  int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
             goto out;
         }
         if (!temporary) {
-            iova0 = s->low_water_mark;
+            if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
+                ret = -ENOMEM;
+                goto out;
+            }
+
             mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
             if (!mapping) {
                 ret = -ENOMEM;
@@ -705,15 +753,16 @@  int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                 qemu_vfio_undo_mapping(s, mapping, NULL);
                 goto out;
             }
-            s->low_water_mark += size;
             qemu_vfio_dump_mappings(s);
         } else {
-            iova0 = s->high_water_mark - size;
+            if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
+                ret = -ENOMEM;
+                goto out;
+            }
             ret = qemu_vfio_do_mapping(s, host, size, iova0);
             if (ret) {
                 goto out;
             }
-            s->high_water_mark -= size;
         }
     }
     if (iova) {