
[v2,for-xen-4.5] xen_disk: fix unmapping of persistent grants

Message ID 1415890703-10147-1-git-send-email-roger.pau@citrix.com
State New

Commit Message

Roger Pau Monné Nov. 13, 2014, 2:58 p.m. UTC
This patch fixes two issues with persistent grants and the disk PV backend
(Qdisk):

 - Keep track of memory regions where persistent grants have been mapped
   since we need to unmap them as a whole. It is not possible to unmap a
   single grant if it has been batch-mapped.
 - Unmap persistent grants before switching to the closed state, so the
   frontend can also free them.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reported-by: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
Xen release exception: this is obviously an important bug-fix that should be
included in 4.5. Without this fix, trying to do a block-detach of a Qdisk
from a running guest can lead to a guest crash, depending on the blkfront
implementation.
---
Changes since v1:
 - Instead of disabling batch mappings when using persistent grants, keep
   track of the persistently mapped regions and unmap them on disconnection.
   This means single persistent grants can no longer be unmapped
   individually, but the current implementation doesn't require that.

This new version is slightly faster than the previous one; the following
test results are in IOPS from 20 runs of a block-attach, fio run,
block-detach test loop:

x batch
+ no_batch
+----------------------------------------------------------------------+
|                                      +   +     x    x + + +  x xx x  |
|+  +           x                     x+  *+++   *  +x* +++x*  x xx x *|
|                                          |_____________A_____M______||
|                            |________________A_____M___________|      |
+----------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20         52941         63822         62396      61180.15     2673.6497
+  20         49967         63849         60322       59116.9     3456.3455
Difference at 95.0% confidence
	-2063.25 +/- 1977.66
	-3.37242% +/- 3.23252%
	(Student's t, pooled s = 3089.88)
---
 hw/block/xen_disk.c | 77 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 64 insertions(+), 13 deletions(-)

Comments

Stefano Stabellini Nov. 13, 2014, 3:36 p.m. UTC | #1
On Thu, 13 Nov 2014, Roger Pau Monne wrote:
> This patch fixes two issues with persistent grants and the disk PV backend
> (Qdisk):
> 
>  - Keep track of memory regions where persistent grants have been mapped
>    since we need to unmap them as a whole. It is not possible to unmap a
>    single grant if it has been batch-mapped.
>  - Unmap persistent grants before switching to the closed state, so the
>    frontend can also free them.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Reported-by: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
> Xen release exception: this is obviously an important bug-fix that should be
> included in 4.5. Without this fix, trying to do a block-detach of a Qdisk
> from a running guest can lead to a guest crash, depending on the blkfront
> implementation.
> ---
> Changes since v1:
>  - Instead of disabling batch mappings when using persistent grants, keep
>    track of the persistently mapped regions and unmap them on disconnection.
>    This means single persistent grants can no longer be unmapped
>    individually, but the current implementation doesn't require that.
> 
> This new version is slightly faster than the previous one; the following
> test results are in IOPS from 20 runs of a block-attach, fio run,
> block-detach test loop:
> 
> x batch
> + no_batch
> +----------------------------------------------------------------------+
> |                                      +   +     x    x + + +  x xx x  |
> |+  +           x                     x+  *+++   *  +x* +++x*  x xx x *|
> |                                          |_____________A_____M______||
> |                            |________________A_____M___________|      |
> +----------------------------------------------------------------------+
>     N           Min           Max        Median           Avg        Stddev
> x  20         52941         63822         62396      61180.15     2673.6497
> +  20         49967         63849         60322       59116.9     3456.3455
> Difference at 95.0% confidence
> 	-2063.25 +/- 1977.66
> 	-3.37242% +/- 3.23252%
> 	(Student's t, pooled s = 3089.88)

Not too bad! Thanks!


>  hw/block/xen_disk.c | 77 ++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 64 insertions(+), 13 deletions(-)
> 
> diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
> index 231e9a7..75bdc16 100644
> --- a/hw/block/xen_disk.c
> +++ b/hw/block/xen_disk.c
> @@ -59,6 +59,13 @@ struct PersistentGrant {
>  
>  typedef struct PersistentGrant PersistentGrant;
>  
> +struct PersistentRegion {
> +    void *addr;
> +    int num;
> +};
> +
> +typedef struct PersistentRegion PersistentRegion;
> +
>  struct ioreq {
>      blkif_request_t     req;
>      int16_t             status;
> @@ -118,6 +125,7 @@ struct XenBlkDev {
>      gboolean            feature_discard;
>      gboolean            feature_persistent;
>      GTree               *persistent_gnts;
> +    GSList              *persistent_regions;
>      unsigned int        persistent_gnt_count;
>      unsigned int        max_grants;
>  
> @@ -164,19 +172,41 @@ static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
>  static void destroy_grant(gpointer pgnt)
>  {
>      PersistentGrant *grant = pgnt;
> -    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
>  
> -    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
> -        xen_be_printf(&grant->blkdev->xendev, 0,
> -                      "xc_gnttab_munmap failed: %s\n",
> -                      strerror(errno));
> -    }
>      grant->blkdev->persistent_gnt_count--;
>      xen_be_printf(&grant->blkdev->xendev, 3,
> -                  "unmapped grant %p\n", grant->page);
> +                  "removed grant %p\n", grant->page);
>      g_free(grant);
>  }
>  
> +static void add_persistent_region(struct XenBlkDev *blkdev, void *addr, int num)
> +{
> +    PersistentRegion *region;
> +
> +    region = g_malloc0(sizeof(*region));
> +    region->addr = addr;
> +    region->num = num;
> +    blkdev->persistent_regions = g_slist_append(blkdev->persistent_regions,
> +                                                region);
> +}
> +
> +static void remove_persistent_region(gpointer data, gpointer dev)
> +{
> +    PersistentRegion *region = data;
> +    struct XenBlkDev *blkdev = dev;
> +    XenGnttab gnt = blkdev->xendev.gnttabdev;
> +
> +    if (xc_gnttab_munmap(gnt, region->addr, region->num) != 0) {
> +        xen_be_printf(&blkdev->xendev, 0,
> +                      "xc_gnttab_munmap region %p failed: %s\n",
> +                      region->addr, strerror(errno));
> +    }
> +    xen_be_printf(&blkdev->xendev, 3,
> +                  "unmapped grant region %p with %d pages\n",
> +                  region->addr, region->num);
> +    g_free(region);
> +}
> +
>  static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
>  {
>      struct ioreq *ioreq = NULL;
> @@ -421,7 +451,17 @@ static int ioreq_map(struct ioreq *ioreq)
>              }
>          }
>      }
> -    if (ioreq->blkdev->feature_persistent) {
> +    if (ioreq->blkdev->feature_persistent && new_maps &&
> +        (!batch_maps || (ioreq->blkdev->persistent_gnt_count + new_maps <=
> +        ioreq->blkdev->max_grants))) {

Do we really need this change? If so, could you please write something
about it in the commit message?


> +        /*
> +         * If we are using persistent grants and batch mappings only
> +         * add the new maps to the list of persistent grants if the whole
> +         * area can be persistently mapped.
> +         */

Is this the reason for the change above?


> +        if (batch_maps && new_maps) {
> +            add_persistent_region(ioreq->blkdev, ioreq->pages, new_maps);
> +        }
>
>          while ((ioreq->blkdev->persistent_gnt_count < ioreq->blkdev->max_grants)
>                && new_maps) {
>              /* Go through the list of newly mapped grants and add as many
> @@ -437,6 +477,7 @@ static int ioreq_map(struct ioreq *ioreq)
>                  grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
>              } else {
>                  grant->page = ioreq->page[new_maps];
> +                add_persistent_region(ioreq->blkdev, ioreq->page[new_maps], 1);

Basically in the !batch_maps case we are duplicating persistent_gnts
into persistent_regions. Couldn't we just avoid calling
add_persistent_region at all in that case and rely on the old
destroy_grant function? In fact I think we could just pass NULL as the
GDestroyNotify function when creating persistent_gnts if batch_maps, and
the old unmodified destroy_grant if !batch_maps.
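
Something along these lines in blk_connect() (rough, untested sketch,
reusing the existing int_cmp/destroy_grant helpers and the batch_maps
variable already in xen_disk.c):

    /* batch_maps: the regions list would own the unmapping, so the tree
     * gets no destroy notifier; !batch_maps: keep the old destroy_grant(),
     * which still munmaps each grant individually. */
    blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
                                              NULL, NULL,
                                              batch_maps ? NULL :
                                              (GDestroyNotify)destroy_grant);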

>              }
>              grant->blkdev = ioreq->blkdev;
>              xen_be_printf(&ioreq->blkdev->xendev, 3,
> @@ -447,6 +488,7 @@ static int ioreq_map(struct ioreq *ioreq)
>                            grant);
>              ioreq->blkdev->persistent_gnt_count++;
>          }
> +        assert(!batch_maps || new_maps == 0);

Shouldn't new_maps be == 0 even in the !batch_maps case?


>      }
>      for (i = 0; i < ioreq->v.niov; i++) {
>          ioreq->v.iov[i].iov_base += (uintptr_t)page[i];
> @@ -972,6 +1014,7 @@ static int blk_connect(struct XenDevice *xendev)
>          blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
>                                               NULL, NULL,
>                                               (GDestroyNotify)destroy_grant);
> +        blkdev->persistent_regions = NULL;
>          blkdev->persistent_gnt_count = 0;
>      }
>  
> @@ -1000,6 +1043,19 @@ static void blk_disconnect(struct XenDevice *xendev)
>          blkdev->cnt_map--;
>          blkdev->sring = NULL;
>      }
> +
> +    /*
> +     * Unmap persistent grants before switching to the closed state
> +     * so the frontend can free them.
> +     */
> +    if (blkdev->feature_persistent) {
> +        g_tree_destroy(blkdev->persistent_gnts);
> +        assert(blkdev->persistent_gnt_count == 0);
> +        g_slist_foreach(blkdev->persistent_regions,
> +                        (GFunc)remove_persistent_region, blkdev);
> +        g_slist_free(blkdev->persistent_regions);
> +        blkdev->feature_persistent = false;
> +    }
>  }
>  
>  static int blk_free(struct XenDevice *xendev)
> @@ -1011,11 +1067,6 @@ static int blk_free(struct XenDevice *xendev)
>          blk_disconnect(xendev);
>      }
>  
> -    /* Free persistent grants */
> -    if (blkdev->feature_persistent) {
> -        g_tree_destroy(blkdev->persistent_gnts);
> -    }
> -
>      while (!QLIST_EMPTY(&blkdev->freelist)) {
>          ioreq = QLIST_FIRST(&blkdev->freelist);
>          QLIST_REMOVE(ioreq, list);
> -- 
> 1.9.3 (Apple Git-50)
>
Roger Pau Monné Nov. 13, 2014, 4:19 p.m. UTC | #2
On 13/11/14 at 16:36, Stefano Stabellini wrote:
> On Thu, 13 Nov 2014, Roger Pau Monne wrote:
>> @@ -421,7 +451,17 @@ static int ioreq_map(struct ioreq *ioreq)
>>              }
>>          }
>>      }
>> -    if (ioreq->blkdev->feature_persistent) {
>> +    if (ioreq->blkdev->feature_persistent && new_maps &&
>> +        (!batch_maps || (ioreq->blkdev->persistent_gnt_count + new_maps <=
>> +        ioreq->blkdev->max_grants))) {
> 
> Do we really need this change? If so, could you please write something
> about it in the commit message?

Ack.

> 
>> +        /*
>> +         * If we are using persistent grants and batch mappings only
>> +         * add the new maps to the list of persistent grants if the whole
>> +         * area can be persistently mapped.
>> +         */
> 
> Is this the reason for the change above?

Yes, this is the reason; maybe the comment isn't clear enough. If we are
using batch maps we have to map the whole region persistently; we cannot
cherry-pick single grants and make them persistent.
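
To put it in terms of the calls we already make in ioreq_map() (just a
sketch; domids/refs are filled from the request as in the existing code):

    /* batch maps: one contiguous area backs all new_maps grants, so it
     * can only be unmapped as a whole */
    ioreq->pages = xc_gnttab_map_grant_refs(gnt, new_maps, domids, refs,
                                            ioreq->prot);
    xc_gnttab_munmap(gnt, ioreq->pages, new_maps);

    /* no batch maps: one mapping per grant, so each page can be kept
     * persistent or unmapped on its own */
    ioreq->page[i] = xc_gnttab_map_grant_ref(gnt, domids[i], refs[i],
                                             ioreq->prot);
    xc_gnttab_munmap(gnt, ioreq->page[i], 1);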

> 
>> +        if (batch_maps && new_maps) {
>> +            add_persistent_region(ioreq->blkdev, ioreq->pages, new_maps);
>> +        }
>>
>>          while ((ioreq->blkdev->persistent_gnt_count < ioreq->blkdev->max_grants)
>>                && new_maps) {
>>              /* Go through the list of newly mapped grants and add as many
>> @@ -437,6 +477,7 @@ static int ioreq_map(struct ioreq *ioreq)
>>                  grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
>>              } else {
>>                  grant->page = ioreq->page[new_maps];
>> +                add_persistent_region(ioreq->blkdev, ioreq->page[new_maps], 1);
> 
> Basically in the !batch_maps case we are duplicating persistent_gnts
> into persistent_regions. Couldn't we just avoid calling
> add_persistent_region at all in that case and rely on the old
> destroy_grant function? In fact I think we could just pass NULL as the
> GDestroyNotify function when creating persistent_gnts if batch_maps, and
> the old unmodified destroy_grant if !batch_maps.

Not exactly: we still need the GDestroyNotify function in the batch_maps
case, but I could just pass g_free directly. Let me send a v3 with this
reworked.
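
Something like this (untested sketch of the idea, not the actual v3):

    blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
                                              NULL, NULL,
                                              batch_maps ?
                                              (GDestroyNotify)g_free :
                                              (GDestroyNotify)destroy_grant);
    /* batch_maps: g_free only releases the PersistentGrant structs, the
     * real unmapping is done by remove_persistent_region() on disconnect;
     * !batch_maps: destroy_grant() keeps unmapping each grant itself. */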

> 
>>              }
>>              grant->blkdev = ioreq->blkdev;
>>              xen_be_printf(&ioreq->blkdev->xendev, 3,
>> @@ -447,6 +488,7 @@ static int ioreq_map(struct ioreq *ioreq)
>>                            grant);
>>              ioreq->blkdev->persistent_gnt_count++;
>>          }
>> +        assert(!batch_maps || new_maps == 0);
> 
> Shouldn't new_maps be == 0 even in the !batch_maps case?

In theory yes, because we can persistently map all the grants that can
be on the ring. But a malicious/badly coded blkfront could try to
exploit this by always passing different grants, in which case we would
not be able to map them all persistently.

Roger.

Patch

diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 231e9a7..75bdc16 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -59,6 +59,13 @@  struct PersistentGrant {
 
 typedef struct PersistentGrant PersistentGrant;
 
+struct PersistentRegion {
+    void *addr;
+    int num;
+};
+
+typedef struct PersistentRegion PersistentRegion;
+
 struct ioreq {
     blkif_request_t     req;
     int16_t             status;
@@ -118,6 +125,7 @@  struct XenBlkDev {
     gboolean            feature_discard;
     gboolean            feature_persistent;
     GTree               *persistent_gnts;
+    GSList              *persistent_regions;
     unsigned int        persistent_gnt_count;
     unsigned int        max_grants;
 
@@ -164,19 +172,41 @@  static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
 static void destroy_grant(gpointer pgnt)
 {
     PersistentGrant *grant = pgnt;
-    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
 
-    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
-        xen_be_printf(&grant->blkdev->xendev, 0,
-                      "xc_gnttab_munmap failed: %s\n",
-                      strerror(errno));
-    }
     grant->blkdev->persistent_gnt_count--;
     xen_be_printf(&grant->blkdev->xendev, 3,
-                  "unmapped grant %p\n", grant->page);
+                  "removed grant %p\n", grant->page);
     g_free(grant);
 }
 
+static void add_persistent_region(struct XenBlkDev *blkdev, void *addr, int num)
+{
+    PersistentRegion *region;
+
+    region = g_malloc0(sizeof(*region));
+    region->addr = addr;
+    region->num = num;
+    blkdev->persistent_regions = g_slist_append(blkdev->persistent_regions,
+                                                region);
+}
+
+static void remove_persistent_region(gpointer data, gpointer dev)
+{
+    PersistentRegion *region = data;
+    struct XenBlkDev *blkdev = dev;
+    XenGnttab gnt = blkdev->xendev.gnttabdev;
+
+    if (xc_gnttab_munmap(gnt, region->addr, region->num) != 0) {
+        xen_be_printf(&blkdev->xendev, 0,
+                      "xc_gnttab_munmap region %p failed: %s\n",
+                      region->addr, strerror(errno));
+    }
+    xen_be_printf(&blkdev->xendev, 3,
+                  "unmapped grant region %p with %d pages\n",
+                  region->addr, region->num);
+    g_free(region);
+}
+
 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 {
     struct ioreq *ioreq = NULL;
@@ -421,7 +451,17 @@  static int ioreq_map(struct ioreq *ioreq)
             }
         }
     }
-    if (ioreq->blkdev->feature_persistent) {
+    if (ioreq->blkdev->feature_persistent && new_maps &&
+        (!batch_maps || (ioreq->blkdev->persistent_gnt_count + new_maps <=
+        ioreq->blkdev->max_grants))) {
+        /*
+         * If we are using persistent grants and batch mappings only
+         * add the new maps to the list of persistent grants if the whole
+         * area can be persistently mapped.
+         */
+        if (batch_maps && new_maps) {
+            add_persistent_region(ioreq->blkdev, ioreq->pages, new_maps);
+        }
         while ((ioreq->blkdev->persistent_gnt_count < ioreq->blkdev->max_grants)
               && new_maps) {
             /* Go through the list of newly mapped grants and add as many
@@ -437,6 +477,7 @@  static int ioreq_map(struct ioreq *ioreq)
                 grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
             } else {
                 grant->page = ioreq->page[new_maps];
+                add_persistent_region(ioreq->blkdev, ioreq->page[new_maps], 1);
             }
             grant->blkdev = ioreq->blkdev;
             xen_be_printf(&ioreq->blkdev->xendev, 3,
@@ -447,6 +488,7 @@  static int ioreq_map(struct ioreq *ioreq)
                           grant);
             ioreq->blkdev->persistent_gnt_count++;
         }
+        assert(!batch_maps || new_maps == 0);
     }
     for (i = 0; i < ioreq->v.niov; i++) {
         ioreq->v.iov[i].iov_base += (uintptr_t)page[i];
@@ -972,6 +1014,7 @@  static int blk_connect(struct XenDevice *xendev)
         blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
                                              NULL, NULL,
                                              (GDestroyNotify)destroy_grant);
+        blkdev->persistent_regions = NULL;
         blkdev->persistent_gnt_count = 0;
     }
 
@@ -1000,6 +1043,19 @@  static void blk_disconnect(struct XenDevice *xendev)
         blkdev->cnt_map--;
         blkdev->sring = NULL;
     }
+
+    /*
+     * Unmap persistent grants before switching to the closed state
+     * so the frontend can free them.
+     */
+    if (blkdev->feature_persistent) {
+        g_tree_destroy(blkdev->persistent_gnts);
+        assert(blkdev->persistent_gnt_count == 0);
+        g_slist_foreach(blkdev->persistent_regions,
+                        (GFunc)remove_persistent_region, blkdev);
+        g_slist_free(blkdev->persistent_regions);
+        blkdev->feature_persistent = false;
+    }
 }
 
 static int blk_free(struct XenDevice *xendev)
@@ -1011,11 +1067,6 @@  static int blk_free(struct XenDevice *xendev)
         blk_disconnect(xendev);
     }
 
-    /* Free persistent grants */
-    if (blkdev->feature_persistent) {
-        g_tree_destroy(blkdev->persistent_gnts);
-    }
-
     while (!QLIST_EMPTY(&blkdev->freelist)) {
         ioreq = QLIST_FIRST(&blkdev->freelist);
         QLIST_REMOVE(ioreq, list);