Patchwork [1/4] xen/netback: implements persistent grant with one page pool.

login
register
mail settings
Submitter annie.li@oracle.com
Date Nov. 15, 2012, 7:04 a.m.
Message ID <1352963066-570-1-git-send-email-annie.li@oracle.com>
Download mbox | patch
Permalink /patch/199194/
State Changes Requested
Delegated to: David Miller
Headers show

Comments

annie.li@oracle.com - Nov. 15, 2012, 7:04 a.m.
This patch implements persistent grant in netback driver. Tx and rx
share the same page pool, this pool will be split into two parts
in next patch.

Signed-off-by: Annie Li <annie.li@oracle.com>
---
 drivers/net/xen-netback/common.h    |   18 +++-
 drivers/net/xen-netback/interface.c |   22 ++++
 drivers/net/xen-netback/netback.c   |  212 +++++++++++++++++++++++++++++++----
 drivers/net/xen-netback/xenbus.c    |   14 ++-
 4 files changed, 239 insertions(+), 27 deletions(-)
Ian Campbell - Nov. 15, 2012, 9:10 a.m.
On Thu, 2012-11-15 at 07:04 +0000, Annie Li wrote:
> This patch implements persistent grant in netback driver. Tx and rx
> share the same page pool, this pool will be split into two parts
> in next patch.
> 
> Signed-off-by: Annie Li <annie.li@oracle.com>
> ---
>  drivers/net/xen-netback/common.h    |   18 +++-
>  drivers/net/xen-netback/interface.c |   22 ++++
>  drivers/net/xen-netback/netback.c   |  212 +++++++++++++++++++++++++++++++----
>  drivers/net/xen-netback/xenbus.c    |   14 ++-
>  4 files changed, 239 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
> index 94b79c3..a85cac6 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -45,8 +45,19 @@
>  #include <xen/grant_table.h>
>  #include <xen/xenbus.h>
> 
> +#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
> +#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
> +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \

BLOCK?

> +                       (XEN_NETIF_TX_RING_SIZE + XEN_NETIF_RX_RING_SIZE)
> +
>  struct xen_netbk;
> 
> +struct persistent_entry {
> +       grant_ref_t forgranted;
> +       struct page *fpage;
> +       struct gnttab_map_grant_ref map;
> +};

Isn't this duplicating a bunch of infrastructure which is also in
blkback? Can we put it into some common helpers please?

> +
>  struct xenvif {
>         /* Unique identifier for this interface. */
>         domid_t          domid;
> @@ -75,6 +86,7 @@ struct xenvif {
> 
>         /* Internal feature information. */
>         u8 can_queue:1;     /* can queue packets for receiver? */
> +       u8 persistent_grant:1;
> 
>         /*
>          * Allow xenvif_start_xmit() to peek ahead in the rx request
> @@ -98,6 +110,9 @@ struct xenvif {
>         struct net_device *dev;
> 
>         wait_queue_head_t waiting_to_free;
> +
> +       struct persistent_entry *persistent_gnt[MAXIMUM_OUTSTANDING_BLOCK_REQS];

What is the per-vif memory overhead of this array?

> +static struct persistent_entry*
> +get_per_gnt(struct persistent_entry **pers_entry,
> +           unsigned int count, grant_ref_t gref)
> +{
> +       int i;
> +
> +       for (i = 0; i < count; i++)
> +               if (gref == pers_entry[i]->forgranted)
> +                       return pers_entry[i];

Isn't this linear scan rather expensive? I think Roger implemented some
sort of hash lookup for blkback which I think is required here too (and
should be easy if you make that code common).

> +
> +       return NULL;
> +}
> +

> @@ -1338,7 +1497,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>                 gop->source.domid = vif->domid;
>                 gop->source.offset = txreq.offset;
> 
> -               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> +               if (!vif->persistent_grant)
> +                       gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> +               else
> +                       gop->dest.u.gmfn = (unsigned long)page_address(page);

page_address doesn't return any sort of frame number, does it? This is
rather confusing...

> @@ -453,7 +460,12 @@ static int connect_rings(struct backend_info *be)
>                 val = 0;
>         vif->csum = !val;
> 
> -       /* Map the shared frame, irq etc. */
> +       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-persistent-grants",
> +                        "%u", &val) < 0)
> +               val = 0;
> +       vif->persistent_grant = !!val;
> +
> +/* Map the shared frame, irq etc. */

Please run the patches through checkpatch.pl

>         err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref, evtchn);
>         if (err) {
>                 xenbus_dev_fatal(dev, err,
> --
> 1.7.3.4
> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roger Pau Monne - Nov. 15, 2012, 9:57 a.m.
On 15/11/12 08:04, Annie Li wrote:
> This patch implements persistent grant in netback driver. Tx and rx
> share the same page pool, this pool will be split into two parts
> in next patch.
> 
> Signed-off-by: Annie Li <annie.li@oracle.com>
> ---
>  drivers/net/xen-netback/common.h    |   18 +++-
>  drivers/net/xen-netback/interface.c |   22 ++++
>  drivers/net/xen-netback/netback.c   |  212 +++++++++++++++++++++++++++++++----
>  drivers/net/xen-netback/xenbus.c    |   14 ++-
>  4 files changed, 239 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
> index 94b79c3..a85cac6 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -45,8 +45,19 @@
>  #include <xen/grant_table.h>
>  #include <xen/xenbus.h>
> 
> +#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
> +#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
> +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
> +                       (XEN_NETIF_TX_RING_SIZE + XEN_NETIF_RX_RING_SIZE)
> +
>  struct xen_netbk;
> 
> +struct persistent_entry {
> +       grant_ref_t forgranted;
> +       struct page *fpage;
> +       struct gnttab_map_grant_ref map;
> +};

This should be common with the blkback implementation; I think we should
move some structures/functions from blkback to a common place. When I
implemented some functions in blkback I thought they could be reused by
other backends that wanted to use persistent grants, so I kept them free
of blkback-specific structures.

>  struct xenvif {
>         /* Unique identifier for this interface. */
>         domid_t          domid;
> @@ -75,6 +86,7 @@ struct xenvif {
> 
>         /* Internal feature information. */
>         u8 can_queue:1;     /* can queue packets for receiver? */
> +       u8 persistent_grant:1;
> 
>         /*
>          * Allow xenvif_start_xmit() to peek ahead in the rx request
> @@ -98,6 +110,9 @@ struct xenvif {
>         struct net_device *dev;
> 
>         wait_queue_head_t waiting_to_free;
> +
> +       struct persistent_entry *persistent_gnt[MAXIMUM_OUTSTANDING_BLOCK_REQS];
> +       unsigned int persistent_gntcnt;

This should be a red-black tree, which has the property of a search time
<= O(log n). Using an array is more expensive in terms of memory and has
a worse search time, O(n). This is especially interesting for netback,
which can have twice as many persistent grants as blkback (because two
rings are used).

Take a look at the following functions from blkback; foreach_grant,
add_persistent_gnt and get_persistent_gnt. They are generic functions to
deal with persistent grants.

>  };
> 
>  static inline struct xenbus_device *xenvif_to_xenbus_device(struct xenvif *vif)
> @@ -105,9 +120,6 @@ static inline struct xenbus_device *xenvif_to_xenbus_device(struct xenvif *vif)
>         return to_xenbus_device(vif->dev->dev.parent);
>  }
> 
> -#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
> -#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
> -
>  struct xenvif *xenvif_alloc(struct device *parent,
>                             domid_t domid,
>                             unsigned int handle);
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index b7d41f8..226d159 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -300,6 +300,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>                 return ERR_PTR(err);
>         }
> 
> +       vif->persistent_gntcnt = 0;
> +
>         netdev_dbg(dev, "Successfully created xenvif\n");
>         return vif;
>  }
> @@ -343,6 +345,23 @@ err:
>         return err;
>  }
> 
> +void xenvif_free_grants(struct persistent_entry **pers_entry,
> +                       unsigned int count)
> +{
> +       int i, ret;
> +       struct gnttab_unmap_grant_ref unmap;
> +
> +       for (i = 0; i < count; i++) {
> +               gnttab_set_unmap_op(&unmap,
> +                       (unsigned long)page_to_pfn(pers_entry[i]->fpage),
> +                       GNTMAP_host_map,
> +                       pers_entry[i]->map.handle);
> +               ret = gnttab_unmap_refs(&unmap, &pers_entry[i]->fpage,
> +                                       1, false);

This is not correct, you should call gnttab_set_unmap_op on a batch of
grants (up to BLKIF_MAX_SEGMENTS_PER_REQUEST), and then call
gnttab_unmap_refs on all of them. Here is a simple example (take a look
at blkback.c function xen_blkif_schedule to see an example with a
red-black tree, I think this part of the code should also be made common):

for (i = 0, segs_to_unmap = 0; i < count; i++) {
	gnttab_set_unmap_op(&unmap[segs_to_unmap],
		(unsigned long)page_to_pfn(pers_entry[i]->fpage),
		GNTMAP_host_map,
		pers_entry[i]->map.handle);
	pages[segs_to_unmap] =
		(unsigned long)page_to_pfn(pers_entry[i]->fpage);
	if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
		(i + 1) == count) {
		ret = gnttab_unmap_refs(unmap, NULL, pages,
			segs_to_unmap);
		BUG_ON(ret);
		segs_to_unmap = 0;
	}
}

> +               BUG_ON(ret);
> +       }
> +}
> +
>  void xenvif_disconnect(struct xenvif *vif)
>  {
>         struct net_device *dev = vif->dev;
> @@ -366,6 +385,9 @@ void xenvif_disconnect(struct xenvif *vif)
>         unregister_netdev(vif->dev);
> 
>         xen_netbk_unmap_frontend_rings(vif);
> +       if (vif->persistent_grant)
> +               xenvif_free_grants(vif->persistent_gnt,
> +                                  vif->persistent_gntcnt);
> 
>         free_netdev(vif->dev);
>  }
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 2596401..a26d3fc 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -80,6 +80,8 @@ union page_ext {
>         void *mapping;
>  };
> 
> +struct xenvif;
> +
>  struct xen_netbk {
>         wait_queue_head_t wq;
>         struct task_struct *task;
> @@ -102,6 +104,7 @@ struct xen_netbk {
> 
>         struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
>         struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
> +       struct xenvif *gnttab_tx_vif[MAX_PENDING_REQS];
> 
>         u16 pending_ring[MAX_PENDING_REQS];
> 
> @@ -111,12 +114,139 @@ struct xen_netbk {
>          * straddles two buffers in the frontend.
>          */
>         struct gnttab_copy grant_copy_op[2*XEN_NETIF_RX_RING_SIZE];
> +       struct xenvif *gnttab_rx_vif[2*XEN_NETIF_RX_RING_SIZE];
>         struct netbk_rx_meta meta[2*XEN_NETIF_RX_RING_SIZE];
>  };
> 
>  static struct xen_netbk *xen_netbk;
>  static int xen_netbk_group_nr;
> 
> +static struct persistent_entry*
> +get_per_gnt(struct persistent_entry **pers_entry,
> +           unsigned int count, grant_ref_t gref)
> +{
> +       int i;
> +
> +       for (i = 0; i < count; i++)
> +               if (gref == pers_entry[i]->forgranted)
> +                       return pers_entry[i];
> +
> +       return NULL;
> +}

This should be replaced with common code shared with all persistent
backends implementations.

> +
> +static void*
> +map_new_gnt(struct persistent_entry **pers_entry, unsigned int count,
> +           grant_ref_t ref, domid_t domid)
> +{
> +       struct gnttab_map_grant_ref *map;
> +       struct page *page;
> +       unsigned long vaddr;
> +       unsigned long pfn;
> +       uint32_t flags;
> +       int ret = 0;
> +
> +       pers_entry[count] = (struct persistent_entry *)
> +                           kmalloc(sizeof(struct persistent_entry),
> +                                   GFP_KERNEL);
> +       if (!pers_entry[count])
> +               return ERR_PTR(-ENOMEM);
> +
> +       map = &pers_entry[count]->map;
> +       page = alloc_page(GFP_KERNEL);
> +       if (!page) {
> +               kfree(pers_entry[count]);
> +               pers_entry[count] = NULL;
> +               return ERR_PTR(-ENOMEM);
> +       }
> +
> +       pers_entry[count]->fpage = page;
> +       pfn = page_to_pfn(page);
> +       vaddr = (unsigned long)pfn_to_kaddr(pfn);
> +       flags = GNTMAP_host_map;
> +
> +       gnttab_set_map_op(map, vaddr, flags, ref, domid);
> +       ret = gnttab_map_refs(map, NULL, &page, 1);
> +       BUG_ON(ret);

This is highly inefficient: one of the points of using gnttab_set_map_op
is that you can queue a bunch of grants and then map them at the same
time using gnttab_map_refs, but here you are using it to map a single
grant at a time. You should instead see how many grants you need to map
to complete the request and map them all at the same time.

> +
> +       pers_entry[count]->forgranted = ref;
> +
> +       return page_address(page);
> +}
> +
> +static void*
> +get_ref_page(struct persistent_entry **pers_entry, unsigned int *count,
> +            grant_ref_t ref, domid_t domid, unsigned int total)
> +{
> +       struct persistent_entry *per_gnt;
> +       void *vaddr;
> +
> +       per_gnt = get_per_gnt(pers_entry, *count, ref);
> +
> +       if (per_gnt != NULL)
> +               return page_address(per_gnt->fpage);
> +       else {
> +               BUG_ON(*count >= total);
> +               vaddr = map_new_gnt(pers_entry, *count, ref, domid);
> +               if (IS_ERR_OR_NULL(vaddr))
> +                       return vaddr;
> +               *count += 1;
> +               return vaddr;
> +       }
> +}
> +
> +static int
> +grant_memory_copy_op(unsigned int cmd, void *vuop, unsigned int count,
> +                    struct xen_netbk *netbk, bool tx_pool)
> +{
> +       int i;
> +       struct xenvif *vif;
> +       struct gnttab_copy *uop = vuop;
> +       unsigned int *gnt_count;
> +       unsigned int gnt_total;
> +       struct persistent_entry **pers_entry;
> +       int ret = 0;
> +
> +       BUG_ON(cmd != GNTTABOP_copy);
> +       for (i = 0; i < count; i++) {
> +               if (tx_pool)
> +                       vif = netbk->gnttab_tx_vif[i];
> +               else
> +                       vif = netbk->gnttab_rx_vif[i];
> +
> +               pers_entry = vif->persistent_gnt;
> +               gnt_count = &vif->persistent_gntcnt;
> +               gnt_total = MAXIMUM_OUTSTANDING_BLOCK_REQS;
> +
> +               if (vif->persistent_grant) {
> +                       void *saddr, *daddr;
> +
> +                       saddr = uop[i].source.domid == DOMID_SELF ?
> +                               (void *) uop[i].source.u.gmfn :
> +                               get_ref_page(pers_entry, gnt_count,
> +                                            uop[i].source.u.ref,
> +                                            uop[i].source.domid,
> +                                            gnt_total);
> +                       if (IS_ERR_OR_NULL(saddr))
> +                               return -ENOMEM;
> +
> +                       daddr = uop[i].dest.domid == DOMID_SELF ?
> +                               (void *) uop[i].dest.u.gmfn :
> +                               get_ref_page(pers_entry, gnt_count,
> +                                            uop[i].dest.u.ref,
> +                                            uop[i].dest.domid,
> +                                            gnt_total);
> +                       if (IS_ERR_OR_NULL(daddr))
> +                               return -ENOMEM;
> +
> +                       memcpy(daddr+uop[i].dest.offset,
> +                              saddr+uop[i].source.offset, uop[i].len);
> +               } else
> +                       ret = HYPERVISOR_grant_table_op(cmd, &uop[i], 1);
> +       }
> +
> +       return ret;
> +}
> +
>  void xen_netbk_add_xenvif(struct xenvif *vif)
>  {
>         int i;
> @@ -387,13 +517,15 @@ static struct netbk_rx_meta *get_next_rx_buffer(struct xenvif *vif,
>   * Set up the grant operations for this fragment. If it's a flipping
>   * interface, we also set up the unmap request from here.
>   */
> -static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
> -                               struct netrx_pending_operations *npo,
> -                               struct page *page, unsigned long size,
> -                               unsigned long offset, int *head)
> +static int netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
> +                              struct netrx_pending_operations *npo,
> +                              struct page *page, unsigned long size,
> +                              unsigned long offset, int *head,
> +                              struct xenvif **grxvif)
>  {
>         struct gnttab_copy *copy_gop;
>         struct netbk_rx_meta *meta;
> +       int count = 0;
>         /*
>          * These variables are used iff get_page_ext returns true,
>          * in which case they are guaranteed to be initialized.
> @@ -425,6 +557,10 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>                         bytes = MAX_BUFFER_OFFSET - npo->copy_off;
> 
>                 copy_gop = npo->copy + npo->copy_prod++;
> +               *grxvif = vif;
> +               grxvif++;
> +               count++;
> +
>                 copy_gop->flags = GNTCOPY_dest_gref;
>                 if (foreign) {
>                         struct xen_netbk *netbk = &xen_netbk[group];
> @@ -438,7 +574,10 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>                 } else {
>                         void *vaddr = page_address(page);
>                         copy_gop->source.domid = DOMID_SELF;
> -                       copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
> +                       if (!vif->persistent_grant)
> +                               copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
> +                       else
> +                               copy_gop->source.u.gmfn = (unsigned long)vaddr;
>                 }
>                 copy_gop->source.offset = offset;
>                 copy_gop->dest.domid = vif->domid;
> @@ -460,6 +599,7 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>                 *head = 0; /* There must be something in this buffer now. */
> 
>         }
> +       return count;
>  }
> 
>  /*
> @@ -474,8 +614,10 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>   * zero GSO descriptors (for non-GSO packets) or one descriptor (for
>   * frontend-side LRO).
>   */
> -static int netbk_gop_skb(struct sk_buff *skb,
> -                        struct netrx_pending_operations *npo)
> +static int netbk_gop_skb(struct xen_netbk *netbk,
> +                        struct sk_buff *skb,
> +                        struct netrx_pending_operations *npo,
> +                        struct xenvif **grxvif)
>  {
>         struct xenvif *vif = netdev_priv(skb->dev);
>         int nr_frags = skb_shinfo(skb)->nr_frags;
> @@ -518,17 +660,19 @@ static int netbk_gop_skb(struct sk_buff *skb,
>                 if (data + len > skb_tail_pointer(skb))
>                         len = skb_tail_pointer(skb) - data;
> 
> -               netbk_gop_frag_copy(vif, skb, npo,
> -                                   virt_to_page(data), len, offset, &head);
> +               grxvif += netbk_gop_frag_copy(vif, skb, npo,
> +                                             virt_to_page(data), len,
> +                                             offset, &head, grxvif);
> +
>                 data += len;
>         }
> 
>         for (i = 0; i < nr_frags; i++) {
> -               netbk_gop_frag_copy(vif, skb, npo,
> -                                   skb_frag_page(&skb_shinfo(skb)->frags[i]),
> -                                   skb_frag_size(&skb_shinfo(skb)->frags[i]),
> -                                   skb_shinfo(skb)->frags[i].page_offset,
> -                                   &head);
> +               grxvif += netbk_gop_frag_copy(vif, skb, npo,
> +                               skb_frag_page(&skb_shinfo(skb)->frags[i]),
> +                               skb_frag_size(&skb_shinfo(skb)->frags[i]),
> +                               skb_shinfo(skb)->frags[i].page_offset,
> +                               &head, grxvif);
>         }
> 
>         return npo->meta_prod - old_meta_prod;
> @@ -593,6 +737,8 @@ struct skb_cb_overlay {
>  static void xen_netbk_rx_action(struct xen_netbk *netbk)
>  {
>         struct xenvif *vif = NULL, *tmp;
> +       struct xenvif **grxvif = netbk->gnttab_rx_vif;
> +       int old_copy_prod = 0;
>         s8 status;
>         u16 irq, flags;
>         struct xen_netif_rx_response *resp;
> @@ -619,7 +765,9 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
>                 nr_frags = skb_shinfo(skb)->nr_frags;
> 
>                 sco = (struct skb_cb_overlay *)skb->cb;
> -               sco->meta_slots_used = netbk_gop_skb(skb, &npo);
> +               sco->meta_slots_used = netbk_gop_skb(netbk, skb, &npo, grxvif);
> +               grxvif += npo.copy_prod - old_copy_prod;
> +               old_copy_prod = npo.copy_prod;
> 
>                 count += nr_frags + 1;
> 
> @@ -636,8 +784,10 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
>                 return;
> 
>         BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
> -       ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op,
> -                                       npo.copy_prod);
> +       ret = grant_memory_copy_op(GNTTABOP_copy,
> +                                  &netbk->grant_copy_op,
> +                                  npo.copy_prod, netbk,
> +                                  false);
>         BUG_ON(ret != 0);
> 
>         while ((skb = __skb_dequeue(&rxq)) != NULL) {
> @@ -918,7 +1068,8 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>                                                   struct xenvif *vif,
>                                                   struct sk_buff *skb,
>                                                   struct xen_netif_tx_request *txp,
> -                                                 struct gnttab_copy *gop)
> +                                                 struct gnttab_copy *gop,
> +                                                 struct xenvif **gtxvif)
>  {
>         struct skb_shared_info *shinfo = skb_shinfo(skb);
>         skb_frag_t *frags = shinfo->frags;
> @@ -944,7 +1095,11 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>                 gop->source.domid = vif->domid;
>                 gop->source.offset = txp->offset;
> 
> -               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> +               if (!vif->persistent_grant)
> +                       gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> +               else
> +                       gop->dest.u.gmfn = (unsigned long)page_address(page);
> +
>                 gop->dest.domid = DOMID_SELF;
>                 gop->dest.offset = txp->offset;
> 
> @@ -952,6 +1107,9 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>                 gop->flags = GNTCOPY_source_gref;
> 
>                 gop++;
> +               *gtxvif = vif;
> +               gtxvif++;
> +
> 
>                 memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
>                 xenvif_get(vif);
> @@ -1218,6 +1376,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
>  static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>  {
>         struct gnttab_copy *gop = netbk->tx_copy_ops, *request_gop;
> +       struct xenvif **gtxvif = netbk->gnttab_tx_vif;
>         struct sk_buff *skb;
>         int ret;
> 
> @@ -1338,7 +1497,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>                 gop->source.domid = vif->domid;
>                 gop->source.offset = txreq.offset;
> 
> -               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> +               if (!vif->persistent_grant)
> +                       gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> +               else
> +                       gop->dest.u.gmfn = (unsigned long)page_address(page);
> +
>                 gop->dest.domid = DOMID_SELF;
>                 gop->dest.offset = txreq.offset;
> 
> @@ -1346,6 +1509,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>                 gop->flags = GNTCOPY_source_gref;
> 
>                 gop++;
> +               *gtxvif++ = vif;
> 
>                 memcpy(&netbk->pending_tx_info[pending_idx].req,
>                        &txreq, sizeof(txreq));
> @@ -1369,12 +1533,13 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>                 netbk->pending_cons++;
> 
>                 request_gop = xen_netbk_get_requests(netbk, vif,
> -                                                    skb, txfrags, gop);
> +                                                    skb, txfrags, gop, gtxvif);
>                 if (request_gop == NULL) {
>                         kfree_skb(skb);
>                         netbk_tx_err(vif, &txreq, idx);
>                         continue;
>                 }
> +               gtxvif += request_gop - gop;
>                 gop = request_gop;
> 
>                 vif->tx.req_cons = idx;
> @@ -1467,8 +1632,9 @@ static void xen_netbk_tx_action(struct xen_netbk *netbk)
> 
>         if (nr_gops == 0)
>                 return;
> -       ret = HYPERVISOR_grant_table_op(GNTTABOP_copy,
> -                                       netbk->tx_copy_ops, nr_gops);
> +       ret = grant_memory_copy_op(GNTTABOP_copy,
> +                                  netbk->tx_copy_ops, nr_gops,
> +                                  netbk, true);
>         BUG_ON(ret);
> 
>         xen_netbk_tx_submit(netbk);
> diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
> index 410018c..938e908 100644
> --- a/drivers/net/xen-netback/xenbus.c
> +++ b/drivers/net/xen-netback/xenbus.c
> @@ -114,6 +114,13 @@ static int netback_probe(struct xenbus_device *dev,
>                         goto abort_transaction;
>                 }
> 
> +               err = xenbus_printf(xbt, dev->nodename,
> +                                   "feature-persistent-grants", "%u", 1);
> +               if (err) {
> +                       message = "writing feature-persistent-grants";
> +                       goto abort_transaction;
> +               }
> +
>                 err = xenbus_transaction_end(xbt, 0);
>         } while (err == -EAGAIN);
> 
> @@ -453,7 +460,12 @@ static int connect_rings(struct backend_info *be)
>                 val = 0;
>         vif->csum = !val;
> 
> -       /* Map the shared frame, irq etc. */
> +       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-persistent-grants",
> +                        "%u", &val) < 0)

In block devices "feature-persistent" is used, so I think that for
clearness it should be announced the same way in net.

> +               val = 0;
> +       vif->persistent_grant = !!val;
> +
> +/* Map the shared frame, irq etc. */
>         err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref, evtchn);
>         if (err) {
>                 xenbus_dev_fatal(dev, err,
> --
> 1.7.3.4
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
annie.li@oracle.com - Nov. 16, 2012, 2:18 a.m.
On 2012-11-15 17:10, Ian Campbell wrote:
> On Thu, 2012-11-15 at 07:04 +0000, Annie Li wrote:
>> This patch implements persistent grant in netback driver. Tx and rx
>> share the same page pool, this pool will be split into two parts
>> in next patch.
>>
>> Signed-off-by: Annie Li<annie.li@oracle.com>
>> ---
>>   drivers/net/xen-netback/common.h    |   18 +++-
>>   drivers/net/xen-netback/interface.c |   22 ++++
>>   drivers/net/xen-netback/netback.c   |  212 +++++++++++++++++++++++++++++++----
>>   drivers/net/xen-netback/xenbus.c    |   14 ++-
>>   4 files changed, 239 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
>> index 94b79c3..a85cac6 100644
>> --- a/drivers/net/xen-netback/common.h
>> +++ b/drivers/net/xen-netback/common.h
>> @@ -45,8 +45,19 @@
>>   #include<xen/grant_table.h>
>>   #include<xen/xenbus.h>
>>
>> +#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
>> +#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
>> +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
> BLOCK?

Oh, an error when splitting the patch, will fix it, thanks.

>
>> +                       (XEN_NETIF_TX_RING_SIZE + XEN_NETIF_RX_RING_SIZE)
>> +
>>   struct xen_netbk;
>>
>> +struct persistent_entry {
>> +       grant_ref_t forgranted;
>> +       struct page *fpage;
>> +       struct gnttab_map_grant_ref map;
>> +};
> Isn't this duplicating a bunch of infrastructure which is also in
> blkback? Can we put it into some common helpers please?

Yes,

"struct gnttab_map_grant_ref map" can be changed to handle like blkback to keep same as blkback, and share them in common helpers.


>
>> +
>>   struct xenvif {
>>          /* Unique identifier for this interface. */
>>          domid_t          domid;
>> @@ -75,6 +86,7 @@ struct xenvif {
>>
>>          /* Internal feature information. */
>>          u8 can_queue:1;     /* can queue packets for receiver? */
>> +       u8 persistent_grant:1;
>>
>>          /*
>>           * Allow xenvif_start_xmit() to peek ahead in the rx request
>> @@ -98,6 +110,9 @@ struct xenvif {
>>          struct net_device *dev;
>>
>>          wait_queue_head_t waiting_to_free;
>> +
>> +       struct persistent_entry *persistent_gnt[MAXIMUM_OUTSTANDING_BLOCK_REQS];
> What is the per-vif memory overhead of this array?
In this patch,
The maximum of memory overhead is about

(XEN_NETIF_TX_RING_SIZE+XEN_NETIF_RX_RING_SIZE)*PAGE_SIZE  (plus size of grant_ref_t and handle)
which is about 512 PAGE_SIZE. Normally, without heavy network offload, this maximum cannot be reached.

In next patch of splitting tx/rx pool, the maximum is about (256+512)PAGE_SIZE.

>
>> +static struct persistent_entry*
>> +get_per_gnt(struct persistent_entry **pers_entry,
>> +           unsigned int count, grant_ref_t gref)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i<  count; i++)
>> +               if (gref == pers_entry[i]->forgranted)
>> +                       return pers_entry[i];
> Isn't this linear scan rather expensive? I think Roger implemented some
> sort of hash lookup for blkback which I think is required here too (and
> should be easy if you make that code common).

Agree, thanks.

>
>> +
>> +       return NULL;
>> +}
>> +
>> @@ -1338,7 +1497,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>>                  gop->source.domid = vif->domid;
>>                  gop->source.offset = txreq.offset;
>>
>> -               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>> +               if (!vif->persistent_grant)
>> +                       gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>> +               else
>> +                       gop->dest.u.gmfn = (unsigned long)page_address(page);
> page_address doesn't return any sort of frame number, does it? This is
> rather confusing...

Yes. I only use the dest.u.gmfn element to save the page address here for
a later memcpy; it is not actually meant to hold a frame number. To
avoid confusion, I can use

gop->dest.u.gmfn = virt_to_mfn(page_address(page));

here and then call mfn_to_virt when doing the memcpy.


>
>> @@ -453,7 +460,12 @@ static int connect_rings(struct backend_info *be)
>>                  val = 0;
>>          vif->csum = !val;
>>
>> -       /* Map the shared frame, irq etc. */
>> +       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-persistent-grants",
>> +                        "%u",&val)<  0)
>> +               val = 0;
>> +       vif->persistent_grant = !!val;
>> +
>> +/* Map the shared frame, irq etc. */
> Please run the patches through checkpatch.pl

Yes, I ran checkpatch.pl before posting them. The only warning is in the
original netfront.c code; it is a printk call in xennet_tx_buf_gc, and I
did not fix that.

Thanks
Annie
>
>>          err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref, evtchn);
>>          if (err) {
>>                  xenbus_dev_fatal(dev, err,
>> --
>> 1.7.3.4
>>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
annie.li@oracle.com - Nov. 16, 2012, 2:49 a.m.
On 2012-11-15 17:57, Roger Pau Monné wrote:
> On 15/11/12 08:04, Annie Li wrote:
>> This patch implements persistent grant in netback driver. Tx and rx
>> share the same page pool, this pool will be split into two parts
>> in next patch.
>>
>> Signed-off-by: Annie Li<annie.li@oracle.com>
>> ---
>>   drivers/net/xen-netback/common.h    |   18 +++-
>>   drivers/net/xen-netback/interface.c |   22 ++++
>>   drivers/net/xen-netback/netback.c   |  212 +++++++++++++++++++++++++++++++----
>>   drivers/net/xen-netback/xenbus.c    |   14 ++-
>>   4 files changed, 239 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
>> index 94b79c3..a85cac6 100644
>> --- a/drivers/net/xen-netback/common.h
>> +++ b/drivers/net/xen-netback/common.h
>> @@ -45,8 +45,19 @@
>>   #include<xen/grant_table.h>
>>   #include<xen/xenbus.h>
>>
>> +#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
>> +#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
>> +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
>> +                       (XEN_NETIF_TX_RING_SIZE + XEN_NETIF_RX_RING_SIZE)
>> +
>>   struct xen_netbk;
>>
>> +struct persistent_entry {
>> +       grant_ref_t forgranted;
>> +       struct page *fpage;
>> +       struct gnttab_map_grant_ref map;
>> +};
> This should be common with the blkback implementation, I think we should
> move some structures/functions from blkback to a common place. When I
> implementated some functions in blkback I though they could be reused by
> other backends that wanted to use persistent grants, so I keep them free
> of blkback specific structures.

Good idea, thanks.

>
>>   struct xenvif {
>>          /* Unique identifier for this interface. */
>>          domid_t          domid;
>> @@ -75,6 +86,7 @@ struct xenvif {
>>
>>          /* Internal feature information. */
>>          u8 can_queue:1;     /* can queue packets for receiver? */
>> +       u8 persistent_grant:1;
>>
>>          /*
>>           * Allow xenvif_start_xmit() to peek ahead in the rx request
>> @@ -98,6 +110,9 @@ struct xenvif {
>>          struct net_device *dev;
>>
>>          wait_queue_head_t waiting_to_free;
>> +
>> +       struct persistent_entry *persistent_gnt[MAXIMUM_OUTSTANDING_BLOCK_REQS];
>> +       unsigned int persistent_gntcnt;
> This should be a red-black tree, which has the property of a search time
> <= O(log n), using an array is more expensive in terms of memory and has
> a worse search time O(n), this is specially interesting for netback,
> which can have twice as much persistent grants as blkback (because two
> rings are used).

Right, thanks.

>
> Take a look at the following functions from blkback; foreach_grant,
> add_persistent_gnt and get_persistent_gnt. They are generic functions to
> deal with persistent grants.

Ok, thanks.
Or should we move those functions into a separate common file?

>
>>   };
>>
>>   static inline struct xenbus_device *xenvif_to_xenbus_device(struct xenvif *vif)
>> @@ -105,9 +120,6 @@ static inline struct xenbus_device *xenvif_to_xenbus_device(struct xenvif *vif)
>>          return to_xenbus_device(vif->dev->dev.parent);
>>   }
>>
>> -#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
>> -#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
>> -
>>   struct xenvif *xenvif_alloc(struct device *parent,
>>                              domid_t domid,
>>                              unsigned int handle);
>> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
>> index b7d41f8..226d159 100644
>> --- a/drivers/net/xen-netback/interface.c
>> +++ b/drivers/net/xen-netback/interface.c
>> @@ -300,6 +300,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>>                  return ERR_PTR(err);
>>          }
>>
>> +       vif->persistent_gntcnt = 0;
>> +
>>          netdev_dbg(dev, "Successfully created xenvif\n");
>>          return vif;
>>   }
>> @@ -343,6 +345,23 @@ err:
>>          return err;
>>   }
>>
>> +void xenvif_free_grants(struct persistent_entry **pers_entry,
>> +                       unsigned int count)
>> +{
>> +       int i, ret;
>> +       struct gnttab_unmap_grant_ref unmap;
>> +
>> +       for (i = 0; i<  count; i++) {
>> +               gnttab_set_unmap_op(&unmap,
>> +                       (unsigned long)page_to_pfn(pers_entry[i]->fpage),
>> +                       GNTMAP_host_map,
>> +                       pers_entry[i]->map.handle);
>> +               ret = gnttab_unmap_refs(&unmap,&pers_entry[i]->fpage,
>> +                                       1, false);
> This is not correct, you should call gnttab_set_unmap_op on a batch of
> grants (up to BLKIF_MAX_SEGMENTS_PER_REQUEST), and then call
> gnttab_unmap_refs on all of them. Here is a simple example (take a look
> at blkback.c function xen_blkif_schedule to see an example with a
> red-black tree, I think this part of the code should also be made common):
>
> for (i = 0, segs_to_unmap = 0; i<  count; i++) {
> 	gnttab_set_unmap_op(&unmap[segs_to_unmap],
> 		(unsigned long)page_to_pfn(pers_entry[i]->fpage),
> 		GNTMAP_host_map,
> 		pers_entry[i]->map.handle);
> 	pages[segs_to_unmap] =
> 		(unsigned long)page_to_pfn(pers_entry[i]->fpage);
> 	if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
> 		(i + 1) == count) {
> 		ret = gnttab_unmap_refs(unmap, NULL, pages,
> 			segs_to_unmap);
> 		BUG_ON(ret);
> 		segs_to_unmap == 0;
> 	}
> }

Got it, thanks.

>
>> +               BUG_ON(ret);
>> +       }
>> +}
>> +
>>   void xenvif_disconnect(struct xenvif *vif)
>>   {
>>          struct net_device *dev = vif->dev;
>> @@ -366,6 +385,9 @@ void xenvif_disconnect(struct xenvif *vif)
>>          unregister_netdev(vif->dev);
>>
>>          xen_netbk_unmap_frontend_rings(vif);
>> +       if (vif->persistent_grant)
>> +               xenvif_free_grants(vif->persistent_gnt,
>> +                                  vif->persistent_gntcnt);
>>
>>          free_netdev(vif->dev);
>>   }
>> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
>> index 2596401..a26d3fc 100644
>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -80,6 +80,8 @@ union page_ext {
>>          void *mapping;
>>   };
>>
>> +struct xenvif;
>> +
>>   struct xen_netbk {
>>          wait_queue_head_t wq;
>>          struct task_struct *task;
>> @@ -102,6 +104,7 @@ struct xen_netbk {
>>
>>          struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
>>          struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
>> +       struct xenvif *gnttab_tx_vif[MAX_PENDING_REQS];
>>
>>          u16 pending_ring[MAX_PENDING_REQS];
>>
>> @@ -111,12 +114,139 @@ struct xen_netbk {
>>           * straddles two buffers in the frontend.
>>           */
>>          struct gnttab_copy grant_copy_op[2*XEN_NETIF_RX_RING_SIZE];
>> +       struct xenvif *gnttab_rx_vif[2*XEN_NETIF_RX_RING_SIZE];
>>          struct netbk_rx_meta meta[2*XEN_NETIF_RX_RING_SIZE];
>>   };
>>
>>   static struct xen_netbk *xen_netbk;
>>   static int xen_netbk_group_nr;
>>
>> +static struct persistent_entry*
>> +get_per_gnt(struct persistent_entry **pers_entry,
>> +           unsigned int count, grant_ref_t gref)
>> +{
>> +       int i;
>> +
>> +       for (i = 0; i<  count; i++)
>> +               if (gref == pers_entry[i]->forgranted)
>> +                       return pers_entry[i];
>> +
>> +       return NULL;
>> +}
> This should be replaced with common code shared with all persistent
> backends implementations.

Ok, thanks.

>
>> +
>> +static void*
>> +map_new_gnt(struct persistent_entry **pers_entry, unsigned int count,
>> +           grant_ref_t ref, domid_t domid)
>> +{
>> +       struct gnttab_map_grant_ref *map;
>> +       struct page *page;
>> +       unsigned long vaddr;
>> +       unsigned long pfn;
>> +       uint32_t flags;
>> +       int ret = 0;
>> +
>> +       pers_entry[count] = (struct persistent_entry *)
>> +                           kmalloc(sizeof(struct persistent_entry),
>> +                                   GFP_KERNEL);
>> +       if (!pers_entry[count])
>> +               return ERR_PTR(-ENOMEM);
>> +
>> +       map =&pers_entry[count]->map;
>> +       page = alloc_page(GFP_KERNEL);
>> +       if (!page) {
>> +               kfree(pers_entry[count]);
>> +               pers_entry[count] = NULL;
>> +               return ERR_PTR(-ENOMEM);
>> +       }
>> +
>> +       pers_entry[count]->fpage = page;
>> +       pfn = page_to_pfn(page);
>> +       vaddr = (unsigned long)pfn_to_kaddr(pfn);
>> +       flags = GNTMAP_host_map;
>> +
>> +       gnttab_set_map_op(map, vaddr, flags, ref, domid);
>> +       ret = gnttab_map_refs(map, NULL,&page, 1);
>> +       BUG_ON(ret);
> This is highly inefficient, one of the points of using gnttab_set_map_op
> is that you can queue a bunch of grants, and then map them at the same
> time using gnttab_map_refs, but here you are using it to map a single
> grant at a time. You should instead see how much grants you need to map
> to complete the request and map them all at the same time.

Yes, it is inefficient here, but this is limited by the current netback
implementation. The current netback is not per-VIF based (unlike
blkback). After combining persistent grants and non-persistent grants
together, each vif request in the queue may or may not support
persistent grants, so I have to check whether every vif in the queue
supports persistent grants. If it does, memcpy is used; if not,
grant copy is used.
Once netback is made per-VIF, this issue can be fixed.

>
>> +
>> +       pers_entry[count]->forgranted = ref;
>> +
>> +       return page_address(page);
>> +}
>> +
>> +static void*
>> +get_ref_page(struct persistent_entry **pers_entry, unsigned int *count,
>> +            grant_ref_t ref, domid_t domid, unsigned int total)
>> +{
>> +       struct persistent_entry *per_gnt;
>> +       void *vaddr;
>> +
>> +       per_gnt = get_per_gnt(pers_entry, *count, ref);
>> +
>> +       if (per_gnt != NULL)
>> +               return page_address(per_gnt->fpage);
>> +       else {
>> +               BUG_ON(*count>= total);
>> +               vaddr = map_new_gnt(pers_entry, *count, ref, domid);
>> +               if (IS_ERR_OR_NULL(vaddr))
>> +                       return vaddr;
>> +               *count += 1;
>> +               return vaddr;
>> +       }
>> +}
>> +
>> +static int
>> +grant_memory_copy_op(unsigned int cmd, void *vuop, unsigned int count,
>> +                    struct xen_netbk *netbk, bool tx_pool)
>> +{
>> +       int i;
>> +       struct xenvif *vif;
>> +       struct gnttab_copy *uop = vuop;
>> +       unsigned int *gnt_count;
>> +       unsigned int gnt_total;
>> +       struct persistent_entry **pers_entry;
>> +       int ret = 0;
>> +
>> +       BUG_ON(cmd != GNTTABOP_copy);
>> +       for (i = 0; i<  count; i++) {
>> +               if (tx_pool)
>> +                       vif = netbk->gnttab_tx_vif[i];
>> +               else
>> +                       vif = netbk->gnttab_rx_vif[i];
>> +
>> +               pers_entry = vif->persistent_gnt;
>> +               gnt_count =&vif->persistent_gntcnt;
>> +               gnt_total = MAXIMUM_OUTSTANDING_BLOCK_REQS;
>> +
>> +               if (vif->persistent_grant) {
>> +                       void *saddr, *daddr;
>> +
>> +                       saddr = uop[i].source.domid == DOMID_SELF ?
>> +                               (void *) uop[i].source.u.gmfn :
>> +                               get_ref_page(pers_entry, gnt_count,
>> +                                            uop[i].source.u.ref,
>> +                                            uop[i].source.domid,
>> +                                            gnt_total);
>> +                       if (IS_ERR_OR_NULL(saddr))
>> +                               return -ENOMEM;
>> +
>> +                       daddr = uop[i].dest.domid == DOMID_SELF ?
>> +                               (void *) uop[i].dest.u.gmfn :
>> +                               get_ref_page(pers_entry, gnt_count,
>> +                                            uop[i].dest.u.ref,
>> +                                            uop[i].dest.domid,
>> +                                            gnt_total);
>> +                       if (IS_ERR_OR_NULL(daddr))
>> +                               return -ENOMEM;
>> +
>> +                       memcpy(daddr+uop[i].dest.offset,
>> +                              saddr+uop[i].source.offset, uop[i].len);
>> +               } else
>> +                       ret = HYPERVISOR_grant_table_op(cmd,&uop[i], 1);
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>>   void xen_netbk_add_xenvif(struct xenvif *vif)
>>   {
>>          int i;
>> @@ -387,13 +517,15 @@ static struct netbk_rx_meta *get_next_rx_buffer(struct xenvif *vif,
>>    * Set up the grant operations for this fragment. If it's a flipping
>>    * interface, we also set up the unmap request from here.
>>    */
>> -static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>> -                               struct netrx_pending_operations *npo,
>> -                               struct page *page, unsigned long size,
>> -                               unsigned long offset, int *head)
>> +static int netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>> +                              struct netrx_pending_operations *npo,
>> +                              struct page *page, unsigned long size,
>> +                              unsigned long offset, int *head,
>> +                              struct xenvif **grxvif)
>>   {
>>          struct gnttab_copy *copy_gop;
>>          struct netbk_rx_meta *meta;
>> +       int count = 0;
>>          /*
>>           * These variables are used iff get_page_ext returns true,
>>           * in which case they are guaranteed to be initialized.
>> @@ -425,6 +557,10 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>>                          bytes = MAX_BUFFER_OFFSET - npo->copy_off;
>>
>>                  copy_gop = npo->copy + npo->copy_prod++;
>> +               *grxvif = vif;
>> +               grxvif++;
>> +               count++;
>> +
>>                  copy_gop->flags = GNTCOPY_dest_gref;
>>                  if (foreign) {
>>                          struct xen_netbk *netbk =&xen_netbk[group];
>> @@ -438,7 +574,10 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>>                  } else {
>>                          void *vaddr = page_address(page);
>>                          copy_gop->source.domid = DOMID_SELF;
>> -                       copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
>> +                       if (!vif->persistent_grant)
>> +                               copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
>> +                       else
>> +                               copy_gop->source.u.gmfn = (unsigned long)vaddr;
>>                  }
>>                  copy_gop->source.offset = offset;
>>                  copy_gop->dest.domid = vif->domid;
>> @@ -460,6 +599,7 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>>                  *head = 0; /* There must be something in this buffer now. */
>>
>>          }
>> +       return count;
>>   }
>>
>>   /*
>> @@ -474,8 +614,10 @@ static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
>>    * zero GSO descriptors (for non-GSO packets) or one descriptor (for
>>    * frontend-side LRO).
>>    */
>> -static int netbk_gop_skb(struct sk_buff *skb,
>> -                        struct netrx_pending_operations *npo)
>> +static int netbk_gop_skb(struct xen_netbk *netbk,
>> +                        struct sk_buff *skb,
>> +                        struct netrx_pending_operations *npo,
>> +                        struct xenvif **grxvif)
>>   {
>>          struct xenvif *vif = netdev_priv(skb->dev);
>>          int nr_frags = skb_shinfo(skb)->nr_frags;
>> @@ -518,17 +660,19 @@ static int netbk_gop_skb(struct sk_buff *skb,
>>                  if (data + len>  skb_tail_pointer(skb))
>>                          len = skb_tail_pointer(skb) - data;
>>
>> -               netbk_gop_frag_copy(vif, skb, npo,
>> -                                   virt_to_page(data), len, offset,&head);
>> +               grxvif += netbk_gop_frag_copy(vif, skb, npo,
>> +                                             virt_to_page(data), len,
>> +                                             offset,&head, grxvif);
>> +
>>                  data += len;
>>          }
>>
>>          for (i = 0; i<  nr_frags; i++) {
>> -               netbk_gop_frag_copy(vif, skb, npo,
>> -                                   skb_frag_page(&skb_shinfo(skb)->frags[i]),
>> -                                   skb_frag_size(&skb_shinfo(skb)->frags[i]),
>> -                                   skb_shinfo(skb)->frags[i].page_offset,
>> -&head);
>> +               grxvif += netbk_gop_frag_copy(vif, skb, npo,
>> +                               skb_frag_page(&skb_shinfo(skb)->frags[i]),
>> +                               skb_frag_size(&skb_shinfo(skb)->frags[i]),
>> +                               skb_shinfo(skb)->frags[i].page_offset,
>> +&head, grxvif);
>>          }
>>
>>          return npo->meta_prod - old_meta_prod;
>> @@ -593,6 +737,8 @@ struct skb_cb_overlay {
>>   static void xen_netbk_rx_action(struct xen_netbk *netbk)
>>   {
>>          struct xenvif *vif = NULL, *tmp;
>> +       struct xenvif **grxvif = netbk->gnttab_rx_vif;
>> +       int old_copy_prod = 0;
>>          s8 status;
>>          u16 irq, flags;
>>          struct xen_netif_rx_response *resp;
>> @@ -619,7 +765,9 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
>>                  nr_frags = skb_shinfo(skb)->nr_frags;
>>
>>                  sco = (struct skb_cb_overlay *)skb->cb;
>> -               sco->meta_slots_used = netbk_gop_skb(skb,&npo);
>> +               sco->meta_slots_used = netbk_gop_skb(netbk, skb,&npo, grxvif);
>> +               grxvif += npo.copy_prod - old_copy_prod;
>> +               old_copy_prod = npo.copy_prod;
>>
>>                  count += nr_frags + 1;
>>
>> @@ -636,8 +784,10 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
>>                  return;
>>
>>          BUG_ON(npo.copy_prod>  ARRAY_SIZE(netbk->grant_copy_op));
>> -       ret = HYPERVISOR_grant_table_op(GNTTABOP_copy,&netbk->grant_copy_op,
>> -                                       npo.copy_prod);
>> +       ret = grant_memory_copy_op(GNTTABOP_copy,
>> +&netbk->grant_copy_op,
>> +                                  npo.copy_prod, netbk,
>> +                                  false);
>>          BUG_ON(ret != 0);
>>
>>          while ((skb = __skb_dequeue(&rxq)) != NULL) {
>> @@ -918,7 +1068,8 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>>                                                    struct xenvif *vif,
>>                                                    struct sk_buff *skb,
>>                                                    struct xen_netif_tx_request *txp,
>> -                                                 struct gnttab_copy *gop)
>> +                                                 struct gnttab_copy *gop,
>> +                                                 struct xenvif **gtxvif)
>>   {
>>          struct skb_shared_info *shinfo = skb_shinfo(skb);
>>          skb_frag_t *frags = shinfo->frags;
>> @@ -944,7 +1095,11 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>>                  gop->source.domid = vif->domid;
>>                  gop->source.offset = txp->offset;
>>
>> -               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>> +               if (!vif->persistent_grant)
>> +                       gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>> +               else
>> +                       gop->dest.u.gmfn = (unsigned long)page_address(page);
>> +
>>                  gop->dest.domid = DOMID_SELF;
>>                  gop->dest.offset = txp->offset;
>>
>> @@ -952,6 +1107,9 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
>>                  gop->flags = GNTCOPY_source_gref;
>>
>>                  gop++;
>> +               *gtxvif = vif;
>> +               gtxvif++;
>> +
>>
>>                  memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
>>                  xenvif_get(vif);
>> @@ -1218,6 +1376,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
>>   static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>>   {
>>          struct gnttab_copy *gop = netbk->tx_copy_ops, *request_gop;
>> +       struct xenvif **gtxvif = netbk->gnttab_tx_vif;
>>          struct sk_buff *skb;
>>          int ret;
>>
>> @@ -1338,7 +1497,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>>                  gop->source.domid = vif->domid;
>>                  gop->source.offset = txreq.offset;
>>
>> -               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>> +               if (!vif->persistent_grant)
>> +                       gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>> +               else
>> +                       gop->dest.u.gmfn = (unsigned long)page_address(page);
>> +
>>                  gop->dest.domid = DOMID_SELF;
>>                  gop->dest.offset = txreq.offset;
>>
>> @@ -1346,6 +1509,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>>                  gop->flags = GNTCOPY_source_gref;
>>
>>                  gop++;
>> +               *gtxvif++ = vif;
>>
>>                  memcpy(&netbk->pending_tx_info[pending_idx].req,
>>                         &txreq, sizeof(txreq));
>> @@ -1369,12 +1533,13 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>>                  netbk->pending_cons++;
>>
>>                  request_gop = xen_netbk_get_requests(netbk, vif,
>> -                                                    skb, txfrags, gop);
>> +                                                    skb, txfrags, gop, gtxvif);
>>                  if (request_gop == NULL) {
>>                          kfree_skb(skb);
>>                          netbk_tx_err(vif,&txreq, idx);
>>                          continue;
>>                  }
>> +               gtxvif += request_gop - gop;
>>                  gop = request_gop;
>>
>>                  vif->tx.req_cons = idx;
>> @@ -1467,8 +1632,9 @@ static void xen_netbk_tx_action(struct xen_netbk *netbk)
>>
>>          if (nr_gops == 0)
>>                  return;
>> -       ret = HYPERVISOR_grant_table_op(GNTTABOP_copy,
>> -                                       netbk->tx_copy_ops, nr_gops);
>> +       ret = grant_memory_copy_op(GNTTABOP_copy,
>> +                                  netbk->tx_copy_ops, nr_gops,
>> +                                  netbk, true);
>>          BUG_ON(ret);
>>
>>          xen_netbk_tx_submit(netbk);
>> diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
>> index 410018c..938e908 100644
>> --- a/drivers/net/xen-netback/xenbus.c
>> +++ b/drivers/net/xen-netback/xenbus.c
>> @@ -114,6 +114,13 @@ static int netback_probe(struct xenbus_device *dev,
>>                          goto abort_transaction;
>>                  }
>>
>> +               err = xenbus_printf(xbt, dev->nodename,
>> +                                   "feature-persistent-grants", "%u", 1);
>> +               if (err) {
>> +                       message = "writing feature-persistent-grants";
>> +                       goto abort_transaction;
>> +               }
>> +
>>                  err = xenbus_transaction_end(xbt, 0);
>>          } while (err == -EAGAIN);
>>
>> @@ -453,7 +460,12 @@ static int connect_rings(struct backend_info *be)
>>                  val = 0;
>>          vif->csum = !val;
>>
>> -       /* Map the shared frame, irq etc. */
>> +       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-persistent-grants",
>> +                        "%u",&val)<  0)
> In block devices "feature-persistent" is used, so I think that for
> clearness it should be announced the same way in net.
Is it  "feature-persistent" ? I checked your RFC patch, the key is 
"feature-persistent-grants".

Thanks
Annie
>
>> +               val = 0;
>> +       vif->persistent_grant = !!val;
>> +
>> +/* Map the shared frame, irq etc. */
>>          err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref, evtchn);
>>          if (err) {
>>                  xenbus_dev_fatal(dev, err,
>> --
>> 1.7.3.4
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
>>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
annie.li@oracle.com - Nov. 16, 2012, 7:57 a.m.
On 2012-11-16 10:49, ANNIE LI wrote:
>
>
> On 2012-11-15 17:57, Roger Pau Monné wrote:
>>>
>>> @@ -453,7 +460,12 @@ static int connect_rings(struct backend_info *be)
>>>                  val = 0;
>>>          vif->csum = !val;
>>>
>>> -       /* Map the shared frame, irq etc. */
>>> +       if (xenbus_scanf(XBT_NIL, dev->otherend, 
>>> "feature-persistent-grants",
>>> +                        "%u",&val)<  0)
>> In block devices "feature-persistent" is used, so I think that for
>> clearness it should be announced the same way in net.
> Is it  "feature-persistent" ? I checked your RFC patch, the key is 
> "feature-persistent-grants".
>
>
My mistake.
In your v2 patch, it is "feature-persistent". I will change the code to
match blkback/blkfront.

Thanks
Annie
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ian Campbell - Nov. 16, 2012, 9:27 a.m.
On Fri, 2012-11-16 at 02:18 +0000, ANNIE LI wrote:
> In this patch,
> The maximum of memory overhead is about
> 
> (XEN_NETIF_TX_RING_SIZE+XEN_NETIF_RX_RING_SIZE)*PAGE_SIZE  (plus size of grant_ref_t and handle)
> which is about 512 PAGE_SIZE. Normally, without heavy network offload, this maximum can not be reached.
> 
> In next patch of splitting tx/rx pool, the maximum is about

"about" or just "is"?

>  (256+512)PAGE_SIZE.

IOW 3MB.

> >
> >> +
> >> +       return NULL;
> >> +}
> >> +
> >> @@ -1338,7 +1497,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
> >>                  gop->source.domid = vif->domid;
> >>                  gop->source.offset = txreq.offset;
> >>
> >> -               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> >> +               if (!vif->persistent_grant)
> >> +                       gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> >> +               else
> >> +                       gop->dest.u.gmfn = (unsigned long)page_address(page);
> > page_address doesn't return any sort of frame number, does it? This is
> > rather confusing...
> 
> Yes. I only use dest.u.gmfn element to save the page_address here for 
> future memcpy, and it does not mean to use frame number actually. To 
> avoid confusion, here I can use
> 
> gop->dest.u.gmfn = virt_to_mfn(page_address(page));
> 
> and then call mfn_to_virt when doing memcpy.

It seems a bit odd to be using the gop structure in this way when you
aren't actually doing a grant op on it. 

While investigating I noticed:
+static int
+grant_memory_copy_op(unsigned int cmd, void *vuop, unsigned int count,
+                    struct xen_netbk *netbk, bool tx_pool)
...
+       struct gnttab_copy *uop = vuop;

Why void *vuop? Why not struct gnttab_copy * in the parameter?

I also noticed your new grant_memory_copy_op() seems to have unbatched
the grant ops in the non-persistent case, which is going to suck for
performance in non-persistent mode. You need to pull the conditional and
the HYPERVISOR_grant_table_op outside the loop and pass it full array
instead of doing them one at a time.

Ian

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ian Campbell - Nov. 16, 2012, 9:32 a.m.
On Fri, 2012-11-16 at 02:49 +0000, ANNIE LI wrote:
> >
> > Take a look at the following functions from blkback; foreach_grant,
> > add_persistent_gnt and get_persistent_gnt. They are generic functions to
> > deal with persistent grants.
> 
> Ok, thanks.
> Or moving those functions into a separate common file?

Please put them somewhere common.

> > This is highly inefficient, one of the points of using gnttab_set_map_op
> > is that you can queue a bunch of grants, and then map them at the same
> > time using gnttab_map_refs, but here you are using it to map a single
> > grant at a time. You should instead see how much grants you need to map
> > to complete the request and map them all at the same time.
> 
> Yes, it is inefficient here. But this is limited by current netback
> implementation. Current netback is not per-VIF based(not like blkback
> does). After combining persistent grant and non persistent grant
> together, every vif request in the queue may/may not support persistent
> grant. I have to judge whether every vif in the queue supports
> persistent grant or not. If it support, memcpy is used, if not,
> grantcopy is used.

You could (and should) still batch all the grant copies into one
hypercall, e.g. walk the list either doing memcpy or queuing up copyops
as appropriate, then at the end if the queue is non-zero length issue
the hypercall.

I'd expect this lack of batching here and in the other case I just
spotted to have a detrimental affect on guests running with this patch
but not using persistent grants. Did you benchmark that case?

> After making netback per-VIF works, this issue can be fixed.

You've mentioned improvements which are conditional on this work a few
times I think, perhaps it makes sense to make that change first?

Ian.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
annie.li@oracle.com - Nov. 16, 2012, 9:55 a.m.
On 2012-11-16 17:27, Ian Campbell wrote:
> On Fri, 2012-11-16 at 02:18 +0000, ANNIE LI wrote:
>> In this patch,
>> The maximum of memory overhead is about
>>
>> (XEN_NETIF_TX_RING_SIZE+XEN_NETIF_RX_RING_SIZE)*PAGE_SIZE  (plus size of grant_ref_t and handle)
>> which is about 512 PAGE_SIZE. Normally, without heavy network offload, this maximum can not be reached.
>>
>> In next patch of splitting tx/rx pool, the maximum is about
> "about" or just "is"?

For only grant pages, it is this value. I took into account other 
element of grant_ref_t and map(change to handle in future)....

>
>>   (256+512)PAGE_SIZE.
> IOW 3MB.
>
>>>> +
>>>> +       return NULL;
>>>> +}
>>>> +
>>>> @@ -1338,7 +1497,11 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
>>>>                   gop->source.domid = vif->domid;
>>>>                   gop->source.offset = txreq.offset;
>>>>
>>>> -               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>>>> +               if (!vif->persistent_grant)
>>>> +                       gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>>>> +               else
>>>> +                       gop->dest.u.gmfn = (unsigned long)page_address(page);
>>> page_address doesn't return any sort of frame number, does it? This is
>>> rather confusing...
>> Yes. I only use dest.u.gmfn element to save the page_address here for
>> future memcpy, and it does not mean to use frame number actually. To
>> avoid confusion, here I can use
>>
>> gop->dest.u.gmfn = virt_to_mfn(page_address(page));
>>
>> and then call mfn_to_virt when doing memcpy.
> It seems a bit odd to be using the gop structure in this way when you
> aren't actually doing a grant op on it.
>
> While investigating I noticed:
> +static int
> +grant_memory_copy_op(unsigned int cmd, void *vuop, unsigned int count,
> +                    struct xen_netbk *netbk, bool tx_pool)
> ...
> +       struct gnttab_copy *uop = vuop;
>
> Why void *vuop? Why not struct gnttab_copy * in the parameter?

Sorry, my mistake.

>
> I also noticed your new grant_memory_copy_op() seems to have unbatched
> the grant ops in the non-persistent case, which is going to suck for
> performance in non-persistent mode. You need to pull the conditional and
> the HYPERVISOR_grant_table_op outside the loop and pass it full array
> instead of doing them one at a time.

This still connects with netback per-VIF implementation.
Currently, these cannot be pulled outside the loop, since the netback queue
may contain both persistent and non-persistent requests at the same time. I
did consider implementing per-VIF support first and then persistent grants,
but since the per-VIF work is part of Wei's patch series combined with other
patches, I decided to implement it later.

But this does limit implementation of persistent grant.

Thanks
Annie
>
> Ian
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
annie.li@oracle.com - Nov. 16, 2012, 11:34 a.m.
On 2012-11-16 17:32, Ian Campbell wrote:
> On Fri, 2012-11-16 at 02:49 +0000, ANNIE LI wrote:
>>> Take a look at the following functions from blkback; foreach_grant,
>>> add_persistent_gnt and get_persistent_gnt. They are generic functions to
>>> deal with persistent grants.
>> Ok, thanks.
>> Or moving those functions into a separate common file?
> Please put them somewhere common.

Ok.

>>> This is highly inefficient, one of the points of using gnttab_set_map_op
>>> is that you can queue a bunch of grants, and then map them at the same
>>> time using gnttab_map_refs, but here you are using it to map a single
>>> grant at a time. You should instead see how much grants you need to map
>>> to complete the request and map them all at the same time.
>> Yes, it is inefficient here. But this is limited by current netback
>> implementation. Current netback is not per-VIF based(not like blkback
>> does). After combining persistent grant and non persistent grant
>> together, every vif request in the queue may/may not support persistent
>> grant. I have to judge whether every vif in the queue supports
>> persistent grant or not. If it support, memcpy is used, if not,
>> grantcopy is used.
> You could (and should) still batch all the grant copies into one
> hypercall, e.g. walk the list either doing memcpy or queuing up copyops
> as appropriate, then at the end if the queue is non-zero length issue
> the hypercall.

This still connects with netback per-VIF implementation.

> I'd expect this lack of batching here and in the other case I just
> spotted to have a detrimental affect on guests running with this patch
> but not using persistent grants. Did you benchmark that case?

I did some test before.
But I'd better to create more detailed result under in different case.

>> After making netback per-VIF works, this issue can be fixed.
> You've mentioned improvements which are conditional on this work a few
> times I think, perhaps it makes sense to make that change first?

Yes, I did consider implementing per-VIF support first, before persistent
grants. But since it is part of Wei's patch series and combined with other
patches, I decided to implement it later. Making that change first would
indeed make things clearer.

Thanks
Annie
> Ian.
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message tomajordomo@vger.kernel.org
> More majordomo info athttp://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 94b79c3..a85cac6 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -45,8 +45,19 @@ 
 #include <xen/grant_table.h>
 #include <xen/xenbus.h>
 
+#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
+#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
+#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+			(XEN_NETIF_TX_RING_SIZE + XEN_NETIF_RX_RING_SIZE)
+
 struct xen_netbk;
 
+struct persistent_entry {
+	grant_ref_t forgranted;
+	struct page *fpage;
+	struct gnttab_map_grant_ref map;
+};
+
 struct xenvif {
 	/* Unique identifier for this interface. */
 	domid_t          domid;
@@ -75,6 +86,7 @@  struct xenvif {
 
 	/* Internal feature information. */
 	u8 can_queue:1;	    /* can queue packets for receiver? */
+	u8 persistent_grant:1;
 
 	/*
 	 * Allow xenvif_start_xmit() to peek ahead in the rx request
@@ -98,6 +110,9 @@  struct xenvif {
 	struct net_device *dev;
 
 	wait_queue_head_t waiting_to_free;
+
+	struct persistent_entry *persistent_gnt[MAXIMUM_OUTSTANDING_BLOCK_REQS];
+	unsigned int persistent_gntcnt;
 };
 
 static inline struct xenbus_device *xenvif_to_xenbus_device(struct xenvif *vif)
@@ -105,9 +120,6 @@  static inline struct xenbus_device *xenvif_to_xenbus_device(struct xenvif *vif)
 	return to_xenbus_device(vif->dev->dev.parent);
 }
 
-#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
-#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
-
 struct xenvif *xenvif_alloc(struct device *parent,
 			    domid_t domid,
 			    unsigned int handle);
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index b7d41f8..226d159 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -300,6 +300,8 @@  struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 		return ERR_PTR(err);
 	}
 
+	vif->persistent_gntcnt = 0;
+
 	netdev_dbg(dev, "Successfully created xenvif\n");
 	return vif;
 }
@@ -343,6 +345,23 @@  err:
 	return err;
 }
 
+void xenvif_free_grants(struct persistent_entry **pers_entry,
+			unsigned int count)
+{
+	int i, ret;
+	struct gnttab_unmap_grant_ref unmap;
+
+	for (i = 0; i < count; i++) {
+		gnttab_set_unmap_op(&unmap,
+			(unsigned long)page_to_pfn(pers_entry[i]->fpage),
+			GNTMAP_host_map,
+			pers_entry[i]->map.handle);
+		ret = gnttab_unmap_refs(&unmap, &pers_entry[i]->fpage,
+					1, false);
+		BUG_ON(ret);
+	}
+}
+
 void xenvif_disconnect(struct xenvif *vif)
 {
 	struct net_device *dev = vif->dev;
@@ -366,6 +385,9 @@  void xenvif_disconnect(struct xenvif *vif)
 	unregister_netdev(vif->dev);
 
 	xen_netbk_unmap_frontend_rings(vif);
+	if (vif->persistent_grant)
+		xenvif_free_grants(vif->persistent_gnt,
+				   vif->persistent_gntcnt);
 
 	free_netdev(vif->dev);
 }
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 2596401..a26d3fc 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -80,6 +80,8 @@  union page_ext {
 	void *mapping;
 };
 
+struct xenvif;
+
 struct xen_netbk {
 	wait_queue_head_t wq;
 	struct task_struct *task;
@@ -102,6 +104,7 @@  struct xen_netbk {
 
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
 	struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
+	struct xenvif *gnttab_tx_vif[MAX_PENDING_REQS];
 
 	u16 pending_ring[MAX_PENDING_REQS];
 
@@ -111,12 +114,139 @@  struct xen_netbk {
 	 * straddles two buffers in the frontend.
 	 */
 	struct gnttab_copy grant_copy_op[2*XEN_NETIF_RX_RING_SIZE];
+	struct xenvif *gnttab_rx_vif[2*XEN_NETIF_RX_RING_SIZE];
 	struct netbk_rx_meta meta[2*XEN_NETIF_RX_RING_SIZE];
 };
 
 static struct xen_netbk *xen_netbk;
 static int xen_netbk_group_nr;
 
+static struct persistent_entry*
+get_per_gnt(struct persistent_entry **pers_entry,
+	    unsigned int count, grant_ref_t gref)
+{
+	int i;
+
+	for (i = 0; i < count; i++)
+		if (gref == pers_entry[i]->forgranted)
+			return pers_entry[i];
+
+	return NULL;
+}
+
+static void*
+map_new_gnt(struct persistent_entry **pers_entry, unsigned int count,
+	    grant_ref_t ref, domid_t domid)
+{
+	struct gnttab_map_grant_ref *map;
+	struct page *page;
+	unsigned long vaddr;
+	unsigned long pfn;
+	uint32_t flags;
+	int ret = 0;
+
+	pers_entry[count] = (struct persistent_entry *)
+			    kmalloc(sizeof(struct persistent_entry),
+				    GFP_KERNEL);
+	if (!pers_entry[count])
+		return ERR_PTR(-ENOMEM);
+
+	map = &pers_entry[count]->map;
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		kfree(pers_entry[count]);
+		pers_entry[count] = NULL;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pers_entry[count]->fpage = page;
+	pfn = page_to_pfn(page);
+	vaddr = (unsigned long)pfn_to_kaddr(pfn);
+	flags = GNTMAP_host_map;
+
+	gnttab_set_map_op(map, vaddr, flags, ref, domid);
+	ret = gnttab_map_refs(map, NULL, &page, 1);
+	BUG_ON(ret);
+
+	pers_entry[count]->forgranted = ref;
+
+	return page_address(page);
+}
+
+static void*
+get_ref_page(struct persistent_entry **pers_entry, unsigned int *count,
+	     grant_ref_t ref, domid_t domid, unsigned int total)
+{
+	struct persistent_entry *per_gnt;
+	void *vaddr;
+
+	per_gnt = get_per_gnt(pers_entry, *count, ref);
+
+	if (per_gnt != NULL)
+		return page_address(per_gnt->fpage);
+	else {
+		BUG_ON(*count >= total);
+		vaddr = map_new_gnt(pers_entry, *count, ref, domid);
+		if (IS_ERR_OR_NULL(vaddr))
+			return vaddr;
+		*count += 1;
+		return vaddr;
+	}
+}
+
+static int
+grant_memory_copy_op(unsigned int cmd, void *vuop, unsigned int count,
+		     struct xen_netbk *netbk, bool tx_pool)
+{
+	int i;
+	struct xenvif *vif;
+	struct gnttab_copy *uop = vuop;
+	unsigned int *gnt_count;
+	unsigned int gnt_total;
+	struct persistent_entry **pers_entry;
+	int ret = 0;
+
+	BUG_ON(cmd != GNTTABOP_copy);
+	for (i = 0; i < count; i++) {
+		if (tx_pool)
+			vif = netbk->gnttab_tx_vif[i];
+		else
+			vif = netbk->gnttab_rx_vif[i];
+
+		pers_entry = vif->persistent_gnt;
+		gnt_count = &vif->persistent_gntcnt;
+		gnt_total = MAXIMUM_OUTSTANDING_BLOCK_REQS;
+
+		if (vif->persistent_grant) {
+			void *saddr, *daddr;
+
+			saddr = uop[i].source.domid == DOMID_SELF ?
+				(void *) uop[i].source.u.gmfn :
+				get_ref_page(pers_entry, gnt_count,
+					     uop[i].source.u.ref,
+					     uop[i].source.domid,
+					     gnt_total);
+			if (IS_ERR_OR_NULL(saddr))
+				return -ENOMEM;
+
+			daddr = uop[i].dest.domid == DOMID_SELF ?
+				(void *) uop[i].dest.u.gmfn :
+				get_ref_page(pers_entry, gnt_count,
+					     uop[i].dest.u.ref,
+					     uop[i].dest.domid,
+					     gnt_total);
+			if (IS_ERR_OR_NULL(daddr))
+				return -ENOMEM;
+
+			memcpy(daddr+uop[i].dest.offset,
+			       saddr+uop[i].source.offset, uop[i].len);
+		} else
+			ret = HYPERVISOR_grant_table_op(cmd, &uop[i], 1);
+	}
+
+	return ret;
+}
+
 void xen_netbk_add_xenvif(struct xenvif *vif)
 {
 	int i;
@@ -387,13 +517,15 @@  static struct netbk_rx_meta *get_next_rx_buffer(struct xenvif *vif,
  * Set up the grant operations for this fragment. If it's a flipping
  * interface, we also set up the unmap request from here.
  */
-static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
-				struct netrx_pending_operations *npo,
-				struct page *page, unsigned long size,
-				unsigned long offset, int *head)
+static int netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
+			       struct netrx_pending_operations *npo,
+			       struct page *page, unsigned long size,
+			       unsigned long offset, int *head,
+			       struct xenvif **grxvif)
 {
 	struct gnttab_copy *copy_gop;
 	struct netbk_rx_meta *meta;
+	int count = 0;
 	/*
 	 * These variables are used iff get_page_ext returns true,
 	 * in which case they are guaranteed to be initialized.
@@ -425,6 +557,10 @@  static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
 			bytes = MAX_BUFFER_OFFSET - npo->copy_off;
 
 		copy_gop = npo->copy + npo->copy_prod++;
+		*grxvif = vif;
+		grxvif++;
+		count++;
+
 		copy_gop->flags = GNTCOPY_dest_gref;
 		if (foreign) {
 			struct xen_netbk *netbk = &xen_netbk[group];
@@ -438,7 +574,10 @@  static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
 		} else {
 			void *vaddr = page_address(page);
 			copy_gop->source.domid = DOMID_SELF;
-			copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
+			if (!vif->persistent_grant)
+				copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
+			else
+				copy_gop->source.u.gmfn = (unsigned long)vaddr;
 		}
 		copy_gop->source.offset = offset;
 		copy_gop->dest.domid = vif->domid;
@@ -460,6 +599,7 @@  static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
 		*head = 0; /* There must be something in this buffer now. */
 
 	}
+	return count;
 }
 
 /*
@@ -474,8 +614,10 @@  static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
  * zero GSO descriptors (for non-GSO packets) or one descriptor (for
  * frontend-side LRO).
  */
-static int netbk_gop_skb(struct sk_buff *skb,
-			 struct netrx_pending_operations *npo)
+static int netbk_gop_skb(struct xen_netbk *netbk,
+			 struct sk_buff *skb,
+			 struct netrx_pending_operations *npo,
+			 struct xenvif **grxvif)
 {
 	struct xenvif *vif = netdev_priv(skb->dev);
 	int nr_frags = skb_shinfo(skb)->nr_frags;
@@ -518,17 +660,19 @@  static int netbk_gop_skb(struct sk_buff *skb,
 		if (data + len > skb_tail_pointer(skb))
 			len = skb_tail_pointer(skb) - data;
 
-		netbk_gop_frag_copy(vif, skb, npo,
-				    virt_to_page(data), len, offset, &head);
+		grxvif += netbk_gop_frag_copy(vif, skb, npo,
+					      virt_to_page(data), len,
+					      offset, &head, grxvif);
+
 		data += len;
 	}
 
 	for (i = 0; i < nr_frags; i++) {
-		netbk_gop_frag_copy(vif, skb, npo,
-				    skb_frag_page(&skb_shinfo(skb)->frags[i]),
-				    skb_frag_size(&skb_shinfo(skb)->frags[i]),
-				    skb_shinfo(skb)->frags[i].page_offset,
-				    &head);
+		grxvif += netbk_gop_frag_copy(vif, skb, npo,
+				skb_frag_page(&skb_shinfo(skb)->frags[i]),
+				skb_frag_size(&skb_shinfo(skb)->frags[i]),
+				skb_shinfo(skb)->frags[i].page_offset,
+				&head, grxvif);
 	}
 
 	return npo->meta_prod - old_meta_prod;
@@ -593,6 +737,8 @@  struct skb_cb_overlay {
 static void xen_netbk_rx_action(struct xen_netbk *netbk)
 {
 	struct xenvif *vif = NULL, *tmp;
+	struct xenvif **grxvif = netbk->gnttab_rx_vif;
+	int old_copy_prod = 0;
 	s8 status;
 	u16 irq, flags;
 	struct xen_netif_rx_response *resp;
@@ -619,7 +765,9 @@  static void xen_netbk_rx_action(struct xen_netbk *netbk)
 		nr_frags = skb_shinfo(skb)->nr_frags;
 
 		sco = (struct skb_cb_overlay *)skb->cb;
-		sco->meta_slots_used = netbk_gop_skb(skb, &npo);
+		sco->meta_slots_used = netbk_gop_skb(netbk, skb, &npo, grxvif);
+		grxvif += npo.copy_prod - old_copy_prod;
+		old_copy_prod = npo.copy_prod;
 
 		count += nr_frags + 1;
 
@@ -636,8 +784,10 @@  static void xen_netbk_rx_action(struct xen_netbk *netbk)
 		return;
 
 	BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
-	ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op,
-					npo.copy_prod);
+	ret = grant_memory_copy_op(GNTTABOP_copy,
+				   &netbk->grant_copy_op,
+				   npo.copy_prod, netbk,
+				   false);
 	BUG_ON(ret != 0);
 
 	while ((skb = __skb_dequeue(&rxq)) != NULL) {
@@ -918,7 +1068,8 @@  static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
 						  struct xenvif *vif,
 						  struct sk_buff *skb,
 						  struct xen_netif_tx_request *txp,
-						  struct gnttab_copy *gop)
+						  struct gnttab_copy *gop,
+						  struct xenvif **gtxvif)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
@@ -944,7 +1095,11 @@  static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
 		gop->source.domid = vif->domid;
 		gop->source.offset = txp->offset;
 
-		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+		if (!vif->persistent_grant)
+			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+		else
+			gop->dest.u.gmfn = (unsigned long)page_address(page);
+
 		gop->dest.domid = DOMID_SELF;
 		gop->dest.offset = txp->offset;
 
@@ -952,6 +1107,9 @@  static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
 		gop->flags = GNTCOPY_source_gref;
 
 		gop++;
+		*gtxvif = vif;
+		gtxvif++;
+
 
 		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
 		xenvif_get(vif);
@@ -1218,6 +1376,7 @@  static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
 static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 {
 	struct gnttab_copy *gop = netbk->tx_copy_ops, *request_gop;
+	struct xenvif **gtxvif = netbk->gnttab_tx_vif;
 	struct sk_buff *skb;
 	int ret;
 
@@ -1338,7 +1497,11 @@  static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		gop->source.domid = vif->domid;
 		gop->source.offset = txreq.offset;
 
-		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+		if (!vif->persistent_grant)
+			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+		else
+			gop->dest.u.gmfn = (unsigned long)page_address(page);
+
 		gop->dest.domid = DOMID_SELF;
 		gop->dest.offset = txreq.offset;
 
@@ -1346,6 +1509,7 @@  static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		gop->flags = GNTCOPY_source_gref;
 
 		gop++;
+		*gtxvif++ = vif;
 
 		memcpy(&netbk->pending_tx_info[pending_idx].req,
 		       &txreq, sizeof(txreq));
@@ -1369,12 +1533,13 @@  static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		netbk->pending_cons++;
 
 		request_gop = xen_netbk_get_requests(netbk, vif,
-						     skb, txfrags, gop);
+						     skb, txfrags, gop, gtxvif);
 		if (request_gop == NULL) {
 			kfree_skb(skb);
 			netbk_tx_err(vif, &txreq, idx);
 			continue;
 		}
+		gtxvif += request_gop - gop;
 		gop = request_gop;
 
 		vif->tx.req_cons = idx;
@@ -1467,8 +1632,9 @@  static void xen_netbk_tx_action(struct xen_netbk *netbk)
 
 	if (nr_gops == 0)
 		return;
-	ret = HYPERVISOR_grant_table_op(GNTTABOP_copy,
-					netbk->tx_copy_ops, nr_gops);
+	ret = grant_memory_copy_op(GNTTABOP_copy,
+				   netbk->tx_copy_ops, nr_gops,
+				   netbk, true);
 	BUG_ON(ret);
 
 	xen_netbk_tx_submit(netbk);
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 410018c..938e908 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -114,6 +114,13 @@  static int netback_probe(struct xenbus_device *dev,
 			goto abort_transaction;
 		}
 
+		err = xenbus_printf(xbt, dev->nodename,
+				    "feature-persistent-grants", "%u", 1);
+		if (err) {
+			message = "writing feature-persistent-grants";
+			goto abort_transaction;
+		}
+
 		err = xenbus_transaction_end(xbt, 0);
 	} while (err == -EAGAIN);
 
@@ -453,7 +460,12 @@  static int connect_rings(struct backend_info *be)
 		val = 0;
 	vif->csum = !val;
 
-	/* Map the shared frame, irq etc. */
+	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-persistent-grants",
+			 "%u", &val) < 0)
+		val = 0;
+	vif->persistent_grant = !!val;
+
+/* Map the shared frame, irq etc. */
 	err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref, evtchn);
 	if (err) {
 		xenbus_dev_fatal(dev, err,