[net-next,v5,2/9] xen-netback: Change TX path from grant copy to mapping

Message ID 1390253069-25507-3-git-send-email-zoltan.kiss@citrix.com
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Zoltan Kiss Jan. 20, 2014, 9:24 p.m. UTC
This patch changes the grant copy on the TX path to grant mapping.

v2:
- delete the branch for handling fragmented packets that fit into a
  PKT_PROT_LEN sized first request
- mark the effect of using ballooned pages in a comment
- place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
  before netif_receive_skb, and mark the importance of it
- grab dealloc_lock before __napi_complete to avoid contention with the
  callback's napi_schedule
- handle fragmented packets where first request < PKT_PROT_LEN
- fix up error path when checksum_setup failed
- check for pending grants before teardown, and start complaining if they
  are still there after 10 seconds

v3:
- delete a surplus check from tx_action
- remove stray line
- squash xenvif_idx_unmap changes into the first patch
- init spinlocks
- call map hypercall directly instead of gnttab_map_refs()
- fix unmapping timeout in xenvif_free()

v4:
- fix indentations and comments
- handle errors of set_phys_to_machine
- go back to gnttab_map_refs instead of direct hypercall. Now we rely on the
  modified API

v5:
- BUG_ON(vif->dealloc_task) in xenvif_connect
- use 'task' in xenvif_connect for thread creation
- proper return value if alloc_xenballooned_pages fails
- BUG in xenvif_tx_check_gop if stale handle found

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/interface.c |   63 ++++++++-
 drivers/net/xen-netback/netback.c   |  254 ++++++++++++++---------------------
 2 files changed, 160 insertions(+), 157 deletions(-)


Comments

Ian Campbell Feb. 18, 2014, 5:40 p.m. UTC | #1
On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> This patch changes the grant copy on the TX path to grant mapping.

Both this and the previous patch had a single sentence commit message (I
count them together since they are split weirdly and are a single
logical change to my eyes).

Really a change of this magnitude deserves a commit message to match,
e.g. explaining the approach which is taken by the code at a high level,
what it is doing, how it is doing it, the rationale for using a kthread
etc etc.

> 
> v2:
> - delete the branch for handling fragmented packets that fit into a
>   PKT_PROT_LEN sized first request
> - mark the effect of using ballooned pages in a comment
> - place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
>   before netif_receive_skb, and mark the importance of it
> - grab dealloc_lock before __napi_complete to avoid contention with the
>   callback's napi_schedule
> - handle fragmented packets where first request < PKT_PROT_LEN
> - fix up error path when checksum_setup failed
> - check for pending grants before teardown, and start complaining if they
>   are still there after 10 seconds
> 
> v3:
> - delete a surplus check from tx_action
> - remove stray line
> - squash xenvif_idx_unmap changes into the first patch
> - init spinlocks
> - call map hypercall directly instead of gnttab_map_refs()
> - fix unmapping timeout in xenvif_free()
> 
> v4:
> - fix indentations and comments
> - handle errors of set_phys_to_machine
> - go back to gnttab_map_refs instead of direct hypercall. Now we rely on the
>   modified API
> 
> v5:
> - BUG_ON(vif->dealloc_task) in xenvif_connect
> - use 'task' in xenvif_connect for thread creation
> - proper return value if alloc_xenballooned_pages fails
> - BUG in xenvif_tx_check_gop if stale handle found
> 
> Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
> ---
>  drivers/net/xen-netback/interface.c |   63 ++++++++-
>  drivers/net/xen-netback/netback.c   |  254 ++++++++++++++---------------------
>  2 files changed, 160 insertions(+), 157 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index f0f0c3d..b3daae2 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -122,7 +122,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	BUG_ON(skb->dev != dev);
>  
>  	/* Drop the packet if vif is not ready */
> -	if (vif->task == NULL || !xenvif_schedulable(vif))
> +	if (vif->task == NULL ||
> +	    vif->dealloc_task == NULL ||

Under what conditions could this be true? Would it not represent a
rather serious failure?

> +	    !xenvif_schedulable(vif))
>  		goto drop;
>  
>  	/* At best we'll need one slot for the header and one for each
> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>  	vif->pending_prod = MAX_PENDING_REQS;
>  	for (i = 0; i < MAX_PENDING_REQS; i++)
>  		vif->pending_ring[i] = i;
> -	for (i = 0; i < MAX_PENDING_REQS; i++)
> -		vif->mmap_pages[i] = NULL;
> +	spin_lock_init(&vif->dealloc_lock);
> +	spin_lock_init(&vif->response_lock);
> +	/* If ballooning is disabled, this will consume real memory, so you
> +	 * better enable it.

Almost no one who would be affected by this is going to read this
comment. And it doesn't just require enabling ballooning, but actually
booting with some maxmem "slack" to leave space.

Classic-xen kernels used to add 8M of slop to the physical address space
to leave a suitable pool for exactly this sort of thing. I never liked
that but perhaps it should be reconsidered (or at least raised as a
possibility with the core-Xen Linux guys).

>  The long term solution would be to use just a
> +	 * bunch of valid page descriptors, without dependency on ballooning

Where would these come from? Do you have a cunning plan here?

> +	 */
> +	err = alloc_xenballooned_pages(MAX_PENDING_REQS,
> +				       vif->mmap_pages,
> +				       false);
> +	if (err) {
> +		netdev_err(dev, "Could not reserve mmap_pages\n");
> +		return ERR_PTR(-ENOMEM);
> +	}
> +	for (i = 0; i < MAX_PENDING_REQS; i++) {
> +		vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
> +			{ .callback = xenvif_zerocopy_callback,
> +			  .ctx = NULL,
> +			  .desc = i };
> +		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
> +	}
>  
>  	/*
>  	 * Initialise a dummy MAC address. We choose the numerically
> @@ -383,12 +403,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
>  
>  	BUG_ON(vif->tx_irq);
>  	BUG_ON(vif->task);
> +	BUG_ON(vif->dealloc_task);
>  
>  	err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref);
>  	if (err < 0)
>  		goto err;
>  
>  	init_waitqueue_head(&vif->wq);
> +	init_waitqueue_head(&vif->dealloc_wq);
>  
>  	if (tx_evtchn == rx_evtchn) {
>  		/* feature-split-event-channels == 0 */
> @@ -432,6 +454,18 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
>  
>  	vif->task = task;
>  
> +	task = kthread_create(xenvif_dealloc_kthread,
> +					   (void *)vif,
> +					   "%s-dealloc",
> +					   vif->dev->name);

This is separate to the existing kthread that handles rx stuff. If they
cannot or should not be combined then I think the existing one needs
renaming, both the function and the thread itself in a precursor patch.

> @@ -494,6 +534,23 @@ void xenvif_disconnect(struct xenvif *vif)
>  
>  void xenvif_free(struct xenvif *vif)
>  {
> +	int i, unmap_timeout = 0;
> +
> +	for (i = 0; i < MAX_PENDING_REQS; ++i) {
> +		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> +			unmap_timeout++;
> +			schedule_timeout(msecs_to_jiffies(1000));

What are we waiting for here? Have we taken any action to ensure that it
is going to happen, like kicking something?

> +			if (unmap_timeout > 9 &&

Why 9? Why not rely on net_ratelimit to DTRT? Or is it normal for this
to fail at least once?

> +			    net_ratelimit())
> +				netdev_err(vif->dev,

I thought there was a ratelimited netdev printk which combined the
limiting and the printing in one function call. Maybe I am mistaken.

> +					   "Page still granted! Index: %x\n",
> +					   i);
> +			i = -1;
> +		}
> +	}
> +
> +	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
> +
>  	netif_napi_del(&vif->napi);
>  
>  	unregister_netdev(vif->dev);
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 195602f..747b428 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -646,9 +646,12 @@ static void xenvif_tx_err(struct xenvif *vif,
>  			  struct xen_netif_tx_request *txp, RING_IDX end)
>  {
>  	RING_IDX cons = vif->tx.req_cons;
> +	unsigned long flags;
>  
>  	do {
> +		spin_lock_irqsave(&vif->response_lock, flags);

Looking at the callers you have added it would seem more natural to
handle the locking within make_tx_response itself.

What are you locking against here? Is this different to the dealloc
lock? If the concern is the rx action stuff and the dealloc stuff
conflicting perhaps a single vif lock would make sense?

>  		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
> +		spin_unlock_irqrestore(&vif->response_lock, flags);
>  		if (cons == end)
>  			break;
>  		txp = RING_GET_REQUEST(&vif->tx, cons++);
> @@ -787,10 +790,10 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,
>  	       sizeof(*txp));
>  }
>  
> -static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
> -					       struct sk_buff *skb,
> -					       struct xen_netif_tx_request *txp,
> -					       struct gnttab_copy *gop)
> +static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
> +							struct sk_buff *skb,
> +							struct xen_netif_tx_request *txp,
> +							struct gnttab_map_grant_ref *gop)
>  {
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
>  	skb_frag_t *frags = shinfo->frags;
> @@ -909,9 +841,9 @@ err:
>  
>  static int xenvif_tx_check_gop(struct xenvif *vif,
>  			       struct sk_buff *skb,
> -			       struct gnttab_copy **gopp)
> +			       struct gnttab_map_grant_ref **gopp)
>  {
> -	struct gnttab_copy *gop = *gopp;
> +	struct gnttab_map_grant_ref *gop = *gopp;
>  	u16 pending_idx = *((u16 *)skb->data);
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
>  	struct pending_tx_info *tx_info;
> @@ -923,6 +855,17 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>  	err = gop->status;
>  	if (unlikely(err))
>  		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
> +	else {
> +		if (vif->grant_tx_handle[pending_idx] !=
> +		    NETBACK_INVALID_HANDLE) {
> +			netdev_err(vif->dev,
> +				   "Stale mapped handle! pending_idx %x handle %x\n",
> +				   pending_idx,
> +				   vif->grant_tx_handle[pending_idx]);
> +			BUG();
> +		}
> +		vif->grant_tx_handle[pending_idx] = gop->handle;
> +	}
>  
>  	/* Skip first skb fragment if it is on same page as header fragment. */
>  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> @@ -936,18 +879,24 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>  		head = tx_info->head;
>  
>  		/* Check error status: if okay then remember grant handle. */
> -		do {
>  			newerr = (++gop)->status;
> -			if (newerr)
> -				break;
> -			peek = vif->pending_ring[pending_index(++head)];
> -		} while (!pending_tx_is_head(vif, peek));
>  
>  		if (likely(!newerr)) {
> +			if (vif->grant_tx_handle[pending_idx] !=
> +			    NETBACK_INVALID_HANDLE) {
> +				netdev_err(vif->dev,
> +					   "Stale mapped handle! pending_idx %x handle %x\n",
> +					   pending_idx,
> +					   vif->grant_tx_handle[pending_idx]);
> +				BUG();
> +			}

You had the same thing earlier. Perhaps a helper function would be
useful?

> +			vif->grant_tx_handle[pending_idx] = gop->handle;
>  			/* Had a previous error? Invalidate this fragment. */
> -			if (unlikely(err))
> +			if (unlikely(err)) {
> +				xenvif_idx_unmap(vif, pending_idx);
>  				xenvif_idx_release(vif, pending_idx,
>  						   XEN_NETIF_RSP_OKAY);

Would it make sense to unmap and release in a single function? (I
haven't looked to see if you ever do one without the other, but the next
page of diff had two more occurrences of them together.)

> +			}
>  			continue;
>  		}
>  
> @@ -960,9 +909,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>  
>  		/* First error: invalidate header and preceding fragments. */
>  		pending_idx = *((u16 *)skb->data);
> +		xenvif_idx_unmap(vif, pending_idx);
>  		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
>  		for (j = start; j < i; j++) {
>  			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
> +			xenvif_idx_unmap(vif, pending_idx);
>  			xenvif_idx_release(vif, pending_idx,
>  					   XEN_NETIF_RSP_OKAY);
>  		}

>  	}
> +	/* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
> +	 * overlaps with "index", and "mapping" is not set. I think mapping
> +	 * should be set. If delivered to local stack, it would drop this
> +	 * skb in sk_filter unless the socket has the right to use it.

What is the plan to fix this?

Is this dropping not a significant issue? (TBH I'm not sure what "has the
right to use it" would entail.)

> +	 */
> +	skb->pfmemalloc	= false;
>  }
>  
>  static int xenvif_get_extras(struct xenvif *vif,
> @@ -1372,7 +1341,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)

> @@ -1581,7 +1535,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
>  		else if (txp->flags & XEN_NETTXF_data_validated)
>  			skb->ip_summed = CHECKSUM_UNNECESSARY;
>  
> -		xenvif_fill_frags(vif, skb);
> +		xenvif_fill_frags(vif,
> +				  skb,
> +				  skb_shinfo(skb)->destructor_arg ?
> +				  pending_idx :
> +				  INVALID_PENDING_IDX

Couldn't xenvif_fill_frags calculate the 3rd argument itself, given that
it has skb in hand?

> );
>  
>  		if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
>  			int target = min_t(int, skb->len, PKT_PROT_LEN);
> @@ -1595,6 +1553,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
>  		if (checksum_setup(vif, skb)) {
>  			netdev_dbg(vif->dev,
>  				   "Can't setup checksum in net_tx_action\n");
> +			/* We have to set this flag so the dealloc thread can
> +			 * send the slots back

Wouldn't it be more accurate to say that we need it so that the callback
happens (which we then use to trigger the dealloc thread)?

> +			 */
> +			if (skb_shinfo(skb)->destructor_arg)
> +				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>  			kfree_skb(skb);
>  			continue;
>  		}
> @@ -1620,6 +1583,14 @@ static int xenvif_tx_submit(struct xenvif *vif)
>  
>  		work_done++;
>  
> +		/* Set this flag right before netif_receive_skb, otherwise
> +		 * someone might think this packet already left netback, and
> +		 * do a skb_copy_ubufs while we are still in control of the
> +		 * skb. E.g. the __pskb_pull_tail earlier can do such thing.

Hrm, subtle.

Ian.

David Vrabel Feb. 18, 2014, 6:46 p.m. UTC | #2
On 18/02/14 17:40, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>> 
>> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>>  	vif->pending_prod = MAX_PENDING_REQS;
>>  	for (i = 0; i < MAX_PENDING_REQS; i++)
>>  		vif->pending_ring[i] = i;
>> -	for (i = 0; i < MAX_PENDING_REQS; i++)
>> -		vif->mmap_pages[i] = NULL;
>> +	spin_lock_init(&vif->dealloc_lock);
>> +	spin_lock_init(&vif->response_lock);
>> +	/* If ballooning is disabled, this will consume real memory, so you
>> +	 * better enable it.
> 
> Almost no one who would be affected by this is going to read this
> comment. And it doesn't just require enabling ballooning, but actually
> booting with some maxmem "slack" to leave space.
> 
> Classic-xen kernels used to add 8M of slop to the physical address space
> to leave a suitable pool for exactly this sort of thing. I never liked
> that but perhaps it should be reconsidered (or at least raised as a
> possibility with the core-Xen Linux guys).

I plan to fix the balloon memory hotplug stuff to do the right thing
(it's almost there -- it just tries to overlap the new memory with
existing stuff).

David
Ian Campbell Feb. 19, 2014, 9:54 a.m. UTC | #3
On Tue, 2014-02-18 at 18:46 +0000, David Vrabel wrote:
> On 18/02/14 17:40, Ian Campbell wrote:
> > On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> >> 
> >> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
> >>  	vif->pending_prod = MAX_PENDING_REQS;
> >>  	for (i = 0; i < MAX_PENDING_REQS; i++)
> >>  		vif->pending_ring[i] = i;
> >> -	for (i = 0; i < MAX_PENDING_REQS; i++)
> >> -		vif->mmap_pages[i] = NULL;
> >> +	spin_lock_init(&vif->dealloc_lock);
> >> +	spin_lock_init(&vif->response_lock);
> >> +	/* If ballooning is disabled, this will consume real memory, so you
> >> +	 * better enable it.
> > 
> > Almost no one who would be affected by this is going to read this
> > comment. And it doesn't just require enabling ballooning, but actually
> > booting with some maxmem "slack" to leave space.
> > 
> > Classic-xen kernels used to add 8M of slop to the physical address space
> > to leave a suitable pool for exactly this sort of thing. I never liked
> > that but perhaps it should be reconsidered (or at least raised as a
> > possibility with the core-Xen Linux guys).
> 
> I plan to fix the balloon memory hotplug stuff to do the right thing

Which is for alloc_xenballoon_pages to hotplug a new empty region,
rather than inflating the balloon if it doesn't have enough pages to
satisfy the allocation? Or something else?

> (it's almost there -- it just tries to overlap the new memory with
> existing stuff).
> 
> David


David Vrabel Feb. 19, 2014, 12:27 p.m. UTC | #4
On 19/02/14 09:54, Ian Campbell wrote:
> On Tue, 2014-02-18 at 18:46 +0000, David Vrabel wrote:
>> On 18/02/14 17:40, Ian Campbell wrote:
>>> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>>>
>>>> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>>>>  	vif->pending_prod = MAX_PENDING_REQS;
>>>>  	for (i = 0; i < MAX_PENDING_REQS; i++)
>>>>  		vif->pending_ring[i] = i;
>>>> -	for (i = 0; i < MAX_PENDING_REQS; i++)
>>>> -		vif->mmap_pages[i] = NULL;
>>>> +	spin_lock_init(&vif->dealloc_lock);
>>>> +	spin_lock_init(&vif->response_lock);
>>>> +	/* If ballooning is disabled, this will consume real memory, so you
>>>> +	 * better enable it.
>>>
>>> Almost no one who would be affected by this is going to read this
>>> comment. And it doesn't just require enabling ballooning, but actually
>>> booting with some maxmem "slack" to leave space.
>>>
>>> Classic-xen kernels used to add 8M of slop to the physical address space
>>> to leave a suitable pool for exactly this sort of thing. I never liked
>>> that but perhaps it should be reconsidered (or at least raised as a
>>> possibility with the core-Xen Linux guys).
>>
>> I plan to fix the balloon memory hotplug stuff to do the right thing
> 
> Which is for alloc_xenballoon_pages to hotplug a new empty region,
> rather than inflating the balloon if it doesn't have enough pages to
> satisfy the allocation?

Yes.

David
Zoltan Kiss Feb. 22, 2014, 10:33 p.m. UTC | #5
On 18/02/14 17:40, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>> This patch changes the grant copy on the TX path to grant mapping.
>
> Both this and the previous patch had a single sentence commit message (I
> count them together since they are split weirdly and are a single
> logical change to my eyes).
>
> Really a change of this magnitude deserves a commit message to match,
> e.g. explaining the approach which is taken by the code at a high level,
> what it is doing, how it is doing it, the rationale for using a kthread
> etc etc.
OK, I'll improve that.

>> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
>> index f0f0c3d..b3daae2 100644
>> --- a/drivers/net/xen-netback/interface.c
>> +++ b/drivers/net/xen-netback/interface.c
>> @@ -122,7 +122,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
>>   	BUG_ON(skb->dev != dev);
>>
>>   	/* Drop the packet if vif is not ready */
>> -	if (vif->task == NULL || !xenvif_schedulable(vif))
>> +	if (vif->task == NULL ||
>> +	    vif->dealloc_task == NULL ||
>
> Under what conditions could this be true? Would it not represent a
> rather serious failure?
xenvif_start_xmit can run after xenvif_open, while the threads are only 
created when the ring connects. I haven't checked under what 
circumstances that can happen, but I guess if it worked like that 
before, it's fine. If not, that's the topic of a different patch (series).

>
>> +	    !xenvif_schedulable(vif))
>>   		goto drop;
>>
>>   	/* At best we'll need one slot for the header and one for each
>> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>>   	vif->pending_prod = MAX_PENDING_REQS;
>>   	for (i = 0; i < MAX_PENDING_REQS; i++)
>>   		vif->pending_ring[i] = i;
>> -	for (i = 0; i < MAX_PENDING_REQS; i++)
>> -		vif->mmap_pages[i] = NULL;
>> +	spin_lock_init(&vif->dealloc_lock);
>> +	spin_lock_init(&vif->response_lock);
>> +	/* If ballooning is disabled, this will consume real memory, so you
>> +	 * better enable it.
>
> Almost no one who would be affected by this is going to read this
> comment. And it doesn't just require enabling ballooning, but actually
> booting with some maxmem "slack" to leave space.
Where should we document this? I mean, in case David doesn't fix this 
before acceptance of this patch series :)


>> @@ -432,6 +454,18 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
>>
>>   	vif->task = task;
>>
>> +	task = kthread_create(xenvif_dealloc_kthread,
>> +					   (void *)vif,
>> +					   "%s-dealloc",
>> +					   vif->dev->name);
>
> This is separate to the existing kthread that handles rx stuff. If they
> cannot or should not be combined then I think the existing one needs
> renaming, both the function and the thread itself in a precursor patch.
I've explained the reasons why they are separate threads in another 
email. I'll rename the existing thread and its functions.

>
>> @@ -494,6 +534,23 @@ void xenvif_disconnect(struct xenvif *vif)
>>
>>   void xenvif_free(struct xenvif *vif)
>>   {
>> +	int i, unmap_timeout = 0;
>> +
>> +	for (i = 0; i < MAX_PENDING_REQS; ++i) {
>> +		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
>> +			unmap_timeout++;
>> +			schedule_timeout(msecs_to_jiffies(1000));
>
> What are we waiting for here? Have we taken any action to ensure that it
> is going to happen, like kicking something?
We are waiting for the skbs to be freed so we can return the slots. They 
are not owned by us after we send them, and we don't know who owns them. 
As discussed months ago, it is safe to assume that other devices won't 
sit on them indefinitely. If the skb goes to userspace or further up the 
stack to the IP layer, we swap the pages out with local ones. The only 
place where things can go wrong is another netback thread; that's 
handled in patch #8.

>
>> +			if (unmap_timeout > 9 &&
>
> Why 9? Why not rely on net_ratelimit to DTRT? Or is it normal for this
> to fail at least once?
As mentioned earlier, this is quite temporary; it is improved in 
patch #8.

>
>> +			    net_ratelimit())
>> +				netdev_err(vif->dev,
>
> I thought there was a ratelimited netdev printk which combined the
> limiting and the printing in one function call. Maybe I am mistaken.
There is indeed: net_err_ratelimited and friends. But they call pr_err 
instead of netdev_err, so we would lose the vif name from the log entry, 
which could be quite important. If someone introduces a 
netdev_err_ratelimited which calls netdev_err, we can change this, but I 
would defer that to a later patch.
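
For illustration, a minimal sketch of such a wrapper, assuming nothing
like it exists in this tree yet (the name is a placeholder):

	/* Hypothetical ratelimited netdev_err that keeps the device name
	 * in the log entry; sketch only, not an existing API here.
	 */
	#define netdev_err_ratelimited(dev, fmt, ...)			\
	do {								\
		if (net_ratelimit())					\
			netdev_err(dev, fmt, ##__VA_ARGS__);		\
	} while (0)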


>> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
>> index 195602f..747b428 100644
>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -646,9 +646,12 @@ static void xenvif_tx_err(struct xenvif *vif,
>>   			  struct xen_netif_tx_request *txp, RING_IDX end)
>>   {
>>   	RING_IDX cons = vif->tx.req_cons;
>> +	unsigned long flags;
>>
>>   	do {
>> +		spin_lock_irqsave(&vif->response_lock, flags);
>
> Looking at the callers you have added it would seem more natural to
> handle the locking within make_tx_response itself.
>
> What are you locking against here? Is this different to the dealloc
> lock? If the concern is the rx action stuff and the dealloc stuff
> conflicting perhaps a single vif lock would make sense?
I've improved the comment, as mentioned in another email; here it is:

	/* This prevents zerocopy callbacks from racing over dealloc_ring */
	spinlock_t callback_lock;
	/* This prevents the dealloc thread and NAPI instance from racing over
	 * response creation and pending_ring in xenvif_idx_release. In
	 * xenvif_tx_err it only protects response creation.
	 */
	spinlock_t response_lock;
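
As for handling the locking within make_tx_response itself, it could
look roughly like this (a sketch; whether it would also cover the
pending_ring manipulation in xenvif_idx_release is an open question):

	static void make_tx_response(struct xenvif *vif,
				     struct xen_netif_tx_request *txp,
				     s8 st)
	{
		unsigned long flags;

		spin_lock_irqsave(&vif->response_lock, flags);
		/* ... existing response creation body ... */
		spin_unlock_irqrestore(&vif->response_lock, flags);
	}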

>> @@ -936,18 +879,24 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>>   		head = tx_info->head;
>>
>>   		/* Check error status: if okay then remember grant handle. */
>> -		do {
>>   			newerr = (++gop)->status;
>> -			if (newerr)
>> -				break;
>> -			peek = vif->pending_ring[pending_index(++head)];
>> -		} while (!pending_tx_is_head(vif, peek));
>>
>>   		if (likely(!newerr)) {
>> +			if (vif->grant_tx_handle[pending_idx] !=
>> +			    NETBACK_INVALID_HANDLE) {
>> +				netdev_err(vif->dev,
>> +					   "Stale mapped handle! pending_idx %x handle %x\n",
>> +					   pending_idx,
>> +					   vif->grant_tx_handle[pending_idx]);
>> +				BUG();
>> +			}
>
> You had the same thing earlier. Perhaps a helper function would be
> useful?
Makes sense, I'll do that.
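
Something along these lines, perhaps (a sketch; the helper name is a
placeholder):

	/* Sketch of a helper for both call sites: record a grant handle,
	 * BUGging out if the slot already holds a valid one.
	 */
	static inline void xenvif_grant_handle_set(struct xenvif *vif,
						   u16 pending_idx,
						   grant_handle_t handle)
	{
		if (unlikely(vif->grant_tx_handle[pending_idx] !=
			     NETBACK_INVALID_HANDLE)) {
			netdev_err(vif->dev,
				   "Stale mapped handle! pending_idx %x handle %x\n",
				   pending_idx,
				   vif->grant_tx_handle[pending_idx]);
			BUG();
		}
		vif->grant_tx_handle[pending_idx] = handle;
	}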

>
>> +			vif->grant_tx_handle[pending_idx] = gop->handle;
>>   			/* Had a previous error? Invalidate this fragment. */
>> -			if (unlikely(err))
>> +			if (unlikely(err)) {
>> +				xenvif_idx_unmap(vif, pending_idx);
>>   				xenvif_idx_release(vif, pending_idx,
>>   						   XEN_NETIF_RSP_OKAY);
>
> Would it make sense to unmap and release in a single function? (I
> haven't looked to see if you ever do one without the other, but the next
> page of diff had two more occurrences of them together.)
Yep, it's better to call idx_release from unmap instead of doing it 
separately all the time.
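
Roughly like this, assuming the unmap internals stay as in this patch
(sketch only):

	/* Sketch: finish xenvif_idx_unmap by releasing the slot, so callers
	 * no longer need a separate xenvif_idx_release call.
	 */
	void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
	{
		int ret;
		struct gnttab_unmap_grant_ref tx_unmap_op;

		gnttab_set_unmap_op(&tx_unmap_op,
				    idx_to_kaddr(vif, pending_idx),
				    GNTMAP_host_map,
				    vif->grant_tx_handle[pending_idx]);
		vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;

		ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
					&vif->mmap_pages[pending_idx], 1);
		BUG_ON(ret);

		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
	}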


>> @@ -960,9 +909,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>>
>>   		/* First error: invalidate header and preceding fragments. */
>>   		pending_idx = *((u16 *)skb->data);
>> +		xenvif_idx_unmap(vif, pending_idx);
>>   		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
>>   		for (j = start; j < i; j++) {
>>   			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
>> +			xenvif_idx_unmap(vif, pending_idx);
>>   			xenvif_idx_release(vif, pending_idx,
>>   					   XEN_NETIF_RSP_OKAY);
>>   		}
>
>>   	}
>> +	/* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
>> +	 * overlaps with "index", and "mapping" is not set. I think mapping
>> +	 * should be set. If delivered to local stack, it would drop this
>> +	 * skb in sk_filter unless the socket has the right to use it.
>
> What is the plan to fix this?
Probably by not using "index" during grant mapping. When that is solved 
somehow, we can clean this up.

>
> Is this dropping not a significant issue? (TBH I'm not sure what "has the
> right to use it" would entail.)
It doesn't happen as we fix it up with this workaround.

>
>> +	 */
>> +	skb->pfmemalloc	= false;
>>   }
>>
>>   static int xenvif_get_extras(struct xenvif *vif,
>> @@ -1372,7 +1341,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
>
>> @@ -1581,7 +1535,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
>>   		else if (txp->flags & XEN_NETTXF_data_validated)
>>   			skb->ip_summed = CHECKSUM_UNNECESSARY;
>>
>> -		xenvif_fill_frags(vif, skb);
>> +		xenvif_fill_frags(vif,
>> +				  skb,
>> +				  skb_shinfo(skb)->destructor_arg ?
>> +				  pending_idx :
>> +				  INVALID_PENDING_IDX
>
> Couldn't xenvif_fill_frags calculate the 3rd argument itself, given that
> it has skb in hand?
We still have to pass pending_idx, as it is no longer in skb->data. I 
have plans (I've already prototyped it, actually) to move that 
pending_idx from skb->data to skb->cb; if that happens, this won't be 
necessary.
On the other hand, it makes more sense to just pass pending_idx, and in 
fill_frags we can decide based on destructor_arg whether we need it or 
not.
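
E.g., the decision could live inside fill_frags itself, keeping only
pending_idx as the extra parameter (sketch):

	static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb,
				      u16 pending_idx)
	{
		/* Only zerocopy skbs (destructor_arg set) have a mapped
		 * header slot to chain the first frag to.
		 */
		u16 prev_pending_idx = skb_shinfo(skb)->destructor_arg ?
				       pending_idx : INVALID_PENDING_IDX;
		/* ... rest of the function as in this patch ... */
	}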

>> @@ -1595,6 +1553,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
>>   		if (checksum_setup(vif, skb)) {
>>   			netdev_dbg(vif->dev,
>>   				   "Can't setup checksum in net_tx_action\n");
>> +			/* We have to set this flag so the dealloc thread can
>> +			 * send the slots back
>
> Wouldn't it be more accurate to say that we need it so that the callback
> happens (which we then use to trigger the dealloc thread)?
Yep, I'll change that.

Zoli
Zoltan Kiss Feb. 24, 2014, 4:56 p.m. UTC | #6
On 22/02/14 22:33, Zoltan Kiss wrote:
> On 18/02/14 17:40, Ian Campbell wrote:
>> +     */
>>> +    skb->pfmemalloc    = false;
>>>   }
>>>
>>>   static int xenvif_get_extras(struct xenvif *vif,
>>> @@ -1372,7 +1341,7 @@ static bool tx_credit_exceeded(struct xenvif 
>>> *vif, unsigned size)
>>
>>> @@ -1581,7 +1535,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
>>>           else if (txp->flags & XEN_NETTXF_data_validated)
>>>               skb->ip_summed = CHECKSUM_UNNECESSARY;
>>>
>>> -        xenvif_fill_frags(vif, skb);
>>> +        xenvif_fill_frags(vif,
>>> +                  skb,
>>> +                  skb_shinfo(skb)->destructor_arg ?
>>> +                  pending_idx :
>>> +                  INVALID_PENDING_IDX
>>
>> Couldn't xenvif_fill_frags calculate the 3rd argument itself, given that
>> it has skb in hand?
> We still have to pass pending_idx, as it is no longer in skb->data. I 
> have plans (I've already prototyped it, actually) to move that 
> pending_idx from skb->data to skb->cb; if that happens, this won't be 
> necessary.
> On the other hand, it makes more sense to just pass pending_idx, and in 
> fill_frags we can decide based on destructor_arg whether we need it or 
> not.
Actually, I've just moved the skb->cb patch to the beginning of this 
series, so we can completely omit that new parameter from fill_frags.
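
The idea, roughly (struct and macro names are placeholders):

	/* Sketch: keep pending_idx in the skb control buffer instead of
	 * skb->data, so xenvif_fill_frags can look it up on its own.
	 */
	struct xenvif_tx_cb {
		u16 pending_idx;
	};

	#define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)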

Zoli
Patch

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index f0f0c3d..b3daae2 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -122,7 +122,9 @@  static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	BUG_ON(skb->dev != dev);
 
 	/* Drop the packet if vif is not ready */
-	if (vif->task == NULL || !xenvif_schedulable(vif))
+	if (vif->task == NULL ||
+	    vif->dealloc_task == NULL ||
+	    !xenvif_schedulable(vif))
 		goto drop;
 
 	/* At best we'll need one slot for the header and one for each
@@ -344,8 +346,26 @@  struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	vif->pending_prod = MAX_PENDING_REQS;
 	for (i = 0; i < MAX_PENDING_REQS; i++)
 		vif->pending_ring[i] = i;
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		vif->mmap_pages[i] = NULL;
+	spin_lock_init(&vif->dealloc_lock);
+	spin_lock_init(&vif->response_lock);
+	/* If ballooning is disabled, this will consume real memory, so you
+	 * better enable it. The long term solution would be to use just a
+	 * bunch of valid page descriptors, without dependency on ballooning
+	 */
+	err = alloc_xenballooned_pages(MAX_PENDING_REQS,
+				       vif->mmap_pages,
+				       false);
+	if (err) {
+		netdev_err(dev, "Could not reserve mmap_pages\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	for (i = 0; i < MAX_PENDING_REQS; i++) {
+		vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
+			{ .callback = xenvif_zerocopy_callback,
+			  .ctx = NULL,
+			  .desc = i };
+		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
+	}
 
 	/*
 	 * Initialise a dummy MAC address. We choose the numerically
@@ -383,12 +403,14 @@  int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 
 	BUG_ON(vif->tx_irq);
 	BUG_ON(vif->task);
+	BUG_ON(vif->dealloc_task);
 
 	err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref);
 	if (err < 0)
 		goto err;
 
 	init_waitqueue_head(&vif->wq);
+	init_waitqueue_head(&vif->dealloc_wq);
 
 	if (tx_evtchn == rx_evtchn) {
 		/* feature-split-event-channels == 0 */
@@ -432,6 +454,18 @@  int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 
 	vif->task = task;
 
+	task = kthread_create(xenvif_dealloc_kthread,
+					   (void *)vif,
+					   "%s-dealloc",
+					   vif->dev->name);
+	if (IS_ERR(task)) {
+		pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
+		err = PTR_ERR(task);
+		goto err_rx_unbind;
+	}
+
+	vif->dealloc_task = task;
+
 	rtnl_lock();
 	if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN)
 		dev_set_mtu(vif->dev, ETH_DATA_LEN);
@@ -442,6 +476,7 @@  int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 	rtnl_unlock();
 
 	wake_up_process(vif->task);
+	wake_up_process(vif->dealloc_task);
 
 	return 0;
 
@@ -479,6 +514,11 @@  void xenvif_disconnect(struct xenvif *vif)
 		vif->task = NULL;
 	}
 
+	if (vif->dealloc_task) {
+		kthread_stop(vif->dealloc_task);
+		vif->dealloc_task = NULL;
+	}
+
 	if (vif->tx_irq) {
 		if (vif->tx_irq == vif->rx_irq)
 			unbind_from_irqhandler(vif->tx_irq, vif);
@@ -494,6 +534,23 @@  void xenvif_disconnect(struct xenvif *vif)
 
 void xenvif_free(struct xenvif *vif)
 {
+	int i, unmap_timeout = 0;
+
+	for (i = 0; i < MAX_PENDING_REQS; ++i) {
+		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
+			unmap_timeout++;
+			schedule_timeout(msecs_to_jiffies(1000));
+			if (unmap_timeout > 9 &&
+			    net_ratelimit())
+				netdev_err(vif->dev,
+					   "Page still granted! Index: %x\n",
+					   i);
+			i = -1;
+		}
+	}
+
+	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
+
 	netif_napi_del(&vif->napi);
 
 	unregister_netdev(vif->dev);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 195602f..747b428 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -646,9 +646,12 @@  static void xenvif_tx_err(struct xenvif *vif,
 			  struct xen_netif_tx_request *txp, RING_IDX end)
 {
 	RING_IDX cons = vif->tx.req_cons;
+	unsigned long flags;
 
 	do {
+		spin_lock_irqsave(&vif->response_lock, flags);
 		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+		spin_unlock_irqrestore(&vif->response_lock, flags);
 		if (cons == end)
 			break;
 		txp = RING_GET_REQUEST(&vif->tx, cons++);
@@ -787,10 +790,10 @@  static inline void xenvif_tx_create_gop(struct xenvif *vif,
 	       sizeof(*txp));
 }
 
-static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
-					       struct sk_buff *skb,
-					       struct xen_netif_tx_request *txp,
-					       struct gnttab_copy *gop)
+static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
+							struct sk_buff *skb,
+							struct xen_netif_tx_request *txp,
+							struct gnttab_map_grant_ref *gop)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
@@ -811,83 +814,12 @@  static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
-	/* Coalesce tx requests, at this point the packet passed in
-	 * should be <= 64K. Any packets larger than 64K have been
-	 * handled in xenvif_count_requests().
-	 */
-	for (shinfo->nr_frags = slot = start; slot < nr_slots;
-	     shinfo->nr_frags++) {
-		struct pending_tx_info *pending_tx_info =
-			vif->pending_tx_info;
-
-		page = alloc_page(GFP_ATOMIC|__GFP_COLD);
-		if (!page)
-			goto err;
-
-		dst_offset = 0;
-		first = NULL;
-		while (dst_offset < PAGE_SIZE && slot < nr_slots) {
-			gop->flags = GNTCOPY_source_gref;
-
-			gop->source.u.ref = txp->gref;
-			gop->source.domid = vif->domid;
-			gop->source.offset = txp->offset;
-
-			gop->dest.domid = DOMID_SELF;
-
-			gop->dest.offset = dst_offset;
-			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-
-			if (dst_offset + txp->size > PAGE_SIZE) {
-				/* This page can only merge a portion
-				 * of tx request. Do not increment any
-				 * pointer / counter here. The txp
-				 * will be dealt with in future
-				 * rounds, eventually hitting the
-				 * `else` branch.
-				 */
-				gop->len = PAGE_SIZE - dst_offset;
-				txp->offset += gop->len;
-				txp->size -= gop->len;
-				dst_offset += gop->len; /* quit loop */
-			} else {
-				/* This tx request can be merged in the page */
-				gop->len = txp->size;
-				dst_offset += gop->len;
-
+	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
+	     shinfo->nr_frags++, txp++, gop++) {
 				index = pending_index(vif->pending_cons++);
-
 				pending_idx = vif->pending_ring[index];
-
-				memcpy(&pending_tx_info[pending_idx].req, txp,
-				       sizeof(*txp));
-
-				/* Poison these fields, corresponding
-				 * fields for head tx req will be set
-				 * to correct values after the loop.
-				 */
-				vif->mmap_pages[pending_idx] = (void *)(~0UL);
-				pending_tx_info[pending_idx].head =
-					INVALID_PENDING_RING_IDX;
-
-				if (!first) {
-					first = &pending_tx_info[pending_idx];
-					start_idx = index;
-					head_idx = pending_idx;
-				}
-
-				txp++;
-				slot++;
-			}
-
-			gop++;
-		}
-
-		first->req.offset = 0;
-		first->req.size = dst_offset;
-		first->head = start_idx;
-		vif->mmap_pages[head_idx] = page;
-		frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
+		xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
 	}
 
 	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
@@ -909,9 +841,9 @@  err:
 
 static int xenvif_tx_check_gop(struct xenvif *vif,
 			       struct sk_buff *skb,
-			       struct gnttab_copy **gopp)
+			       struct gnttab_map_grant_ref **gopp)
 {
-	struct gnttab_copy *gop = *gopp;
+	struct gnttab_map_grant_ref *gop = *gopp;
 	u16 pending_idx = *((u16 *)skb->data);
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	struct pending_tx_info *tx_info;
@@ -923,6 +855,17 @@  static int xenvif_tx_check_gop(struct xenvif *vif,
 	err = gop->status;
 	if (unlikely(err))
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
+	else {
+		if (vif->grant_tx_handle[pending_idx] !=
+		    NETBACK_INVALID_HANDLE) {
+			netdev_err(vif->dev,
+				   "Stale mapped handle! pending_idx %x handle %x\n",
+				   pending_idx,
+				   vif->grant_tx_handle[pending_idx]);
+			BUG();
+		}
+		vif->grant_tx_handle[pending_idx] = gop->handle;
+	}
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
@@ -936,18 +879,24 @@  static int xenvif_tx_check_gop(struct xenvif *vif,
 		head = tx_info->head;
 
 		/* Check error status: if okay then remember grant handle. */
-		do {
 			newerr = (++gop)->status;
-			if (newerr)
-				break;
-			peek = vif->pending_ring[pending_index(++head)];
-		} while (!pending_tx_is_head(vif, peek));
 
 		if (likely(!newerr)) {
+			if (vif->grant_tx_handle[pending_idx] !=
+			    NETBACK_INVALID_HANDLE) {
+				netdev_err(vif->dev,
+					   "Stale mapped handle! pending_idx %x handle %x\n",
+					   pending_idx,
+					   vif->grant_tx_handle[pending_idx]);
+				BUG();
+			}
+			vif->grant_tx_handle[pending_idx] = gop->handle;
 			/* Had a previous error? Invalidate this fragment. */
-			if (unlikely(err))
+			if (unlikely(err)) {
+				xenvif_idx_unmap(vif, pending_idx);
 				xenvif_idx_release(vif, pending_idx,
 						   XEN_NETIF_RSP_OKAY);
+			}
 			continue;
 		}
 
@@ -960,9 +909,11 @@  static int xenvif_tx_check_gop(struct xenvif *vif,
 
 		/* First error: invalidate header and preceding fragments. */
 		pending_idx = *((u16 *)skb->data);
+		xenvif_idx_unmap(vif, pending_idx);
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
 		for (j = start; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+			xenvif_idx_unmap(vif, pending_idx);
 			xenvif_idx_release(vif, pending_idx,
 					   XEN_NETIF_RSP_OKAY);
 		}
@@ -975,7 +926,9 @@  static int xenvif_tx_check_gop(struct xenvif *vif,
 	return err;
 }
 
-static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
+static void xenvif_fill_frags(struct xenvif *vif,
+			      struct sk_buff *skb,
+			      u16 prev_pending_idx)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int nr_frags = shinfo->nr_frags;
@@ -989,6 +942,17 @@  static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 
 		pending_idx = frag_get_pending_idx(frag);
 
+		/* If this is not the first frag, chain it to the previous*/
+		if (unlikely(prev_pending_idx == INVALID_PENDING_IDX))
+			skb_shinfo(skb)->destructor_arg =
+				&vif->pending_tx_info[pending_idx].callback_struct;
+		else if (likely(pending_idx != prev_pending_idx))
+			vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
+				&(vif->pending_tx_info[pending_idx].callback_struct);
+
+		vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
+		prev_pending_idx = pending_idx;
+
 		txp = &vif->pending_tx_info[pending_idx].req;
 		page = virt_to_page(idx_to_kaddr(vif, pending_idx));
 		__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
@@ -996,10 +960,15 @@  static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 		skb->data_len += txp->size;
 		skb->truesize += txp->size;
 
-		/* Take an extra reference to offset xenvif_idx_release */
+		/* Take an extra reference to offset network stack's put_page */
 		get_page(vif->mmap_pages[pending_idx]);
-		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
 	}
+	/* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
+	 * overlaps with "index", and "mapping" is not set. I think mapping
+	 * should be set. If delivered to local stack, it would drop this
+	 * skb in sk_filter unless the socket has the right to use it.
+	 */
+	skb->pfmemalloc	= false;
 }
 
 static int xenvif_get_extras(struct xenvif *vif,
@@ -1372,7 +1341,7 @@  static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
 
 static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 {
-	struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop;
+	struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
 	struct sk_buff *skb;
 	int ret;
 
@@ -1480,30 +1449,10 @@  static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			}
 		}
 
-		/* XXX could copy straight to head */
-		page = xenvif_alloc_page(vif, pending_idx);
-		if (!page) {
-			kfree_skb(skb);
-			xenvif_tx_err(vif, &txreq, idx);
-			break;
-		}
-
-		gop->source.u.ref = txreq.gref;
-		gop->source.domid = vif->domid;
-		gop->source.offset = txreq.offset;
-
-		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-		gop->dest.domid = DOMID_SELF;
-		gop->dest.offset = txreq.offset;
-
-		gop->len = txreq.size;
-		gop->flags = GNTCOPY_source_gref;
+		xenvif_tx_create_gop(vif, pending_idx, &txreq, gop);
 
 		gop++;
 
-		memcpy(&vif->pending_tx_info[pending_idx].req,
-		       &txreq, sizeof(txreq));
-		vif->pending_tx_info[pending_idx].head = index;
 		*((u16 *)skb->data) = pending_idx;
 
 		__skb_put(skb, data_len);
@@ -1532,17 +1481,17 @@  static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 
 		vif->tx.req_cons = idx;
 
-		if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops))
+		if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops))
 			break;
 	}
 
-	return gop - vif->tx_copy_ops;
+	return gop - vif->tx_map_ops;
 }
 
 
 static int xenvif_tx_submit(struct xenvif *vif)
 {
-	struct gnttab_copy *gop = vif->tx_copy_ops;
+	struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
 	struct sk_buff *skb;
 	int work_done = 0;
 
@@ -1566,12 +1515,17 @@  static int xenvif_tx_submit(struct xenvif *vif)
 		memcpy(skb->data,
 		       (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
 		       data_len);
+		vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
 		if (data_len < txp->size) {
 			/* Append the packet payload as a fragment. */
 			txp->offset += data_len;
 			txp->size -= data_len;
+			skb_shinfo(skb)->destructor_arg =
+				&vif->pending_tx_info[pending_idx].callback_struct;
 		} else {
 			/* Schedule a response immediately. */
+			skb_shinfo(skb)->destructor_arg = NULL;
+			xenvif_idx_unmap(vif, pending_idx);
 			xenvif_idx_release(vif, pending_idx,
 					   XEN_NETIF_RSP_OKAY);
 		}
@@ -1581,7 +1535,11 @@  static int xenvif_tx_submit(struct xenvif *vif)
 		else if (txp->flags & XEN_NETTXF_data_validated)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-		xenvif_fill_frags(vif, skb);
+		xenvif_fill_frags(vif,
+				  skb,
+				  skb_shinfo(skb)->destructor_arg ?
+				  pending_idx :
+				  INVALID_PENDING_IDX);
 
 		if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
 			int target = min_t(int, skb->len, PKT_PROT_LEN);
@@ -1595,6 +1553,11 @@  static int xenvif_tx_submit(struct xenvif *vif)
 		if (checksum_setup(vif, skb)) {
 			netdev_dbg(vif->dev,
 				   "Can't setup checksum in net_tx_action\n");
+			/* We have to set this flag so the dealloc thread can
+			 * send the slots back
+			 */
+			if (skb_shinfo(skb)->destructor_arg)
+				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 			kfree_skb(skb);
 			continue;
 		}
@@ -1620,6 +1583,14 @@  static int xenvif_tx_submit(struct xenvif *vif)
 
 		work_done++;
 
+		/* Set this flag right before netif_receive_skb, otherwise
+		 * someone might think this packet already left netback, and
+		 * do a skb_copy_ubufs while we are still in control of the
+		 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
+		 */
+		if (skb_shinfo(skb)->destructor_arg)
+			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+
 		netif_receive_skb(skb);
 	}
 
@@ -1731,7 +1702,7 @@  static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
 int xenvif_tx_action(struct xenvif *vif, int budget)
 {
 	unsigned nr_gops;
-	int work_done;
+	int work_done, ret;
 
 	if (unlikely(!tx_work_todo(vif)))
 		return 0;
@@ -1741,7 +1712,8 @@  int xenvif_tx_action(struct xenvif *vif, int budget)
 	if (nr_gops == 0)
 		return 0;
 
-	gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
+	ret = gnttab_map_refs(vif->tx_map_ops, vif->pages_to_map, nr_gops);
+	BUG_ON(ret);
 
 	work_done = xenvif_tx_submit(vif);
 
@@ -1752,45 +1724,19 @@  static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 			       u8 status)
 {
 	struct pending_tx_info *pending_tx_info;
-	pending_ring_idx_t head;
+	pending_ring_idx_t index;
 	u16 peek; /* peek into next tx request */
+	unsigned long flags;
 
-	BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL));
-
-	/* Already complete? */
-	if (vif->mmap_pages[pending_idx] == NULL)
-		return;
-
-	pending_tx_info = &vif->pending_tx_info[pending_idx];
-
-	head = pending_tx_info->head;
-
-	BUG_ON(!pending_tx_is_head(vif, head));
-	BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx);
-
-	do {
-		pending_ring_idx_t index;
-		pending_ring_idx_t idx = pending_index(head);
-		u16 info_idx = vif->pending_ring[idx];
-
-		pending_tx_info = &vif->pending_tx_info[info_idx];
+		pending_tx_info = &vif->pending_tx_info[pending_idx];
+		spin_lock_irqsave(&vif->response_lock, flags);
 		make_tx_response(vif, &pending_tx_info->req, status);
-
-		/* Setting any number other than
-		 * INVALID_PENDING_RING_IDX indicates this slot is
-		 * starting a new packet / ending a previous packet.
-		 */
-		pending_tx_info->head = 0;
-
-		index = pending_index(vif->pending_prod++);
-		vif->pending_ring[index] = vif->pending_ring[info_idx];
-
-		peek = vif->pending_ring[pending_index(++head)];
-
-	} while (!pending_tx_is_head(vif, peek));
-
-	put_page(vif->mmap_pages[pending_idx]);
-	vif->mmap_pages[pending_idx] = NULL;
+		index = pending_index(vif->pending_prod);
+		vif->pending_ring[index] = pending_idx;
+		/* TX shouldn't use the index before we give it back here */
+		mb();
+		vif->pending_prod++;
+		spin_unlock_irqrestore(&vif->response_lock, flags);
 }
 
 void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)