
[net-next,v2,2/9] xen-netback: Change TX path from grant copy to mapping

Message ID 1386892097-15502-3-git-send-email-zoltan.kiss@citrix.com
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Zoltan Kiss Dec. 12, 2013, 11:48 p.m. UTC
This patch changes the grant copy on the TX path to grant mapping

v2:
- delete branch for handling fragmented packets fit PKT_PROT_LINE sized first
  request
- mark the effect of using ballooned pages in a comment
- place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
  before netif_receive_skb, and mark the importance of it
- grab dealloc_lock before __napi_complete to avoid contention with the
  callback's napi_schedule
- handle fragmented packets where first request < PKT_PROT_LINE
- fix up error path when checksum_setup failed
- check before teardown for pending grants, and start complaining if they are
  still there after 10 seconds

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/interface.c |   57 +++++++-
 drivers/net/xen-netback/netback.c   |  257 ++++++++++++++---------------------
 2 files changed, 156 insertions(+), 158 deletions(-)


Comments

Wei Liu Dec. 13, 2013, 3:36 p.m. UTC | #1
On Thu, Dec 12, 2013 at 11:48:10PM +0000, Zoltan Kiss wrote:
> This patch changes the grant copy on the TX patch to grant mapping
> 
> v2:
> - delete branch for handling fragmented packets fit PKT_PROT_LINE sized first
                                                      ^ PKT_PROT_LEN
>   request
> - mark the effect of using ballooned pages in a comment
> - place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
>   before netif_receive_skb, and mark the importance of it
> - grab dealloc_lock before __napi_complete to avoid contention with the
>   callback's napi_schedule
> - handle fragmented packets where first request < PKT_PROT_LINE
                                                    ^ PKT_PROT_LEN
> - fix up error path when checksum_setup failed
> - check before teardown for pending grants, and start complain if they are
>   there after 10 second
> 
> Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
> ---
[...]
>  void xenvif_free(struct xenvif *vif)
>  {
> +	int i, unmap_timeout = 0;
> +
> +	for (i = 0; i < MAX_PENDING_REQS; ++i) {
> +		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> +			i = 0;
> +			unmap_timeout++;
> +			msleep(1000);
> +			if (unmap_timeout > 9 &&
> +				net_ratelimit())
> +				netdev_err(vif->dev,
> +					"Page still granted! Index: %x\n", i);
> +		}
> +	}
> +
> +	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
> +

If some pages are stuck and you just free them, will it cause Dom0 to
crash? I mean, if those pages are recycled by other balloon page users.

Even if it will not cause Dom0 to crash, will it leak any resource in
Dom0? At first sight it looks like at least a grant table entry is leaked,
isn't it? We need to be careful about this because a malicious guest might
be able to DoS Dom0 with resource leakage.

>  	netif_napi_del(&vif->napi);
>  
>  	unregister_netdev(vif->dev);
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 3ddc474..20352be 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -645,9 +645,12 @@ static void xenvif_tx_err(struct xenvif *vif,
>  			  struct xen_netif_tx_request *txp, RING_IDX end)
>  {
>  	RING_IDX cons = vif->tx.req_cons;
> +	unsigned long flags;
>  
>  	do {
> +		spin_lock_irqsave(&vif->response_lock, flags);
>  		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
> +		spin_unlock_irqrestore(&vif->response_lock, flags);

You only hold the lock for one function call, is this intentional?

>  		if (cons == end)
>  			break;
>  		txp = RING_GET_REQUEST(&vif->tx, cons++);
> @@ -786,10 +789,10 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,
>  
>  }
>  
> -static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
> +static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
>  					       struct sk_buff *skb,
>  					       struct xen_netif_tx_request *txp,
> -					       struct gnttab_copy *gop)
> +					       struct gnttab_map_grant_ref *gop)
>  {
>  	struct skb_shared_info *shinfo = skb_shinfo(skb);
>  	skb_frag_t *frags = shinfo->frags;
> @@ -810,83 +813,12 @@ static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
>  	/* Skip first skb fragment if it is on same page as header fragment. */
>  	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
>  
[...]
>  		if (checksum_setup(vif, skb)) {
>  			netdev_dbg(vif->dev,
>  				   "Can't setup checksum in net_tx_action\n");
> +			if (skb_shinfo(skb)->destructor_arg)
> +				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>  			kfree_skb(skb);
>  			continue;
>  		}
> @@ -1601,6 +1559,14 @@ static int xenvif_tx_submit(struct xenvif *vif)
>  
>  		work_done++;
>  
> +		/* Set this flag right before netif_receive_skb, otherwise
> +		 * someone might think this packet already left netback, and
> +		 * do a skb_copy_ubufs while we are still in control of the
> +		 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
> +		 */
> +		if (skb_shinfo(skb)->destructor_arg)
> +			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> +

This is really tricky. :-P

>  		netif_receive_skb(skb);
>  	}
>  
> @@ -1711,7 +1677,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
>  int xenvif_tx_action(struct xenvif *vif, int budget)
>  {
>  	unsigned nr_gops;
> -	int work_done;
> +	int work_done, ret;
>  
>  	if (unlikely(!tx_work_todo(vif)))
>  		return 0;
> @@ -1721,7 +1687,13 @@ int xenvif_tx_action(struct xenvif *vif, int budget)
>  	if (nr_gops == 0)
>  		return 0;
>  
> -	gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
> +	if (nr_gops) {

Surely you can remove this "if". At this point nr_gops cannot be zero --
see two lines above.

> +		ret = gnttab_map_refs(vif->tx_map_ops,
> +			NULL,
> +			vif->pages_to_map,
> +			nr_gops);
> +		BUG_ON(ret);
> +	}
>  
>  	work_done = xenvif_tx_submit(vif);
>  
> @@ -1732,61 +1704,37 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
>  			       u8 status)
>  {
[...]
>  
>  void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
>  {
>  	int ret;
> +	struct gnttab_unmap_grant_ref tx_unmap_op;
> +
>  	if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
>  		netdev_err(vif->dev,
>  				"Trying to unmap invalid handle! pending_idx: %x\n",
>  				pending_idx);
>  		return;
>  	}
> -	gnttab_set_unmap_op(&vif->tx_unmap_ops[0],
> +	gnttab_set_unmap_op(&tx_unmap_op,
>  			idx_to_kaddr(vif, pending_idx),
>  			GNTMAP_host_map,
>  			vif->grant_tx_handle[pending_idx]);
> -	ret = gnttab_unmap_refs(vif->tx_unmap_ops,
> +	ret = gnttab_unmap_refs(&tx_unmap_op,
>  			NULL,
>  			&vif->mmap_pages[pending_idx],
>  			1);

This change should be squashed to patch 1. Or as I suggested the changes
in patch 1 should be moved here.

> @@ -1845,7 +1793,6 @@ static inline int rx_work_todo(struct xenvif *vif)
>  
>  static inline int tx_work_todo(struct xenvif *vif)
>  {
> -

Stray blank line change.

Wei.

>  	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) &&
>  	    (nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
>  	     < MAX_PENDING_REQS))
Zoltan Kiss Dec. 16, 2013, 3:38 p.m. UTC | #2
On 13/12/13 15:36, Wei Liu wrote:
> On Thu, Dec 12, 2013 at 11:48:10PM +0000, Zoltan Kiss wrote:
>> This patch changes the grant copy on the TX patch to grant mapping
>>
>> v2:
>> - delete branch for handling fragmented packets fit PKT_PROT_LINE sized first
>                                                        ^ PKT_PROT_LEN
>>    request
>> - mark the effect of using ballooned pages in a comment
>> - place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
>>    before netif_receive_skb, and mark the importance of it
>> - grab dealloc_lock before __napi_complete to avoid contention with the
>>    callback's napi_schedule
>> - handle fragmented packets where first request < PKT_PROT_LINE
>                                                      ^ PKT_PROT_LEN
Oh, some dyslexia of mine, I will fix that :)

>> - fix up error path when checksum_setup failed
>> - check before teardown for pending grants, and start complain if they are
>>    there after 10 second
>>
>> Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
>> ---
> [...]
>>   void xenvif_free(struct xenvif *vif)
>>   {
>> +	int i, unmap_timeout = 0;
>> +
>> +	for (i = 0; i < MAX_PENDING_REQS; ++i) {
>> +		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
>> +			i = 0;
>> +			unmap_timeout++;
>> +			msleep(1000);
>> +			if (unmap_timeout > 9 &&
>> +				net_ratelimit())
>> +				netdev_err(vif->dev,
>> +					"Page still granted! Index: %x\n", i);
>> +		}
>> +	}
>> +
>> +	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
>> +
>
> If some pages are stuck and you just free them will it cause Dom0 to
> crash? I mean, if those pages are recycled by other balloon page users.
>
> Even if it will not cause Dom0 to crash, will it leak any resource in
> Dom0? At plain sight it looks like at least grant table entry is leaked,
> isn't it? We need to be careful about this because a malicious might be
> able to DoS Dom0 with resource leakage.
Yes, if we call free_xenballooned_pages while something is still mapped, 
Xen kills Dom0 because the balloon driver tries to touch the PTE of a 
grant-mapped page. That's why we make sure beforehand that everything is 
unmapped, and repeat an error message if it's not. I'm afraid we can't do 
anything better here; it would mean a serious netback bug.
But a malicious guest cannot take advantage of this unless it finds a way 
to screw up netback's internal bookkeeping. Then it can indefinitely block 
the teardown of the VIF and its associated resources.

>
>>   	netif_napi_del(&vif->napi);
>>
>>   	unregister_netdev(vif->dev);
>> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
>> index 3ddc474..20352be 100644
>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -645,9 +645,12 @@ static void xenvif_tx_err(struct xenvif *vif,
>>   			  struct xen_netif_tx_request *txp, RING_IDX end)
>>   {
>>   	RING_IDX cons = vif->tx.req_cons;
>> +	unsigned long flags;
>>
>>   	do {
>> +		spin_lock_irqsave(&vif->response_lock, flags);
>>   		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
>> +		spin_unlock_irqrestore(&vif->response_lock, flags);
>
> You only hold the lock for one function call is this intentional?
Yes, make_tx_response can be called from xenvif_tx_err or 
xenvif_idx_release, and they can be called from the NAPI instance and 
the dealloc thread. (xenvif_tx_err only from NAPI)
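Roughly, the two call paths look like this (a condensed sketch of what
this patch does, not the exact code):

	/* NAPI instance: xenvif_tx_err() */
	spin_lock_irqsave(&vif->response_lock, flags);
	make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
	spin_unlock_irqrestore(&vif->response_lock, flags);

	/* dealloc thread (or NAPI): xenvif_idx_release() */
	spin_lock_irqsave(&vif->response_lock, flags);
	make_tx_response(vif, &pending_tx_info->req, status);
	index = pending_index(vif->pending_prod);
	vif->pending_ring[index] = pending_idx;
	mb(); /* TX must not see the returned index before this point */
	vif->pending_prod++;
	spin_unlock_irqrestore(&vif->response_lock, flags);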

>
>>   		netif_receive_skb(skb);
>>   	}
>>
>> @@ -1711,7 +1677,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
>>   int xenvif_tx_action(struct xenvif *vif, int budget)
>>   {
>>   	unsigned nr_gops;
>> -	int work_done;
>> +	int work_done, ret;
>>
>>   	if (unlikely(!tx_work_todo(vif)))
>>   		return 0;
>> @@ -1721,7 +1687,13 @@ int xenvif_tx_action(struct xenvif *vif, int budget)
>>   	if (nr_gops == 0)
>>   		return 0;
>>
>> -	gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
>> +	if (nr_gops) {
>
> Surely you can remove this "if". At this point nr_gops cannot be zero --
> see two lines above.

>>   void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
>>   {
>>   	int ret;
>> +	struct gnttab_unmap_grant_ref tx_unmap_op;
>> +
>>   	if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
>>   		netdev_err(vif->dev,
>>   				"Trying to unmap invalid handle! pending_idx: %x\n",
>>   				pending_idx);
>>   		return;
>>   	}
>> -	gnttab_set_unmap_op(&vif->tx_unmap_ops[0],
>> +	gnttab_set_unmap_op(&tx_unmap_op,
>>   			idx_to_kaddr(vif, pending_idx),
>>   			GNTMAP_host_map,
>>   			vif->grant_tx_handle[pending_idx]);
>> -	ret = gnttab_unmap_refs(vif->tx_unmap_ops,
>> +	ret = gnttab_unmap_refs(&tx_unmap_op,
>>   			NULL,
>>   			&vif->mmap_pages[pending_idx],
>>   			1);
>
> This change should be squashed to patch 1. Or as I suggested the changes
> in patch 1 should be moved here.
>
>> @@ -1845,7 +1793,6 @@ static inline int rx_work_todo(struct xenvif *vif)
>>
>>   static inline int tx_work_todo(struct xenvif *vif)
>>   {
>> -
>
> Stray blank line change.
Agreed on previous 3 comments, I will apply them.

Zoli
Wei Liu Dec. 16, 2013, 6:21 p.m. UTC | #3
On Mon, Dec 16, 2013 at 03:38:05PM +0000, Zoltan Kiss wrote:
[...]
> >>+	for (i = 0; i < MAX_PENDING_REQS; ++i) {
> >>+		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> >>+			i = 0;
> >>+			unmap_timeout++;
> >>+			msleep(1000);
> >>+			if (unmap_timeout > 9 &&
> >>+				net_ratelimit())
> >>+				netdev_err(vif->dev,
> >>+					"Page still granted! Index: %x\n", i);
> >>+		}
> >>+	}
> >>+
> >>+	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
> >>+
> >
> >If some pages are stuck and you just free them will it cause Dom0 to
> >crash? I mean, if those pages are recycled by other balloon page users.
> >
> >Even if it will not cause Dom0 to crash, will it leak any resource in
> >Dom0? At plain sight it looks like at least grant table entry is leaked,
> >isn't it? We need to be careful about this because a malicious might be
> >able to DoS Dom0 with resource leakage.
> Yes, if we call free_xenballooned_pages while something is still
> mapped, Xen kills Dom0 because balloon driver tries to touch the PTE
> of a grant mapped page. That's why we make sure before that
> everything is unmapped, and repeat an error message if it's not. I'm

The code snippet above doesn't loop ten times over the whole array if
there are stale pages found, nor does it loop ten times on any stale
page.

So imagine that the very last page in the array is stale. This routine
sleeps for 1 second and then frees all ballooned pages. It's still not
guaranteed that, at the point we call free_xenballooned_pages, all pages
are unmapped, right?

> afraid we can't do anything better here, that means a serious
> netback bug.
> But a malicious guest cannot take advantage of this unless it's find
> a way to screw up netback's internal bookkeeping. Then it can block
> here indefinitely the teardown of the VIF, and it's associated
> resources.
> 

OK.

Wei.
Zoltan Kiss Dec. 16, 2013, 6:57 p.m. UTC | #4
On 16/12/13 18:21, Wei Liu wrote:
> On Mon, Dec 16, 2013 at 03:38:05PM +0000, Zoltan Kiss wrote:
> [...]
>>>> +	for (i = 0; i < MAX_PENDING_REQS; ++i) {
>>>> +		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
>>>> +			i = 0;
>>>> +			unmap_timeout++;
>>>> +			msleep(1000);
>>>> +			if (unmap_timeout > 9 &&
>>>> +				net_ratelimit())
>>>> +				netdev_err(vif->dev,
>>>> +					"Page still granted! Index: %x\n", i);
>>>> +		}
>>>> +	}
>>>> +
>>>> +	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
>>>> +
>>>
>>> If some pages are stuck and you just free them will it cause Dom0 to
>>> crash? I mean, if those pages are recycled by other balloon page users.
>>>
>>> Even if it will not cause Dom0 to crash, will it leak any resource in
>>> Dom0? At plain sight it looks like at least grant table entry is leaked,
>>> isn't it? We need to be careful about this because a malicious might be
>>> able to DoS Dom0 with resource leakage.
>> Yes, if we call free_xenballooned_pages while something is still
>> mapped, Xen kills Dom0 because balloon driver tries to touch the PTE
>> of a grant mapped page. That's why we make sure before that
>> everything is unmapped, and repeat an error message if it's not. I'm

There is an "i = 0" if we find a valid handle. So we start again 
checking the whole array from the second element (incorrectly, it should 
be "i = -1"!), and we print an incorrect error message, but essentially 
we are not leaving the loop, unless the first element was the 
problematic. We can modify that to "i--" or "i = -1" if we want to 
recheck the whole array. It shouldn't happen at this point that we 
transmit new packets, starting from the beginning is just an extra 
safety check.
Also, we should modify i after the printing of the error message.
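Something like this is what I mean (just a sketch, I'll fix it up
properly in the next version):

	for (i = 0; i < MAX_PENDING_REQS; ++i) {
		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
			unmap_timeout++;
			msleep(1000);
			if (unmap_timeout > 9 && net_ratelimit())
				netdev_err(vif->dev,
					   "Page still granted! Index: %x\n",
					   i);
			/* recheck the whole array from the start */
			i = -1;
		}
	}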

Zoli

Wei Liu Dec. 16, 2013, 7:06 p.m. UTC | #5
On Mon, Dec 16, 2013 at 06:57:44PM +0000, Zoltan Kiss wrote:
> On 16/12/13 18:21, Wei Liu wrote:
> >On Mon, Dec 16, 2013 at 03:38:05PM +0000, Zoltan Kiss wrote:
> >[...]
> >>>>+	for (i = 0; i < MAX_PENDING_REQS; ++i) {
> >>>>+		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> >>>>+			i = 0;
> >>>>+			unmap_timeout++;
> >>>>+			msleep(1000);
> >>>>+			if (unmap_timeout > 9 &&
> >>>>+				net_ratelimit())
> >>>>+				netdev_err(vif->dev,
> >>>>+					"Page still granted! Index: %x\n", i);
> >>>>+		}
> >>>>+	}
> >>>>+
> >>>>+	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
> >>>>+
> >>>
> >>>If some pages are stuck and you just free them will it cause Dom0 to
> >>>crash? I mean, if those pages are recycled by other balloon page users.
> >>>
> >>>Even if it will not cause Dom0 to crash, will it leak any resource in
> >>>Dom0? At plain sight it looks like at least grant table entry is leaked,
> >>>isn't it? We need to be careful about this because a malicious might be
> >>>able to DoS Dom0 with resource leakage.
> >>Yes, if we call free_xenballooned_pages while something is still
> >>mapped, Xen kills Dom0 because balloon driver tries to touch the PTE
> >>of a grant mapped page. That's why we make sure before that
> >>everything is unmapped, and repeat an error message if it's not. I'm
> 
> There is an "i = 0" if we find a valid handle. So we start again

Oops, missed that.

> checking the whole array from the second element (incorrectly, it
> should be "i = -1"!), and we print an incorrect error message, but
> essentially we are not leaving the loop, unless the first element
> was the problematic. We can modify that to "i--" or "i = -1" if we
> want to recheck the whole array. It shouldn't happen at this point
> that we transmit new packets, starting from the beginning is just an
> extra safety check.
> Also, we should modify i after the printing of the error message.
> 

So I did help find a bug though. :-)

Wei.

> Zoli
Konrad Rzeszutek Wilk Dec. 17, 2013, 9:49 p.m. UTC | #6
On Thu, Dec 12, 2013 at 11:48:10PM +0000, Zoltan Kiss wrote:
> This patch changes the grant copy on the TX patch to grant mapping
> 
> v2:
> - delete branch for handling fragmented packets fit PKT_PROT_LINE sized first
>   request
> - mark the effect of using ballooned pages in a comment
> - place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
>   before netif_receive_skb, and mark the importance of it
> - grab dealloc_lock before __napi_complete to avoid contention with the
>   callback's napi_schedule
> - handle fragmented packets where first request < PKT_PROT_LINE
> - fix up error path when checksum_setup failed
> - check before teardown for pending grants, and start complain if they are
>   there after 10 second
> 
> Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
> ---
>  drivers/net/xen-netback/interface.c |   57 +++++++-
>  drivers/net/xen-netback/netback.c   |  257 ++++++++++++++---------------------
>  2 files changed, 156 insertions(+), 158 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index 1c27e9e..42946de 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -122,7 +122,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	BUG_ON(skb->dev != dev);
>  
>  	/* Drop the packet if vif is not ready */
> -	if (vif->task == NULL || !xenvif_schedulable(vif))
> +	if (vif->task == NULL ||
> +		vif->dealloc_task == NULL ||
> +		!xenvif_schedulable(vif))
>  		goto drop;
>  
>  	/* At best we'll need one slot for the header and one for each
> @@ -335,8 +337,25 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>  	vif->pending_prod = MAX_PENDING_REQS;
>  	for (i = 0; i < MAX_PENDING_REQS; i++)
>  		vif->pending_ring[i] = i;
> -	for (i = 0; i < MAX_PENDING_REQS; i++)
> -		vif->mmap_pages[i] = NULL;
> +	/* If ballooning is disabled, this will consume real memory, so you
> +	 * better enable it. The long term solution would be to use just a
> +	 * bunch of valid page descriptors, without dependency on ballooning
> +	 */
> +	err = alloc_xenballooned_pages(MAX_PENDING_REQS,
> +		vif->mmap_pages,
> +		false);
> +	if (err) {
> +		netdev_err(dev, "Could not reserve mmap_pages\n");
> +		return NULL;
> +	}
> +	for (i = 0; i < MAX_PENDING_REQS; i++) {
> +		vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
> +			{ .callback = xenvif_zerocopy_callback,
> +			  .ctx = NULL,
> +			  .desc = i };
> +		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
> +	}
> +	init_timer(&vif->dealloc_delay);
>  
>  	/*
>  	 * Initialise a dummy MAC address. We choose the numerically
> @@ -380,6 +399,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
>  		goto err;
>  
>  	init_waitqueue_head(&vif->wq);
> +	init_waitqueue_head(&vif->dealloc_wq);
>  
>  	if (tx_evtchn == rx_evtchn) {
>  		/* feature-split-event-channels == 0 */
> @@ -421,6 +441,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
>  		goto err_rx_unbind;
>  	}
>  
> +	vif->dealloc_task = kthread_create(xenvif_dealloc_kthread,
> +				   (void *)vif, "%s-dealloc", vif->dev->name);
> +	if (IS_ERR(vif->dealloc_task)) {
> +		pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
> +		err = PTR_ERR(vif->dealloc_task);
> +		goto err_rx_unbind;
> +	}
> +
>  	vif->task = task;
>  
>  	rtnl_lock();
> @@ -433,6 +461,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
>  	rtnl_unlock();
>  
>  	wake_up_process(vif->task);
> +	wake_up_process(vif->dealloc_task);
>  
>  	return 0;
>  
> @@ -470,6 +499,12 @@ void xenvif_disconnect(struct xenvif *vif)
>  		vif->task = NULL;
>  	}
>  
> +	if (vif->dealloc_task) {
> +		del_timer_sync(&vif->dealloc_delay);
> +		kthread_stop(vif->dealloc_task);
> +		vif->dealloc_task = NULL;
> +	}
> +
>  	if (vif->tx_irq) {
>  		if (vif->tx_irq == vif->rx_irq)
>  			unbind_from_irqhandler(vif->tx_irq, vif);
> @@ -485,6 +520,22 @@ void xenvif_disconnect(struct xenvif *vif)
>  
>  void xenvif_free(struct xenvif *vif)
>  {
> +	int i, unmap_timeout = 0;
> +
> +	for (i = 0; i < MAX_PENDING_REQS; ++i) {
> +		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> +			i = 0;
> +			unmap_timeout++;
> +			msleep(1000);

You don't want to use schedule() and a wakeup here to allow other threads
to do their work?

> +			if (unmap_timeout > 9 &&
> +				net_ratelimit())
> +				netdev_err(vif->dev,
> +					"Page still granted! Index: %x\n", i);
> +		}
> +	}
> +
> +	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);

How about just stashing those pages on an 'I can't free them' list that will
keep them forever? And if that list gets truly large then switch back to
grant_copy?
Zoltan Kiss Dec. 30, 2013, 5:58 p.m. UTC | #7
On 17/12/13 21:49, Konrad Rzeszutek Wilk wrote:
> On Thu, Dec 12, 2013 at 11:48:10PM +0000, Zoltan Kiss wrote:
>> @@ -485,6 +520,22 @@ void xenvif_disconnect(struct xenvif *vif)
>>
>>   void xenvif_free(struct xenvif *vif)
>>   {
>> +	int i, unmap_timeout = 0;
>> +
>> +	for (i = 0; i < MAX_PENDING_REQS; ++i) {
>> +		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
>> +			i = 0;
>> +			unmap_timeout++;
>> +			msleep(1000);
>
> You don't want to use schedule() and a wakeup here to allow other threads
> to do their work?
Yep, schedule_timeout() would be nicer indeed
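I.e. something along these lines instead of the msleep() (sketch only):

	/* let other threads run while we wait for the grants to go away */
	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_timeout(msecs_to_jiffies(1000));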

>
>> +			if (unmap_timeout > 9 &&
>> +				net_ratelimit())
>> +				netdev_err(vif->dev,
>> +					"Page still granted! Index: %x\n", i);
>> +		}
>> +	}
>> +
>> +	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
>
> How about just stashing those pages on a 'I can't free them' list that will
> keep them forever. And if that list gets truly large then switch back to
> grant_copy?
But then what would you answer to the guest? You can't shoot down the 
shared ring while there is an outstanding slot.
On the other hand, doing a copy would just move the memory leak into the 
backend, which could be problematic if a guest figures out how to make a 
packet which can get stuck somewhere in the backend.

Zoli

Patch

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 1c27e9e..42946de 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -122,7 +122,9 @@  static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	BUG_ON(skb->dev != dev);
 
 	/* Drop the packet if vif is not ready */
-	if (vif->task == NULL || !xenvif_schedulable(vif))
+	if (vif->task == NULL ||
+		vif->dealloc_task == NULL ||
+		!xenvif_schedulable(vif))
 		goto drop;
 
 	/* At best we'll need one slot for the header and one for each
@@ -335,8 +337,25 @@  struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	vif->pending_prod = MAX_PENDING_REQS;
 	for (i = 0; i < MAX_PENDING_REQS; i++)
 		vif->pending_ring[i] = i;
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		vif->mmap_pages[i] = NULL;
+	/* If ballooning is disabled, this will consume real memory, so you
+	 * better enable it. The long term solution would be to use just a
+	 * bunch of valid page descriptors, without dependency on ballooning
+	 */
+	err = alloc_xenballooned_pages(MAX_PENDING_REQS,
+		vif->mmap_pages,
+		false);
+	if (err) {
+		netdev_err(dev, "Could not reserve mmap_pages\n");
+		return NULL;
+	}
+	for (i = 0; i < MAX_PENDING_REQS; i++) {
+		vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
+			{ .callback = xenvif_zerocopy_callback,
+			  .ctx = NULL,
+			  .desc = i };
+		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
+	}
+	init_timer(&vif->dealloc_delay);
 
 	/*
 	 * Initialise a dummy MAC address. We choose the numerically
@@ -380,6 +399,7 @@  int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 		goto err;
 
 	init_waitqueue_head(&vif->wq);
+	init_waitqueue_head(&vif->dealloc_wq);
 
 	if (tx_evtchn == rx_evtchn) {
 		/* feature-split-event-channels == 0 */
@@ -421,6 +441,14 @@  int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 		goto err_rx_unbind;
 	}
 
+	vif->dealloc_task = kthread_create(xenvif_dealloc_kthread,
+				   (void *)vif, "%s-dealloc", vif->dev->name);
+	if (IS_ERR(vif->dealloc_task)) {
+		pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
+		err = PTR_ERR(vif->dealloc_task);
+		goto err_rx_unbind;
+	}
+
 	vif->task = task;
 
 	rtnl_lock();
@@ -433,6 +461,7 @@  int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 	rtnl_unlock();
 
 	wake_up_process(vif->task);
+	wake_up_process(vif->dealloc_task);
 
 	return 0;
 
@@ -470,6 +499,12 @@  void xenvif_disconnect(struct xenvif *vif)
 		vif->task = NULL;
 	}
 
+	if (vif->dealloc_task) {
+		del_timer_sync(&vif->dealloc_delay);
+		kthread_stop(vif->dealloc_task);
+		vif->dealloc_task = NULL;
+	}
+
 	if (vif->tx_irq) {
 		if (vif->tx_irq == vif->rx_irq)
 			unbind_from_irqhandler(vif->tx_irq, vif);
@@ -485,6 +520,22 @@  void xenvif_disconnect(struct xenvif *vif)
 
 void xenvif_free(struct xenvif *vif)
 {
+	int i, unmap_timeout = 0;
+
+	for (i = 0; i < MAX_PENDING_REQS; ++i) {
+		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
+			i = 0;
+			unmap_timeout++;
+			msleep(1000);
+			if (unmap_timeout > 9 &&
+				net_ratelimit())
+				netdev_err(vif->dev,
+					"Page still granted! Index: %x\n", i);
+		}
+	}
+
+	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
+
 	netif_napi_del(&vif->napi);
 
 	unregister_netdev(vif->dev);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 3ddc474..20352be 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -645,9 +645,12 @@  static void xenvif_tx_err(struct xenvif *vif,
 			  struct xen_netif_tx_request *txp, RING_IDX end)
 {
 	RING_IDX cons = vif->tx.req_cons;
+	unsigned long flags;
 
 	do {
+		spin_lock_irqsave(&vif->response_lock, flags);
 		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+		spin_unlock_irqrestore(&vif->response_lock, flags);
 		if (cons == end)
 			break;
 		txp = RING_GET_REQUEST(&vif->tx, cons++);
@@ -786,10 +789,10 @@  static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,
 
 }
 
-static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
+static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 					       struct sk_buff *skb,
 					       struct xen_netif_tx_request *txp,
-					       struct gnttab_copy *gop)
+					       struct gnttab_map_grant_ref *gop)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
@@ -810,83 +813,12 @@  static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
-	/* Coalesce tx requests, at this point the packet passed in
-	 * should be <= 64K. Any packets larger than 64K have been
-	 * handled in xenvif_count_requests().
-	 */
-	for (shinfo->nr_frags = slot = start; slot < nr_slots;
-	     shinfo->nr_frags++) {
-		struct pending_tx_info *pending_tx_info =
-			vif->pending_tx_info;
-
-		page = alloc_page(GFP_ATOMIC|__GFP_COLD);
-		if (!page)
-			goto err;
-
-		dst_offset = 0;
-		first = NULL;
-		while (dst_offset < PAGE_SIZE && slot < nr_slots) {
-			gop->flags = GNTCOPY_source_gref;
-
-			gop->source.u.ref = txp->gref;
-			gop->source.domid = vif->domid;
-			gop->source.offset = txp->offset;
-
-			gop->dest.domid = DOMID_SELF;
-
-			gop->dest.offset = dst_offset;
-			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-
-			if (dst_offset + txp->size > PAGE_SIZE) {
-				/* This page can only merge a portion
-				 * of tx request. Do not increment any
-				 * pointer / counter here. The txp
-				 * will be dealt with in future
-				 * rounds, eventually hitting the
-				 * `else` branch.
-				 */
-				gop->len = PAGE_SIZE - dst_offset;
-				txp->offset += gop->len;
-				txp->size -= gop->len;
-				dst_offset += gop->len; /* quit loop */
-			} else {
-				/* This tx request can be merged in the page */
-				gop->len = txp->size;
-				dst_offset += gop->len;
-
+	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
+	     shinfo->nr_frags++, txp++, gop++) {
 				index = pending_index(vif->pending_cons++);
-
 				pending_idx = vif->pending_ring[index];
-
-				memcpy(&pending_tx_info[pending_idx].req, txp,
-				       sizeof(*txp));
-
-				/* Poison these fields, corresponding
-				 * fields for head tx req will be set
-				 * to correct values after the loop.
-				 */
-				vif->mmap_pages[pending_idx] = (void *)(~0UL);
-				pending_tx_info[pending_idx].head =
-					INVALID_PENDING_RING_IDX;
-
-				if (!first) {
-					first = &pending_tx_info[pending_idx];
-					start_idx = index;
-					head_idx = pending_idx;
-				}
-
-				txp++;
-				slot++;
-			}
-
-			gop++;
-		}
-
-		first->req.offset = 0;
-		first->req.size = dst_offset;
-		first->head = start_idx;
-		vif->mmap_pages[head_idx] = page;
-		frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
+		xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
 	}
 
 	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
@@ -908,9 +840,9 @@  err:
 
 static int xenvif_tx_check_gop(struct xenvif *vif,
 			       struct sk_buff *skb,
-			       struct gnttab_copy **gopp)
+			       struct gnttab_map_grant_ref **gopp)
 {
-	struct gnttab_copy *gop = *gopp;
+	struct gnttab_map_grant_ref *gop = *gopp;
 	u16 pending_idx = *((u16 *)skb->data);
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	struct pending_tx_info *tx_info;
@@ -922,6 +854,16 @@  static int xenvif_tx_check_gop(struct xenvif *vif,
 	err = gop->status;
 	if (unlikely(err))
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
+	else {
+		if (vif->grant_tx_handle[pending_idx] !=
+			NETBACK_INVALID_HANDLE) {
+			netdev_err(vif->dev,
+				"Stale mapped handle! pending_idx %x handle %x\n",
+				pending_idx, vif->grant_tx_handle[pending_idx]);
+			xenvif_fatal_tx_err(vif);
+		}
+		vif->grant_tx_handle[pending_idx] = gop->handle;
+	}
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
@@ -935,18 +877,24 @@  static int xenvif_tx_check_gop(struct xenvif *vif,
 		head = tx_info->head;
 
 		/* Check error status: if okay then remember grant handle. */
-		do {
 			newerr = (++gop)->status;
-			if (newerr)
-				break;
-			peek = vif->pending_ring[pending_index(++head)];
-		} while (!pending_tx_is_head(vif, peek));
 
 		if (likely(!newerr)) {
+			if (vif->grant_tx_handle[pending_idx] !=
+				NETBACK_INVALID_HANDLE) {
+				netdev_err(vif->dev,
+					"Stale mapped handle! pending_idx %x handle %x\n",
+					pending_idx,
+					vif->grant_tx_handle[pending_idx]);
+				xenvif_fatal_tx_err(vif);
+			}
+			vif->grant_tx_handle[pending_idx] = gop->handle;
 			/* Had a previous error? Invalidate this fragment. */
-			if (unlikely(err))
+			if (unlikely(err)) {
+				xenvif_idx_unmap(vif, pending_idx);
 				xenvif_idx_release(vif, pending_idx,
 						   XEN_NETIF_RSP_OKAY);
+			}
 			continue;
 		}
 
@@ -959,9 +907,11 @@  static int xenvif_tx_check_gop(struct xenvif *vif,
 
 		/* First error: invalidate header and preceding fragments. */
 		pending_idx = *((u16 *)skb->data);
+		xenvif_idx_unmap(vif, pending_idx);
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
 		for (j = start; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+			xenvif_idx_unmap(vif, pending_idx);
 			xenvif_idx_release(vif, pending_idx,
 					   XEN_NETIF_RSP_OKAY);
 		}
@@ -974,7 +924,8 @@  static int xenvif_tx_check_gop(struct xenvif *vif,
 	return err;
 }
 
-static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
+static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb,
+		u16 prev_pending_idx)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int nr_frags = shinfo->nr_frags;
@@ -988,6 +939,17 @@  static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 
 		pending_idx = frag_get_pending_idx(frag);
 
+		/* If this is not the first frag, chain it to the previous*/
+		if (unlikely(prev_pending_idx == INVALID_PENDING_IDX))
+			skb_shinfo(skb)->destructor_arg =
+				&vif->pending_tx_info[pending_idx].callback_struct;
+		else if (likely(pending_idx != prev_pending_idx))
+			vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
+				&(vif->pending_tx_info[pending_idx].callback_struct);
+
+		vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
+		prev_pending_idx = pending_idx;
+
 		txp = &vif->pending_tx_info[pending_idx].req;
 		page = virt_to_page(idx_to_kaddr(vif, pending_idx));
 		__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
@@ -995,10 +957,15 @@  static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 		skb->data_len += txp->size;
 		skb->truesize += txp->size;
 
-		/* Take an extra reference to offset xenvif_idx_release */
+		/* Take an extra reference to offset network stack's put_page */
 		get_page(vif->mmap_pages[pending_idx]);
-		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
 	}
+	/* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
+	 * overlaps with "index", and "mapping" is not set. I think mapping
+	 * should be set. If delivered to local stack, it would drop this
+	 * skb in sk_filter unless the socket has the right to use it.
+	 */
+	skb->pfmemalloc	= false;
 }
 
 static int xenvif_get_extras(struct xenvif *vif,
@@ -1367,7 +1334,7 @@  static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
 
 static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 {
-	struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop;
+	struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
 	struct sk_buff *skb;
 	int ret;
 
@@ -1475,30 +1442,10 @@  static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			}
 		}
 
-		/* XXX could copy straight to head */
-		page = xenvif_alloc_page(vif, pending_idx);
-		if (!page) {
-			kfree_skb(skb);
-			xenvif_tx_err(vif, &txreq, idx);
-			break;
-		}
-
-		gop->source.u.ref = txreq.gref;
-		gop->source.domid = vif->domid;
-		gop->source.offset = txreq.offset;
-
-		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-		gop->dest.domid = DOMID_SELF;
-		gop->dest.offset = txreq.offset;
-
-		gop->len = txreq.size;
-		gop->flags = GNTCOPY_source_gref;
+		xenvif_tx_create_gop(vif, pending_idx, &txreq, gop);
 
 		gop++;
 
-		memcpy(&vif->pending_tx_info[pending_idx].req,
-		       &txreq, sizeof(txreq));
-		vif->pending_tx_info[pending_idx].head = index;
 		*((u16 *)skb->data) = pending_idx;
 
 		__skb_put(skb, data_len);
@@ -1527,17 +1474,17 @@  static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 
 		vif->tx.req_cons = idx;
 
-		if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops))
+		if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops))
 			break;
 	}
 
-	return gop - vif->tx_copy_ops;
+	return gop - vif->tx_map_ops;
 }
 
 
 static int xenvif_tx_submit(struct xenvif *vif)
 {
-	struct gnttab_copy *gop = vif->tx_copy_ops;
+	struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
 	struct sk_buff *skb;
 	int work_done = 0;
 
@@ -1561,12 +1508,17 @@  static int xenvif_tx_submit(struct xenvif *vif)
 		memcpy(skb->data,
 		       (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
 		       data_len);
+		vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
 		if (data_len < txp->size) {
 			/* Append the packet payload as a fragment. */
 			txp->offset += data_len;
 			txp->size -= data_len;
+			skb_shinfo(skb)->destructor_arg =
+				&vif->pending_tx_info[pending_idx].callback_struct;
 		} else {
 			/* Schedule a response immediately. */
+			skb_shinfo(skb)->destructor_arg = NULL;
+			xenvif_idx_unmap(vif, pending_idx);
 			xenvif_idx_release(vif, pending_idx,
 					   XEN_NETIF_RSP_OKAY);
 		}
@@ -1576,7 +1528,11 @@  static int xenvif_tx_submit(struct xenvif *vif)
 		else if (txp->flags & XEN_NETTXF_data_validated)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-		xenvif_fill_frags(vif, skb);
+		xenvif_fill_frags(vif,
+			skb,
+			skb_shinfo(skb)->destructor_arg ?
+					pending_idx :
+					INVALID_PENDING_IDX);
 
 		if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
 			int target = min_t(int, skb->len, PKT_PROT_LEN);
@@ -1590,6 +1546,8 @@  static int xenvif_tx_submit(struct xenvif *vif)
 		if (checksum_setup(vif, skb)) {
 			netdev_dbg(vif->dev,
 				   "Can't setup checksum in net_tx_action\n");
+			if (skb_shinfo(skb)->destructor_arg)
+				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 			kfree_skb(skb);
 			continue;
 		}
@@ -1601,6 +1559,14 @@  static int xenvif_tx_submit(struct xenvif *vif)
 
 		work_done++;
 
+		/* Set this flag right before netif_receive_skb, otherwise
+		 * someone might think this packet already left netback, and
+		 * do a skb_copy_ubufs while we are still in control of the
+		 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
+		 */
+		if (skb_shinfo(skb)->destructor_arg)
+			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+
 		netif_receive_skb(skb);
 	}
 
@@ -1711,7 +1677,7 @@  static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
 int xenvif_tx_action(struct xenvif *vif, int budget)
 {
 	unsigned nr_gops;
-	int work_done;
+	int work_done, ret;
 
 	if (unlikely(!tx_work_todo(vif)))
 		return 0;
@@ -1721,7 +1687,13 @@  int xenvif_tx_action(struct xenvif *vif, int budget)
 	if (nr_gops == 0)
 		return 0;
 
-	gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
+	if (nr_gops) {
+		ret = gnttab_map_refs(vif->tx_map_ops,
+			NULL,
+			vif->pages_to_map,
+			nr_gops);
+		BUG_ON(ret);
+	}
 
 	work_done = xenvif_tx_submit(vif);
 
@@ -1732,61 +1704,37 @@  static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 			       u8 status)
 {
 	struct pending_tx_info *pending_tx_info;
-	pending_ring_idx_t head;
+	pending_ring_idx_t index;
 	u16 peek; /* peek into next tx request */
+	unsigned long flags;
 
-	BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL));
-
-	/* Already complete? */
-	if (vif->mmap_pages[pending_idx] == NULL)
-		return;
-
-	pending_tx_info = &vif->pending_tx_info[pending_idx];
-
-	head = pending_tx_info->head;
-
-	BUG_ON(!pending_tx_is_head(vif, head));
-	BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx);
-
-	do {
-		pending_ring_idx_t index;
-		pending_ring_idx_t idx = pending_index(head);
-		u16 info_idx = vif->pending_ring[idx];
-
-		pending_tx_info = &vif->pending_tx_info[info_idx];
+		pending_tx_info = &vif->pending_tx_info[pending_idx];
+		spin_lock_irqsave(&vif->response_lock, flags);
 		make_tx_response(vif, &pending_tx_info->req, status);
-
-		/* Setting any number other than
-		 * INVALID_PENDING_RING_IDX indicates this slot is
-		 * starting a new packet / ending a previous packet.
-		 */
-		pending_tx_info->head = 0;
-
-		index = pending_index(vif->pending_prod++);
-		vif->pending_ring[index] = vif->pending_ring[info_idx];
-
-		peek = vif->pending_ring[pending_index(++head)];
-
-	} while (!pending_tx_is_head(vif, peek));
-
-	put_page(vif->mmap_pages[pending_idx]);
-	vif->mmap_pages[pending_idx] = NULL;
+		index = pending_index(vif->pending_prod);
+		vif->pending_ring[index] = pending_idx;
+		/* TX shouldn't use the index before we give it back here */
+		mb();
+		vif->pending_prod++;
+		spin_unlock_irqrestore(&vif->response_lock, flags);
 }
 
 void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
 {
 	int ret;
+	struct gnttab_unmap_grant_ref tx_unmap_op;
+
 	if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
 		netdev_err(vif->dev,
 				"Trying to unmap invalid handle! pending_idx: %x\n",
 				pending_idx);
 		return;
 	}
-	gnttab_set_unmap_op(&vif->tx_unmap_ops[0],
+	gnttab_set_unmap_op(&tx_unmap_op,
 			idx_to_kaddr(vif, pending_idx),
 			GNTMAP_host_map,
 			vif->grant_tx_handle[pending_idx]);
-	ret = gnttab_unmap_refs(vif->tx_unmap_ops,
+	ret = gnttab_unmap_refs(&tx_unmap_op,
 			NULL,
 			&vif->mmap_pages[pending_idx],
 			1);
@@ -1845,7 +1793,6 @@  static inline int rx_work_todo(struct xenvif *vif)
 
 static inline int tx_work_todo(struct xenvif *vif)
 {
-
 	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) &&
 	    (nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
 	     < MAX_PENDING_REQS))