[net-next,v5,1/9] xen-netback: Introduce TX grant map definitions

Message ID 1390253069-25507-2-git-send-email-zoltan.kiss@citrix.com
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Zoltan Kiss Jan. 20, 2014, 9:24 p.m. UTC
This patch contains the new definitions necessary for grant mapping.

v2:
- move unmapping to separate thread. The NAPI instance has to be scheduled
  even from thread context, which can cause huge delays
- unfortunately that results in a bigger struct xenvif
- store grant handle after checking validity

v3:
- fix comment in xenvif_tx_dealloc_action()
- call unmap hypercall directly instead of gnttab_unmap_refs(), which does
  unnecessary m2p_override. Also remove pages_to_[un]map members
- BUG() if grant_tx_handle corrupted

v4:
- fix indentations and comments
- use bool for tx_dealloc_work_todo
- BUG() if grant_tx_handle corrupted - now really :)
- go back to gnttab_unmap_refs, now we rely on API changes

v5:
- remove hypercall from xenvif_idx_unmap
- remove stray line in xenvif_tx_create_gop
- simplify tx_dealloc_work_todo
- BUG() in xenvif_idx_unmap

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>

---
 drivers/net/xen-netback/common.h    |   30 ++++++-
 drivers/net/xen-netback/interface.c |    1 +
 drivers/net/xen-netback/netback.c   |  161 +++++++++++++++++++++++++++++++++++
 3 files changed, 191 insertions(+), 1 deletion(-)


Comments

Ian Campbell Feb. 18, 2014, 5:06 p.m. UTC | #1
On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> This patch contains the new definitions necessary for grant mapping.

Is this just adding a bunch of (currently) unused functions? That's a
slightly odd way to structure a series. They don't seem to be "generic
helpers" or anything so it would be more normal to introduce these as
they get used -- it's a bit hard to review them out of context.

> v2:

This sort of intraversion changelog should go after the S-o-b and a
"---" marker. This way they are not included in the final commit
message.

[...]
> Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---

v2: Blah blah

v3: Etc etc


> @@ -226,6 +248,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
>  
>  void xenvif_stop_queue(struct xenvif *vif);
>  
> +/* Callback from stack when TX packet can be released */
> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
> +
> +/* Unmap a pending page, usually has to be called before xenvif_idx_release */

"usually" or always? How does one determine when it is or isn't
appropriate to call it later?

> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
> +
>  extern bool separate_tx_rx_irq;
>  
>  #endif /* __XEN_NETBACK__COMMON_H__ */
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index 7669d49..f0f0c3d 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -38,6 +38,7 @@
>  
>  #include <xen/events.h>
>  #include <asm/xen/hypercall.h>
> +#include <xen/balloon.h>

What is this for?
 
>  #define XENVIF_QUEUE_LENGTH 32
>  #define XENVIF_NAPI_WEIGHT  64
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index bb241d0..195602f 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -773,6 +773,20 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
>  	return page;
>  }
>  
> +static inline void xenvif_tx_create_gop(struct xenvif *vif,
> +					u16 pending_idx,
> +					struct xen_netif_tx_request *txp,
> +					struct gnttab_map_grant_ref *gop)
> +{
> +	vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
> +	gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
> +			  GNTMAP_host_map | GNTMAP_readonly,
> +			  txp->gref, vif->domid);
> +
> +	memcpy(&vif->pending_tx_info[pending_idx].req, txp,
> +	       sizeof(*txp));

Can this not go in xenvif_tx_build_gops? Or conversely should the
non-mapping code there be factored out?

Given the presence of both kinds of gop the name of this function needs
to be more specific I think.

> +}
> +
>  static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
>  					       struct sk_buff *skb,
>  					       struct xen_netif_tx_request *txp,
> @@ -1612,6 +1626,107 @@ static int xenvif_tx_submit(struct xenvif *vif)
>  	return work_done;
>  }
>  
> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
> +{
> +	unsigned long flags;
> +	pending_ring_idx_t index;
> +	u16 pending_idx = ubuf->desc;
> +	struct pending_tx_info *temp =
> +		container_of(ubuf, struct pending_tx_info, callback_struct);
> +	struct xenvif *vif = container_of(temp - pending_idx,

This is subtracting a u16 from a pointer?

> +					  struct xenvif,
> +					  pending_tx_info[0]);
> +
> +	spin_lock_irqsave(&vif->dealloc_lock, flags);
> +	do {
> +		pending_idx = ubuf->desc;
> +		ubuf = (struct ubuf_info *) ubuf->ctx;
> +		index = pending_index(vif->dealloc_prod);
> +		vif->dealloc_ring[index] = pending_idx;
> +		/* Sync with xenvif_tx_dealloc_action:
> +		 * insert idx then incr producer.
> +		 */
> +		smp_wmb();

Is this really needed given that there is a lock held?

Or what is dealloc_lock protecting against?

> +		vif->dealloc_prod++;

What happens if the dealloc ring becomes full, will this wrap and cause
havoc?

> +	} while (ubuf);
> +	wake_up(&vif->dealloc_wq);
> +	spin_unlock_irqrestore(&vif->dealloc_lock, flags);
> +}
> +
> +static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
> +{
> +	struct gnttab_unmap_grant_ref *gop;
> +	pending_ring_idx_t dc, dp;
> +	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
> +	unsigned int i = 0;
> +
> +	dc = vif->dealloc_cons;
> +	gop = vif->tx_unmap_ops;
> +
> +	/* Free up any grants we have finished using */
> +	do {
> +		dp = vif->dealloc_prod;
> +
> +		/* Ensure we see all indices enqueued by all
> +		 * xenvif_zerocopy_callback().
> +		 */
> +		smp_rmb();
> +
> +		while (dc != dp) {
> +			pending_idx =
> +				vif->dealloc_ring[pending_index(dc++)];
> +
> +			/* Already unmapped? */
> +			if (vif->grant_tx_handle[pending_idx] ==
> +				NETBACK_INVALID_HANDLE) {
> +				netdev_err(vif->dev,
> +					   "Trying to unmap invalid handle! "
> +					   "pending_idx: %x\n", pending_idx);
> +				BUG();
> +			}
> +
> +			pending_idx_release[gop-vif->tx_unmap_ops] =
> +				pending_idx;
> +			vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
> +				vif->mmap_pages[pending_idx];
> +			gnttab_set_unmap_op(gop,
> +					    idx_to_kaddr(vif, pending_idx),
> +					    GNTMAP_host_map,
> +					    vif->grant_tx_handle[pending_idx]);
> +			vif->grant_tx_handle[pending_idx] =
> +				NETBACK_INVALID_HANDLE;
> +			++gop;

Can we run out of space in the gop array?

> +		}
> +
> +	} while (dp != vif->dealloc_prod);
> +
> +	vif->dealloc_cons = dc;

No barrier here?

> +	if (gop - vif->tx_unmap_ops > 0) {
> +		int ret;
> +		ret = gnttab_unmap_refs(vif->tx_unmap_ops,
> +					vif->pages_to_unmap,
> +					gop - vif->tx_unmap_ops);
> +		if (ret) {
> +			netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
> +				   gop - vif->tx_unmap_ops, ret);
> +			for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {

This seems liable to be a lot of spew on failure. Perhaps only log the
ones where gop[i].status != success.

Have you considered whether or not the frontend can force this error to
occur?

> +				netdev_err(vif->dev,
> +					   " host_addr: %llx handle: %x status: %d\n",
> +					   gop[i].host_addr,
> +					   gop[i].handle,
> +					   gop[i].status);
> +			}
> +			BUG();
> +		}
> +	}
> +
> +	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
> +		xenvif_idx_release(vif, pending_idx_release[i],
> +				   XEN_NETIF_RSP_OKAY);
> +}
> +
> +
>  /* Called after netfront has transmitted */
>  int xenvif_tx_action(struct xenvif *vif, int budget)
>  {
> @@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
>  	vif->mmap_pages[pending_idx] = NULL;
>  }
>  
> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)

This is a single shot version of the batched xenvif_tx_dealloc_action
version? Why not just enqueue the idx to be unmapped later?

> +{
> +	int ret;
> +	struct gnttab_unmap_grant_ref tx_unmap_op;
> +
> +	if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
> +		netdev_err(vif->dev,
> +			   "Trying to unmap invalid handle! pending_idx: %x\n",
> +			   pending_idx);
> +		BUG();
> +	}
> +	gnttab_set_unmap_op(&tx_unmap_op,
> +			    idx_to_kaddr(vif, pending_idx),
> +			    GNTMAP_host_map,
> +			    vif->grant_tx_handle[pending_idx]);
> +	ret = gnttab_unmap_refs(&tx_unmap_op, &vif->mmap_pages[pending_idx], 1);
> +	BUG_ON(ret);
> +	vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
> +}
>  
>  static void make_tx_response(struct xenvif *vif,
>  			     struct xen_netif_tx_request *txp,
> @@ -1740,6 +1874,11 @@ static inline int tx_work_todo(struct xenvif *vif)
>  	return 0;
>  }
>  
> +static inline bool tx_dealloc_work_todo(struct xenvif *vif)
> +{
> +	return vif->dealloc_cons != vif->dealloc_prod;
> +}
> +
>  void xenvif_unmap_frontend_rings(struct xenvif *vif)
>  {
>  	if (vif->tx.sring)
> @@ -1826,6 +1965,28 @@ int xenvif_kthread(void *data)
>  	return 0;
>  }
>  
> +int xenvif_dealloc_kthread(void *data)

Is this going to be a thread per vif?

> +{
> +	struct xenvif *vif = data;
> +
> +	while (!kthread_should_stop()) {
> +		wait_event_interruptible(vif->dealloc_wq,
> +					 tx_dealloc_work_todo(vif) ||
> +					 kthread_should_stop());
> +		if (kthread_should_stop())
> +			break;
> +
> +		xenvif_tx_dealloc_action(vif);
> +		cond_resched();
> +	}
> +
> +	/* Unmap anything remaining */
> +	if (tx_dealloc_work_todo(vif))
> +		xenvif_tx_dealloc_action(vif);
> +
> +	return 0;
> +}
> +
>  static int __init netback_init(void)
>  {
>  	int rc = 0;


Ian Campbell Feb. 18, 2014, 5:24 p.m. UTC | #2
On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> 
> +       spinlock_t dealloc_lock;
> +       spinlock_t response_lock; 

Please add comments to both of these describing what bits of the
datastructure they are locking.

You might find it is clearer to group the locks and the things they
protect together rather than grouping the locks together.

Ian.

Zoltan Kiss Feb. 18, 2014, 8:36 p.m. UTC | #3
On 18/02/14 17:06, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>> This patch contains the new definitions necessary for grant mapping.
>
> Is this just adding a bunch of (currently) unused functions? That's a
> slightly odd way to structure a series. They don't seem to be "generic
> helpers" or anything so it would be more normal to introduce these as
> they get used -- it's a bit hard to review them out of context.
I've created two patches because they are quite huge even now, 
separately. Together they would be a ~500 line change. That was the best 
I could figure out, keeping in mind that bisect should work. But as I 
wrote in the first email, I welcome other suggestions. If you and Wei 
prefer these two patches as one big one, I'll merge them in the next version.

>> v2:
>
> This sort of intraversion changelog should go after the S-o-b and a
> "---" marker. This way they are not included in the final commit
> message.
Ok, I'll do that.

>> @@ -226,6 +248,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
>>
>>   void xenvif_stop_queue(struct xenvif *vif);
>>
>> +/* Callback from stack when TX packet can be released */
>> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
>> +
>> +/* Unmap a pending page, usually has to be called before xenvif_idx_release */
>
> "usually" or always? How does one determine when it is or isn't
> appropriate to call it later?
If you haven't unmapped it before, then you have to call it. I'll 
clarify the comment.
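
Something like this, perhaps:

	/* Unmap a pending page; it has to be called before
	 * xenvif_idx_release, unless the slot was already unmapped.
	 */
	void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);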


>> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
>> index 7669d49..f0f0c3d 100644
>> --- a/drivers/net/xen-netback/interface.c
>> +++ b/drivers/net/xen-netback/interface.c
>> @@ -38,6 +38,7 @@
>>
>>   #include <xen/events.h>
>>   #include <asm/xen/hypercall.h>
>> +#include <xen/balloon.h>
>
> What is this for?
For alloc/free_xenballooned_pages
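
In the follow-up patch mmap_pages gets backed by ballooned pages, 
roughly like this (just a sketch, assuming the current balloon API):

	/* at vif setup: back the pending slots with ballooned pages */
	err = alloc_xenballooned_pages(MAX_PENDING_REQS,
				       vif->mmap_pages,
				       false);

	/* at vif teardown: give the pages back to the balloon */
	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);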

>> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
>> index bb241d0..195602f 100644
>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -773,6 +773,20 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
>>   	return page;
>>   }
>>
>> +static inline void xenvif_tx_create_gop(struct xenvif *vif,
>> +					u16 pending_idx,
>> +					struct xen_netif_tx_request *txp,
>> +					struct gnttab_map_grant_ref *gop)
>> +{
>> +	vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
>> +	gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
>> +			  GNTMAP_host_map | GNTMAP_readonly,
>> +			  txp->gref, vif->domid);
>> +
>> +	memcpy(&vif->pending_tx_info[pending_idx].req, txp,
>> +	       sizeof(*txp));
>
> Can this not go in xenvif_tx_build_gops? Or conversely should the
> non-mapping code there be factored out?
>
> Given the presence of both kinds of gop the name of this function needs
> to be more specific I think.
It is called from tx_build_gop and get_requests, and the non-mapping 
code will go away. I have a patch on top of this series which does grant 
copy for the header part, but it doesn't create a separate function for 
the single copy operation, and you'll still call this function from 
build_gops to handle the rest of the first slot (if any).
So TX will have only one kind of gop.

>
>> +}
>> +
>>   static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
>>   					       struct sk_buff *skb,
>>   					       struct xen_netif_tx_request *txp,
>> @@ -1612,6 +1626,107 @@ static int xenvif_tx_submit(struct xenvif *vif)
>>   	return work_done;
>>   }
>>
>> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
>> +{
>> +	unsigned long flags;
>> +	pending_ring_idx_t index;
>> +	u16 pending_idx = ubuf->desc;
>> +	struct pending_tx_info *temp =
>> +		container_of(ubuf, struct pending_tx_info, callback_struct);
>> +	struct xenvif *vif = container_of(temp - pending_idx,
>
> This is subtracting a u16 from a pointer?
Yes. I moved this to a ubuf_to_vif helper for the next version of the 
patch series.
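
Roughly like this (a sketch of what the helper will look like):

	static inline struct xenvif *ubuf_to_vif(struct ubuf_info *ubuf)
	{
		u16 pending_idx = ubuf->desc;
		struct pending_tx_info *temp =
			container_of(ubuf, struct pending_tx_info,
				     callback_struct);

		/* temp == &vif->pending_tx_info[pending_idx], so stepping
		 * back pending_idx elements yields &vif->pending_tx_info[0]
		 */
		return container_of(temp - pending_idx,
				    struct xenvif,
				    pending_tx_info[0]);
	}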

>
>> +					  struct xenvif,
>> +					  pending_tx_info[0]);
>> +
>> +	spin_lock_irqsave(&vif->dealloc_lock, flags);
>> +	do {
>> +		pending_idx = ubuf->desc;
>> +		ubuf = (struct ubuf_info *) ubuf->ctx;
>> +		index = pending_index(vif->dealloc_prod);
>> +		vif->dealloc_ring[index] = pending_idx;
>> +		/* Sync with xenvif_tx_dealloc_action:
>> +		 * insert idx then incr producer.
>> +		 */
>> +		smp_wmb();
>
> Is this really needed given that there is a lock held?
Yes, as the comment right above explains. This actually comes from 
the classic kernel's netif_idx_release.
>
> Or what is dealloc_lock protecting against?
The callbacks from each other. So it is only taken in this function.
>
>> +		vif->dealloc_prod++;
>
> What happens if the dealloc ring becomes full, will this wrap and cause
> havoc?
Nope, if the dealloc ring is full, the value of the last increment won't 
be used to index the dealloc ring again until some space is made available. 
Of course, if something broke and we have more pending slots than tx ring 
or dealloc slots, then it can happen. Do you suggest a 
BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= MAX_PENDING_REQS)?

>
>> +	} while (ubuf);
>> +	wake_up(&vif->dealloc_wq);
>> +	spin_unlock_irqrestore(&vif->dealloc_lock, flags);
>> +}
>> +
>> +static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
>> +{
>> +	struct gnttab_unmap_grant_ref *gop;
>> +	pending_ring_idx_t dc, dp;
>> +	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
>> +	unsigned int i = 0;
>> +
>> +	dc = vif->dealloc_cons;
>> +	gop = vif->tx_unmap_ops;
>> +
>> +	/* Free up any grants we have finished using */
>> +	do {
>> +		dp = vif->dealloc_prod;
>> +
>> +		/* Ensure we see all indices enqueued by all
>> +		 * xenvif_zerocopy_callback().
>> +		 */
>> +		smp_rmb();
>> +
>> +		while (dc != dp) {
>> +			pending_idx =
>> +				vif->dealloc_ring[pending_index(dc++)];
>> +
>> +			/* Already unmapped? */
>> +			if (vif->grant_tx_handle[pending_idx] ==
>> +				NETBACK_INVALID_HANDLE) {
>> +				netdev_err(vif->dev,
>> +					   "Trying to unmap invalid handle! "
>> +					   "pending_idx: %x\n", pending_idx);
>> +				BUG();
>> +			}
>> +
>> +			pending_idx_release[gop-vif->tx_unmap_ops] =
>> +				pending_idx;
>> +			vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
>> +				vif->mmap_pages[pending_idx];
>> +			gnttab_set_unmap_op(gop,
>> +					    idx_to_kaddr(vif, pending_idx),
>> +					    GNTMAP_host_map,
>> +					    vif->grant_tx_handle[pending_idx]);
>> +			vif->grant_tx_handle[pending_idx] =
>> +				NETBACK_INVALID_HANDLE;
>> +			++gop;
>
> Can we run out of space in the gop array?
No, unless the same thing happens as in my previous answer. BUG_ON() here 
as well?
>
>> +		}
>> +
>> +	} while (dp != vif->dealloc_prod);
>> +
>> +	vif->dealloc_cons = dc;
>
> No barrier here?
dealloc_cons is only used in the dealloc thread. dealloc_prod is used by 
the callback and the thread as well; that's why we need the mb() above. 
Btw. this function comes from classic's net_tx_action_dealloc.

>
>> +	if (gop - vif->tx_unmap_ops > 0) {
>> +		int ret;
>> +		ret = gnttab_unmap_refs(vif->tx_unmap_ops,
>> +					vif->pages_to_unmap,
>> +					gop - vif->tx_unmap_ops);
>> +		if (ret) {
>> +			netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
>> +				   gop - vif->tx_unmap_ops, ret);
>> +			for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
>
> This seems liable to be a lot of spew on failure. Perhaps only log the
> ones where gop[i].status != success.
Ok, I'll change that.
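I.e. something like this (sketch; note that gop points past the last
op at this point, so the entries have to be indexed from the start of
the array):

	for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
		/* only spew the entries that actually failed */
		if (vif->tx_unmap_ops[i].status != GNTST_okay)
			netdev_err(vif->dev,
				   " host_addr: %llx handle: %x status: %d\n",
				   vif->tx_unmap_ops[i].host_addr,
				   vif->tx_unmap_ops[i].handle,
				   vif->tx_unmap_ops[i].status);
	}
	BUG();
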
>
> Have you considered whether or not the frontend can force this error to
> occur?
Not yet, good point. I guess if we successfully mapped the page, then 
there is no way for the frontend to prevent unmapping. But it's worth 
further checking.
>
>> +				netdev_err(vif->dev,
>> +					   " host_addr: %llx handle: %x status: %d\n",
>> +					   gop[i].host_addr,
>> +					   gop[i].handle,
>> +					   gop[i].status);
>> +			}
>> +			BUG();
>> +		}
>> +	}
>> +
>> +	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
>> +		xenvif_idx_release(vif, pending_idx_release[i],
>> +				   XEN_NETIF_RSP_OKAY);
>> +}
>> +
>> +
>>   /* Called after netfront has transmitted */
>>   int xenvif_tx_action(struct xenvif *vif, int budget)
>>   {
>> @@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
>>   	vif->mmap_pages[pending_idx] = NULL;
>>   }
>>
>> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
>
> This is a single shot version of the batched xenvif_tx_dealloc_action
> version? Why not just enqueue the idx to be unmapped later?
This is called only from the NAPI instance. Using the dealloc ring 
would require synchronization with the callback, which can increase lock 
contention. On the other hand, if the guest sends small packets 
(<PAGE_SIZE), the TLB flushing can cause a performance penalty. The 
above mentioned upcoming patch, which grant copies the header, can 
prevent that (together with Malcolm's Xen side patch, which avoids the 
TLB flush if the page was not touched in Dom0).

>> @@ -1826,6 +1965,28 @@ int xenvif_kthread(void *data)
>>   	return 0;
>>   }
>>
>> +int xenvif_dealloc_kthread(void *data)
>
> Is this going to be a thread per vif?
Yes. In the first versions I put the dealloc in the NAPI instance 
(similarly to classic, where it happened in tx_action), but that had 
an unexpected performance penalty: the callback has to notify whoever 
does the dealloc that there is something to do. If it is the NAPI 
instance, it has to call napi_schedule. But if the packet was delivered 
to another guest, the callback is called from thread context, and 
according to Eric Dumazet, napi_schedule from thread context can 
significantly delay softirq handling. So the NAPI instance was delayed 
by milliseconds, and it caused terrible performance.
Moving this to the RX thread didn't seem like a wise decision, so I 
made a new thread.
Actually in the next version of the patches I'll reintroduce 
__napi_schedule in the callback again, because if the NAPI instance 
still has unconsumed requests but not enough pending slots, it 
deschedules itself, and the callback has to schedule it again (see 
the sketch below), if:
- unconsumed requests in the ring < XEN_NETBK_LEGACY_SLOTS_MAX
- there are enough free pending slots to handle them
- and the NAPI instance is not scheduled yet
This should only really happen if netback is faster than the target 
devices, but then it doesn't mean a bottleneck.
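
At the end of the callback that would look something like this (a 
sketch only, not part of this patch; xenvif_tx_pending_slots_available() 
is an assumed name for a helper checking the first two conditions):

	/* Reschedule the NAPI instance if it descheduled itself for
	 * lack of pending slots; napi_schedule() bails out on its own
	 * if the instance is already scheduled
	 */
	if (xenvif_tx_pending_slots_available(vif))
		napi_schedule(&vif->napi);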

Zoli

Ian Campbell Feb. 19, 2014, 10:05 a.m. UTC | #4
On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
> On 18/02/14 17:06, Ian Campbell wrote:
> > On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> >> This patch contains the new definitions necessary for grant mapping.
> >
> > Is this just adding a bunch of (currently) unused functions? That's a
> > slightly odd way to structure a series. They don't seem to be "generic
> > helpers" or anything so it would be more normal to introduce these as
> > they get used -- it's a bit hard to review them out of context.
> I've created two patches because they are quite huge even now, 
> separately. Together they would be a ~500 line change. That was the best 
> I could figure out keeping in mind that bisect should work. But as I 
> wrote in the first email, I welcome other suggestions. If you and Wei 
> prefer this two patch in one big one, I merge them in the next version.

I suppose it is hard to split a change like this up in a sensible way,
but it is rather hard to review something which is split in two parts
sensibly.

Is the combined patch too large to fit on the lists?

> >> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> >> index 7669d49..f0f0c3d 100644
> >> --- a/drivers/net/xen-netback/interface.c
> >> +++ b/drivers/net/xen-netback/interface.c
> >> @@ -38,6 +38,7 @@
> >>
> >>   #include <xen/events.h>
> >>   #include <asm/xen/hypercall.h>
> >> +#include <xen/balloon.h>
> >
> > What is this for?
> For alloc/free_xenballooned_pages

I think I was confused because those changes aren't in this patch.

> >
> >> +					  struct xenvif,
> >> +					  pending_tx_info[0]);
> >> +
> >> +	spin_lock_irqsave(&vif->dealloc_lock, flags);
> >> +	do {
> >> +		pending_idx = ubuf->desc;
> >> +		ubuf = (struct ubuf_info *) ubuf->ctx;
> >> +		index = pending_index(vif->dealloc_prod);
> >> +		vif->dealloc_ring[index] = pending_idx;
> >> +		/* Sync with xenvif_tx_dealloc_action:
> >> +		 * insert idx then incr producer.
> >> +		 */
> >> +		smp_wmb();
> >
> > Is this really needed given that there is a lock held?
> Yes, as the comment right above explains.

My question is why you need this sync if you are holding a lock; the
comment doesn't tell me that. I suppose xenvif_tx_dealloc_action doesn't
hold the dealloc_lock, but that is non-obvious from the names.

I think I asked in a subsequent patch for an improved description of the
locking going on here.

>  This actually comes from 
> classic kernel's netif_idx_release
> >
> > Or what is dealloc_lock protecting against?
> The callbacks from each other. So it is only taken in this function.
> >
> >> +		vif->dealloc_prod++;
> >
> > What happens if the dealloc ring becomes full, will this wrap and cause
> > havoc?
> Nope, if the dealloc ring is full, the value of the last increment won't 
> >> be used to index the dealloc ring again until some space is made available.

I don't follow -- what makes this the case?

> Of course if something broke and we have more pending slots than tx ring 
> or dealloc slots then it can happen. Do you suggest a 
> BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= MAX_PENDING_REQS)?

A
         BUG_ON(space in dealloc ring < number of slots needed to dealloc this skb)
would seem to be the right thing, if that really is the invariant the
code is supposed to be implementing.

> >> +	} while (ubuf);
> >> +	wake_up(&vif->dealloc_wq);
> >> +	spin_unlock_irqrestore(&vif->dealloc_lock, flags);
> >> +}
> >> +
> >> +static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
> >> +{
> >> +	struct gnttab_unmap_grant_ref *gop;
> >> +	pending_ring_idx_t dc, dp;
> >> +	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
> >> +	unsigned int i = 0;
> >> +
> >> +	dc = vif->dealloc_cons;
> >> +	gop = vif->tx_unmap_ops;
> >> +
> >> +	/* Free up any grants we have finished using */
> >> +	do {
> >> +		dp = vif->dealloc_prod;
> >> +
> >> +		/* Ensure we see all indices enqueued by all
> >> +		 * xenvif_zerocopy_callback().
> >> +		 */
> >> +		smp_rmb();
> >> +
> >> +		while (dc != dp) {
> >> +			pending_idx =
> >> +				vif->dealloc_ring[pending_index(dc++)];
> >> +
> >> +			/* Already unmapped? */
> >> +			if (vif->grant_tx_handle[pending_idx] ==
> >> +				NETBACK_INVALID_HANDLE) {
> >> +				netdev_err(vif->dev,
> >> +					   "Trying to unmap invalid handle! "
> >> +					   "pending_idx: %x\n", pending_idx);
> >> +				BUG();
> >> +			}
> >> +
> >> +			pending_idx_release[gop-vif->tx_unmap_ops] =
> >> +				pending_idx;
> >> +			vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
> >> +				vif->mmap_pages[pending_idx];
> >> +			gnttab_set_unmap_op(gop,
> >> +					    idx_to_kaddr(vif, pending_idx),
> >> +					    GNTMAP_host_map,
> >> +					    vif->grant_tx_handle[pending_idx]);
> >> +			vif->grant_tx_handle[pending_idx] =
> >> +				NETBACK_INVALID_HANDLE;
> >> +			++gop;
> >
> > Can we run out of space in the gop array?
> No, unless the same thing happens as in my previous answer. BUG_ON() here 
> as well?

Yes, or at the very least a comment explaining how/why gop is bounded
elsewhere.

> >
> >> +		}
> >> +
> >> +	} while (dp != vif->dealloc_prod);
> >> +
> >> +	vif->dealloc_cons = dc;
> >
> > No barrier here?
> dealloc_cons only used in the dealloc_thread. dealloc_prod is used by 
> the callback and the thread as well, that's why we need mb() in 
> previous. Btw. this function comes from classic's net_tx_action_dealloc

Is this code close enough to that code architecturally that you can
infer correctness due to that though?

So long as you have considered the barrier semantics in the context of
the current code and you think it is correct to not have one here then
I'm ok. But if you have just assumed it is OK because some older code
didn't have it then I'll have to ask you to consider it again...

> >> +				netdev_err(vif->dev,
> >> +					   " host_addr: %llx handle: %x status: %d\n",
> >> +					   gop[i].host_addr,
> >> +					   gop[i].handle,
> >> +					   gop[i].status);
> >> +			}
> >> +			BUG();
> >> +		}
> >> +	}
> >> +
> >> +	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
> >> +		xenvif_idx_release(vif, pending_idx_release[i],
> >> +				   XEN_NETIF_RSP_OKAY);
> >> +}
> >> +
> >> +
> >>   /* Called after netfront has transmitted */
> >>   int xenvif_tx_action(struct xenvif *vif, int budget)
> >>   {
> >> @@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
> >>   	vif->mmap_pages[pending_idx] = NULL;
> >>   }
> >>
> >> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
> >
> > This is a single shot version of the batched xenvif_tx_dealloc_action
> > version? Why not just enqueue the idx to be unmapped later?
> This is called only from the NAPI instance. Using the dealloc ring 
> require synchronization with the callback which can increase lock 
> contention. On the other hand, if the guest sends small packets 
> (<PAGE_SIZE), the TLB flushing can cause performance penalty.

Right. When/How often is this called from the NAPI instance?

Is the locking contention from this case so severe that it outweighs
the benefits of batching the unmaps? That would surprise me. After all
the locking contention is there for the zerocopy_callback case too.

>  The above 
> mentioned upcoming patch which gntcopy the header can prevent that 

So this is only called when doing the pull-up to the linear area?

Ian.

Zoltan Kiss Feb. 19, 2014, 7:19 p.m. UTC | #5
On 18/02/14 17:24, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>
>> +       spinlock_t dealloc_lock;
>> +       spinlock_t response_lock;
>
> Please add comments to both of these describing what bits of the
> datastructure they are locking.
>
> You might find it is clearer to group the locks and the things they
> protect together rather than grouping the locks together.

Ok, I'll give more description here. The response_lock is actually quite 
relevant here, but indeed that's not obvious; I'll explain that as 
well.
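
Roughly along these lines (exact wording to be settled):

	/* This prevents zerocopy callbacks racing with each other
	 * while queuing slots onto the dealloc ring
	 */
	spinlock_t dealloc_lock;

	/* This prevents the dealloc thread and the NAPI instance
	 * racing over pending ring slot release and response creation
	 */
	spinlock_t response_lock;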

Zoli

Zoltan Kiss Feb. 19, 2014, 7:54 p.m. UTC | #6
On 19/02/14 10:05, Ian Campbell wrote:
> On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
>> On 18/02/14 17:06, Ian Campbell wrote:
>>> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>>> This patch contains the new definitions necessary for grant mapping.
>>>
>>> Is this just adding a bunch of (currently) unused functions? That's a
>>> slightly odd way to structure a series. They don't seem to be "generic
>>> helpers" or anything so it would be more normal to introduce these as
>>> they get used -- it's a bit hard to review them out of context.
>> I've created two patches because they are quite huge even now,
>> separately. Together they would be a ~500 line change. That was the best
>> I could figure out keeping in mind that bisect should work. But as I
>> wrote in the first email, I welcome other suggestions. If you and Wei
>> prefer this two patch in one big one, I merge them in the next version.
>
> I suppose it is hard to split a change like this up in a sensible way,
> but it is rather hard to review something which is split in two parts
> sensibly.
>
> Is the combined patch too large to fit on the lists?
Well, it's ca. 30 kb, ~500 lines changed. I guess it's possible. It's up 
to you and Wei; if you would like them to be merged, I can do that.

>>>
>>>> +					  struct xenvif,
>>>> +					  pending_tx_info[0]);
>>>> +
>>>> +	spin_lock_irqsave(&vif->dealloc_lock, flags);
>>>> +	do {
>>>> +		pending_idx = ubuf->desc;
>>>> +		ubuf = (struct ubuf_info *) ubuf->ctx;
>>>> +		index = pending_index(vif->dealloc_prod);
>>>> +		vif->dealloc_ring[index] = pending_idx;
>>>> +		/* Sync with xenvif_tx_dealloc_action:
>>>> +		 * insert idx then incr producer.
>>>> +		 */
>>>> +		smp_wmb();
>>>
>>> Is this really needed given that there is a lock held?
>> Yes, as the comment right above explains.
>
> My question is why do you need this sync if you are holding a lock, the
> comment doesn't tell me that. I suppose xenvif_tx_dealloc_action doesn't
> hold the dealloc_lock, but that is non-obvious from the names.
Ok, I'll clarify that in the comment.

>>>
>>>> +		vif->dealloc_prod++;
>>>
>>> What happens if the dealloc ring becomes full, will this wrap and cause
>>> havoc?
>> Nope, if the dealloc ring is full, the value of the last increment won't
>> be used to index the dealloc ring again until some space is made available.
>
> I don't follow -- what makes this the case?
The dealloc ring has the same size as the pending ring, and you can only 
add slots to it which are already on the pending ring (the pending_idx 
comes from ubuf->desc), as you are essentially freeing up slots here on the 
pending ring.
So if the dealloc ring becomes full, vif->dealloc_prod - 
vif->dealloc_cons will be 256, which would be bad. But the while loop 
should exit here, as we shouldn't have any more pending slots. And if we 
dealloc and create free pending slots in dealloc_action, dealloc_cons 
will also advance.

>> Of course if something broke and we have more pending slots than tx ring
>> or dealloc slots then it can happen. Do you suggest a
>> BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= MAX_PENDING_REQS)?
>
> A
>           BUG_ON(space in dealloc ring < number of slots needed to dealloc this skb)
> would seem to be the right thing, if that really is the invariant the
> code is supposed to be implementing.
Not exactly, it means BUG_ON(number of slots to dealloc > 
MAX_PENDING_REQS), and it should be at the end of the loop, without '='.
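I.e. at the end of the loop in the callback, something like (sketch):

	} while (ubuf);
	/* The dealloc ring is the same size as the pending ring, and a
	 * pending slot can only be queued for dealloc once, so the two
	 * indices can never drift further apart than MAX_PENDING_REQS
	 */
	BUG_ON(vif->dealloc_prod - vif->dealloc_cons > MAX_PENDING_REQS);
	wake_up(&vif->dealloc_wq);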

>>>> +static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
>>>> +{
>>>> +	struct gnttab_unmap_grant_ref *gop;
>>>> +	pending_ring_idx_t dc, dp;
>>>> +	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
>>>> +	unsigned int i = 0;
>>>> +
>>>> +	dc = vif->dealloc_cons;
>>>> +	gop = vif->tx_unmap_ops;
>>>> +
>>>> +	/* Free up any grants we have finished using */
>>>> +	do {
>>>> +		dp = vif->dealloc_prod;
>>>> +
>>>> +		/* Ensure we see all indices enqueued by all
>>>> +		 * xenvif_zerocopy_callback().
>>>> +		 */
>>>> +		smp_rmb();
>>>> +
>>>> +		while (dc != dp) {
>>>> +			pending_idx =
>>>> +				vif->dealloc_ring[pending_index(dc++)];
>>>> +
>>>> +			/* Already unmapped? */
>>>> +			if (vif->grant_tx_handle[pending_idx] ==
>>>> +				NETBACK_INVALID_HANDLE) {
>>>> +				netdev_err(vif->dev,
>>>> +					   "Trying to unmap invalid handle! "
>>>> +					   "pending_idx: %x\n", pending_idx);
>>>> +				BUG();
>>>> +			}
>>>> +
>>>> +			pending_idx_release[gop-vif->tx_unmap_ops] =
>>>> +				pending_idx;
>>>> +			vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
>>>> +				vif->mmap_pages[pending_idx];
>>>> +			gnttab_set_unmap_op(gop,
>>>> +					    idx_to_kaddr(vif, pending_idx),
>>>> +					    GNTMAP_host_map,
>>>> +					    vif->grant_tx_handle[pending_idx]);
>>>> +			vif->grant_tx_handle[pending_idx] =
>>>> +				NETBACK_INVALID_HANDLE;
>>>> +			++gop;
>>>
>>> Can we run out of space in the gop array?
>> No, unless the same thing happens as in my previous answer. BUG_ON() here
>> as well?
>
> Yes, or at the very least a comment explaining how/why gop is bounded
> elsewhere.
Ok, I'll do that.
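E.g. right after the dealloc loop (sketch):

	} while (dp != vif->dealloc_prod);

	vif->dealloc_cons = dc;
	/* gop advanced once per dealloc ring entry, and the dealloc ring
	 * holds at most MAX_PENDING_REQS entries, which is also the size
	 * of the tx_unmap_ops array
	 */
	BUG_ON(gop - vif->tx_unmap_ops > MAX_PENDING_REQS);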

>
>>>
>>>> +		}
>>>> +
>>>> +	} while (dp != vif->dealloc_prod);
>>>> +
>>>> +	vif->dealloc_cons = dc;
>>>
>>> No barrier here?
>> dealloc_cons only used in the dealloc_thread. dealloc_prod is used by
>> the callback and the thread as well, that's why we need mb() in
>> previous. Btw. this function comes from classic's net_tx_action_dealloc
>
> Is this code close enough to that code architecturally that you can
> infer correctness due to that though?
Nope, I've just mentioned it because knowing that old code can help to 
understand this new one, as their logic is very similar in some places, 
like here.

> So long as you have considered the barrier semantics in the context of
> the current code and you think it is correct to not have one here then
> I'm ok. But if you have just assumed it is OK because some older code
> didn't have it then I'll have to ask you to consider it again...
Nope, as I mentioned above, dealloc_cons is only accessed in that 
function, from the same thread. Dealloc_prod is written in the callback 
and read out here; that's why we need the barrier there.

>
>>>> +				netdev_err(vif->dev,
>>>> +					   " host_addr: %llx handle: %x status: %d\n",
>>>> +					   gop[i].host_addr,
>>>> +					   gop[i].handle,
>>>> +					   gop[i].status);
>>>> +			}
>>>> +			BUG();
>>>> +		}
>>>> +	}
>>>> +
>>>> +	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
>>>> +		xenvif_idx_release(vif, pending_idx_release[i],
>>>> +				   XEN_NETIF_RSP_OKAY);
>>>> +}
>>>> +
>>>> +
>>>>    /* Called after netfront has transmitted */
>>>>    int xenvif_tx_action(struct xenvif *vif, int budget)
>>>>    {
>>>> @@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
>>>>    	vif->mmap_pages[pending_idx] = NULL;
>>>>    }
>>>>
>>>> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
>>>
>>> This is a single shot version of the batched xenvif_tx_dealloc_action
>>> version? Why not just enqueue the idx to be unmapped later?
>> This is called only from the NAPI instance. Using the dealloc ring
>> require synchronization with the callback which can increase lock
>> contention. On the other hand, if the guest sends small packets
>> (<PAGE_SIZE), the TLB flushing can cause performance penalty.
>
> Right. When/How often is this called from the NAPI instance?
When a grant mapping error is detected in xenvif_tx_check_gop, and if a 
packet smaller than PKT_PROT_LEN is sent. The latter will be removed if 
we grant copy such packets entirely.

> Is the locking contention from this case so severe that it outweighs 
> the benefits of batching the unmaps? That would surprise me. After all
> the locking contention is there for the zerocopy_callback case too
>
>>   The above
>> mentioned upcoming patch which gntcopy the header can prevent that
>
> So this is only called when doing the pull-up to the linear area?
Yes, as mentioned above.

Zoli
Ian Campbell Feb. 20, 2014, 9:33 a.m. UTC | #7
On Wed, 2014-02-19 at 19:54 +0000, Zoltan Kiss wrote:
> On 19/02/14 10:05, Ian Campbell wrote:
> > On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
> >> On 18/02/14 17:06, Ian Campbell wrote:
> >>> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> >>>> This patch contains the new definitions necessary for grant mapping.
> >>>
> >>> Is this just adding a bunch of (currently) unused functions? That's a
> >>> slightly odd way to structure a series. They don't seem to be "generic
> >>> helpers" or anything so it would be more normal to introduce these as
> >>> they get used -- it's a bit hard to review them out of context.
> >> I've created two patches because they are quite huge even now,
> >> separately. Together they would be a ~500 line change. That was the best
> >> I could figure out keeping in mind that bisect should work. But as I
> >> wrote in the first email, I welcome other suggestions. If you and Wei
> >> prefer this two patch in one big one, I merge them in the next version.
> >
> > I suppose it is hard to split a change like this up in a sensible way,
> > but it is rather hard to review something which is split in two parts
> > sensibly.
> >
> > If the combined patch too large to fit on the lists?
> Well, it's ca. 30 kb, ~500 lines changed. I guess it's possible. It's up 
> to you and Wei, if you would like them to be merged, I can do that.

30kb doesn't sound too bad to me.

Patches #1 and #2 are, respectively:

 drivers/net/xen-netback/common.h    |   30 ++++++-
 drivers/net/xen-netback/interface.c |    1 +
 drivers/net/xen-netback/netback.c   |  161 +++++++++++++++++++++++++++++++++++
 3 files changed, 191 insertions(+), 1 deletion(-)

 drivers/net/xen-netback/interface.c |   63 ++++++++-
 drivers/net/xen-netback/netback.c   |  254 ++++++++++++++---------------------
 2 files changed, 160 insertions(+), 157 deletions(-)

I don't think combining those would be terrible, although I'm willing to
be proven wrong ;-)

> >>>
> >>>> +		vif->dealloc_prod++;
> >>>
> >>> What happens if the dealloc ring becomes full, will this wrap and cause
> >>> havoc?
> >> Nope, if the dealloc ring is full, the value of the last increment won't
> >> be used to index the dealloc ring again until some space is made available.
> >
> > I don't follow -- what makes this the case?
> The dealloc ring has the same size as the pending ring, and you can only 
> add slots to it which are already on the pending ring (the pending_idx 
> comes from ubuf->desc), as you are essentially free up slots here on the 
> pending ring.
> So if the dealloc ring becomes full, vif->dealloc_prod - 
> vif->dealloc_cons will be 256, which would be bad. But the while loop 
> should exit here, as we shouldn't have any more pending slots. And if we 
> dealloc and create free pending slots in dealloc_action, dealloc_cons 
> will also advance.

OK, so this is limited by the size of the pending array, makes sense,
assuming that array is itself correctly guarded...

> >> Of course if something broke and we have more pending slots than tx ring
> >> or dealloc slots then it can happen. Do you suggest a
> >> BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= MAX_PENDING_REQS)?
> >
> > A
> >           BUG_ON(space in dealloc ring < number of slots needed to dealloc this skb)
> > would seem to be the right thing, if that really is the invariant the
> > code is supposed to be implementing.
> Not exactly, it means BUG_ON(number of slots to dealloc > 
> MAX_PENDING_REQS), and it should be at the end of the loop, without '='.

OK.

> >
> >>>
> >>>> +		}
> >>>> +
> >>>> +	} while (dp != vif->dealloc_prod);
> >>>> +
> >>>> +	vif->dealloc_cons = dc;
> >>>
> >>> No barrier here?
> >> dealloc_cons only used in the dealloc_thread. dealloc_prod is used by
> >> the callback and the thread as well, that's why we need mb() in
> >> previous. Btw. this function comes from classic's net_tx_action_dealloc
> >
> > Is this code close enough to that code architecturally that you can
> > infer correctness due to that though?
> Nope, I've just mentioned it because knowing that old code can help to 
> understand this new one, as their logic is very similar in some places, like here.
> 
> > So long as you have considered the barrier semantics in the context of
> > the current code and you think it is correct to not have one here then
> > I'm ok. But if you have just assumed it is OK because some older code
> > didn't have it then I'll have to ask you to consider it again...
> Nope, as I mentioned above, dealloc_cons is only accessed in that function, 
> from the same thread. Dealloc_prod is written in the callback and read 
> out here, that's why we need the barrier there.

OK.

Although this may no longer be true if you added some BUG_ONs as
discussed above?

> 
> >
> >>>> +				netdev_err(vif->dev,
> >>>> +					   " host_addr: %llx handle: %x status: %d\n",
> >>>> +					   gop[i].host_addr,
> >>>> +					   gop[i].handle,
> >>>> +					   gop[i].status);
> >>>> +			}
> >>>> +			BUG();
> >>>> +		}
> >>>> +	}
> >>>> +
> >>>> +	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
> >>>> +		xenvif_idx_release(vif, pending_idx_release[i],
> >>>> +				   XEN_NETIF_RSP_OKAY);
> >>>> +}
> >>>> +
> >>>> +
> >>>>    /* Called after netfront has transmitted */
> >>>>    int xenvif_tx_action(struct xenvif *vif, int budget)
> >>>>    {
> >>>> @@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
> >>>>    	vif->mmap_pages[pending_idx] = NULL;
> >>>>    }
> >>>>
> >>>> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
> >>>
> >>> This is a single shot version of the batched xenvif_tx_dealloc_action
> >>> version? Why not just enqueue the idx to be unmapped later?
> >> This is called only from the NAPI instance. Using the dealloc ring
> >> require synchronization with the callback which can increase lock
> >> contention. On the other hand, if the guest sends small packets
> >> (<PAGE_SIZE), the TLB flushing can cause performance penalty.
> >
> > Right. When/How often is this called from the NAPI instance?
> When grant mapping error detected in xenvif_tx_check_gop, and if a 
> packet smaller than PKT_PROT_LEN is sent. The latter would be removed if 
> we will grant copy such packets entirely.
> 
> > Is the locking contention from this case so severe that it out weighs
> > the benefits of batching the unmaps? That would surprise me. After all
> > the locking contention is there for the zerocopy_callback case too
> >
> >>   The above
> >> mentioned upcoming patch which gntcopy the header can prevent that
> >
> > So this is only called when doing the pull-up to the linear area?
> Yes, as mentioned above.

I'm not sure why you don't just enqueue the dealloc with the other
normal ones though.

Ian.

Wei Liu Feb. 20, 2014, 10:13 a.m. UTC | #8
On Wed, Feb 19, 2014 at 07:54:29PM +0000, Zoltan Kiss wrote:
> On 19/02/14 10:05, Ian Campbell wrote:
> >On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
> >>On 18/02/14 17:06, Ian Campbell wrote:
> >>>On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> >>>>This patch contains the new definitions necessary for grant mapping.
> >>>
> >>>Is this just adding a bunch of (currently) unused functions? That's a
> >>>slightly odd way to structure a series. They don't seem to be "generic
> >>>helpers" or anything so it would be more normal to introduce these as
> >>>they get used -- it's a bit hard to review them out of context.
> >>I've created two patches because they are quite huge even now,
> >>separately. Together they would be a ~500 line change. That was the best
> >>I could figure out keeping in mind that bisect should work. But as I
> >>wrote in the first email, I welcome other suggestions. If you and Wei
> >>prefer this two patch in one big one, I merge them in the next version.
> >
> >I suppose it is hard to split a change like this up in a sensible way,
> >but it is rather hard to review something which is split in two parts
> >sensibly.
> >
> >Is the combined patch too large to fit on the lists?
> Well, it's ca. 30 kb, ~500 lines changed. I guess it's possible.
> It's up to you and Wei, if you would like them to be merged, I can
> do that.
> 

As I said before, my bottom line is "don't break bisection". Do whatever
you want to. :-)

Wei.
Zoltan Kiss Feb. 21, 2014, 1:19 a.m. UTC | #9
On 20/02/14 09:33, Ian Campbell wrote:
> On Wed, 2014-02-19 at 19:54 +0000, Zoltan Kiss wrote:
>> On 19/02/14 10:05, Ian Campbell wrote:
>>> On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
>>>> On 18/02/14 17:06, Ian Campbell wrote:
>>>>> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>>>>> This patch contains the new definitions necessary for grant mapping.
>>>>> Is this just adding a bunch of (currently) unused functions? That's a
>>>>> slightly odd way to structure a series. They don't seem to be "generic
>>>>> helpers" or anything so it would be more normal to introduce these as
>>>>> they get used -- it's a bit hard to review them out of context.
>>>> I've created two patches because they are quite huge even now,
>>>> separately. Together they would be a ~500 line change. That was the best
>>>> I could figure out keeping in mind that bisect should work. But as I
>>>> wrote in the first email, I welcome other suggestions. If you and Wei
>>>> prefer this two patch in one big one, I merge them in the next version.
>>> I suppose it is hard to split a change like this up in a sensible way,
>>> but it is rather hard to review something which is split in two parts
>>> sensibly.
>>>
>>> Is the combined patch too large to fit on the lists?
>> Well, it's ca. 30 kb, ~500 lines changed. I guess it's possible. It's up
>> to you and Wei, if you would like them to be merged, I can do that.
> 30kb doesn't sound too bad to me.
>
> Patches #1 and #2 are, respectively:
>
>   drivers/net/xen-netback/common.h    |   30 ++++++-
>   drivers/net/xen-netback/interface.c |    1 +
>   drivers/net/xen-netback/netback.c   |  161 +++++++++++++++++++++++++++++++++++
>   3 files changed, 191 insertions(+), 1 deletion(-)
>
>   drivers/net/xen-netback/interface.c |   63 ++++++++-
>   drivers/net/xen-netback/netback.c   |  254 ++++++++++++++---------------------
>   2 files changed, 160 insertions(+), 157 deletions(-)
>
> I don't think combining those would be terrible, although I'm willing to
> be proven wrong ;-)
Ok, if no one comes up with a better argument before I send in the next 
version, I'll merge the two patches.
>
>>>>>> +		vif->dealloc_prod++;
>>>>> What happens if the dealloc ring becomes full, will this wrap and cause
>>>>> havoc?
>>>> Nope, if the dealloc ring is full, the value of the last increment won't
>>>> be used to index the dealloc ring again until some space is made available.
>>> I don't follow -- what makes this the case?
>> The dealloc ring has the same size as the pending ring, and you can only
>> add slots to it which are already on the pending ring (the pending_idx
>> comes from ubuf->desc), as you are essentially free up slots here on the
>> pending ring.
>> So if the dealloc ring becomes full, vif->dealloc_prod -
>> vif->dealloc_cons will be 256, which would be bad. But the while loop
>> should exit here, as we shouldn't have any more pending slots. And if we
>> dealloc and create free pending slots in dealloc_action, dealloc_cons
>> will also advance.
> OK, so this is limited by the size of the pending array, makes sense,
> assuming that array is itself correctly guarded...
Well, the pending ring works the same as before; the only difference is 
that now the slots are released from the dealloc thread as well, not 
just from the NAPI instance. That's why we need response_lock. I'll make 
a comment on that.
>>>>>> +		}
>>>>>> +
>>>>>> +	} while (dp != vif->dealloc_prod);
>>>>>> +
>>>>>> +	vif->dealloc_cons = dc;
>>>>> No barrier here?
>>>> dealloc_cons only used in the dealloc_thread. dealloc_prod is used by
>>>> the callback and the thread as well, that's why we need mb() in
>>>> previous. Btw. this function comes from classic's net_tx_action_dealloc
>>> Is this code close enough to that code architecturally that you can
>>> infer correctness due to that though?
>> Nope, I've just mentioned it because knowing that old code can help to
>> understand this new, as their logic is very similar some places, like here.
>>
>>> So long as you have considered the barrier semantics in the context of
>>> the current code and you think it is correct to not have one here then
>>> I'm ok. But if you have just assumed it is OK because some older code
>>> didn't have it then I'll have to ask you to consider it again...
>> Nope, as I mentioned above, dealloc_cons is only accessed in that function,
>> from the same thread. Dealloc_prod is written in the callback and read
>> out here, that's why we need the barrier there.
> OK.
>
> Although this may no longer be true if you added some BUG_ONs as
> discussed above?
Yep, that BUG_ON might see a smaller value of dealloc_cons, but that 
should be OK. We will release those slots after grant unmapping; they 
shouldn't be filled up again until then.
>
>>>>>> +				netdev_err(vif->dev,
>>>>>> +					   " host_addr: %llx handle: %x status: %d\n",
>>>>>> +					   gop[i].host_addr,
>>>>>> +					   gop[i].handle,
>>>>>> +					   gop[i].status);
>>>>>> +			}
>>>>>> +			BUG();
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
>>>>>> +		xenvif_idx_release(vif, pending_idx_release[i],
>>>>>> +				   XEN_NETIF_RSP_OKAY);
>>>>>> +}
>>>>>> +
>>>>>> +
>>>>>>     /* Called after netfront has transmitted */
>>>>>>     int xenvif_tx_action(struct xenvif *vif, int budget)
>>>>>>     {
>>>>>> @@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
>>>>>>     	vif->mmap_pages[pending_idx] = NULL;
>>>>>>     }
>>>>>>
>>>>>> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
>>>>> This is a single shot version of the batched xenvif_tx_dealloc_action
>>>>> version? Why not just enqueue the idx to be unmapped later?
>>>> This is called only from the NAPI instance. Using the dealloc ring
>>>> require synchronization with the callback which can increase lock
>>>> contention. On the other hand, if the guest sends small packets
>>>> (<PAGE_SIZE), the TLB flushing can cause performance penalty.
>>> Right. When/How often is this called from the NAPI instance?
>> When grant mapping error detected in xenvif_tx_check_gop, and if a
>> packet smaller than PKT_PROT_LEN is sent. The latter would be removed if
>> we will grant copy such packets entirely.
>>
>>> Is the locking contention from this case so severe that it outweighs
>>> the benefits of batching the unmaps? That would surprise me. After all
>>> the locking contention is there for the zerocopy_callback case too
>>>
>>>>    The above
>>>> mentioned upcoming patch which gntcopy the header can prevent that
>>> So this is only called when doing the pull-up to the linear area?
>> Yes, as mentioned above.
> I'm not sure why you don't just enqueue the dealloc with the other
> normal ones though.
Well, I started off from this approach, as it maintains similarity with 
the grant copy way of doing this. Historically we release the slots in 
xenvif_tx_check_gop straight away if there is a mapping error in any of 
them. I don't know if the guest expects that slots for the same packet 
come back at the same time. Then I just reused the same function for 
<PKT_PROT_LEN packets instead of writing another one. That will go 
away soon anyway.

Zoli
Ian Campbell Feb. 24, 2014, 11:13 a.m. UTC | #10
On Fri, 2014-02-21 at 01:19 +0000, Zoltan Kiss wrote:
> I don't know if the guest expects that slots for the same packet 
> come back at the same time. 

I don't think the guest is allowed to assume that. In particular they
aren't allowed to assume that the slots will be freed in the order they
were presented on the ring. There used to be a debug patch to
deliberately permute the responses, perhaps it was in the old
netchannel2 tree.

Ian.


Patch

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index ae413a2..66b4696 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -79,6 +79,11 @@  struct pending_tx_info {
 				  * if it is head of one or more tx
 				  * reqs
 				  */
+	/* callback data for released SKBs. The callback is always
+	 * xenvif_zerocopy_callback, ctx points to the next fragment, desc
+	 * contains the pending_idx
+	 */
+	struct ubuf_info callback_struct;
 };
 
 #define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
@@ -108,6 +113,8 @@  struct xenvif_rx_meta {
  */
 #define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)
 
+#define NETBACK_INVALID_HANDLE -1
+
 struct xenvif {
 	/* Unique identifier for this interface. */
 	domid_t          domid;
@@ -126,13 +133,26 @@  struct xenvif {
 	pending_ring_idx_t pending_cons;
 	u16 pending_ring[MAX_PENDING_REQS];
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 
 	/* Coalescing tx requests before copying makes number of grant
 	 * copy ops greater or equal to number of slots required. In
 	 * worst case a tx request consumes 2 gnttab_copy.
 	 */
 	struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
-
+	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+	/* passed to gnttab_[un]map_refs with pages under (un)mapping */
+	struct page *pages_to_map[MAX_PENDING_REQS];
+	struct page *pages_to_unmap[MAX_PENDING_REQS];
+
+	spinlock_t dealloc_lock;
+	spinlock_t response_lock;
+	pending_ring_idx_t dealloc_prod;
+	pending_ring_idx_t dealloc_cons;
+	u16 dealloc_ring[MAX_PENDING_REQS];
+	struct task_struct *dealloc_task;
+	wait_queue_head_t dealloc_wq;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
@@ -219,6 +239,8 @@  int xenvif_tx_action(struct xenvif *vif, int budget);
 int xenvif_kthread(void *data);
 void xenvif_kick_thread(struct xenvif *vif);
 
+int xenvif_dealloc_kthread(void *data);
+
 /* Determine whether the needed number of slots (req) are available,
  * and set req_event if not.
  */
@@ -226,6 +248,12 @@  bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
 
 void xenvif_stop_queue(struct xenvif *vif);
 
+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
+/* Unmap a pending page, usually has to be called before xenvif_idx_release */
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
+
 extern bool separate_tx_rx_irq;
 
 #endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 7669d49..f0f0c3d 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -38,6 +38,7 @@ 
 
 #include <xen/events.h>
 #include <asm/xen/hypercall.h>
+#include <xen/balloon.h>
 
 #define XENVIF_QUEUE_LENGTH 32
 #define XENVIF_NAPI_WEIGHT  64
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index bb241d0..195602f 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -773,6 +773,20 @@  static struct page *xenvif_alloc_page(struct xenvif *vif,
 	return page;
 }
 
+static inline void xenvif_tx_create_gop(struct xenvif *vif,
+					u16 pending_idx,
+					struct xen_netif_tx_request *txp,
+					struct gnttab_map_grant_ref *gop)
+{
+	vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
+	gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
+			  GNTMAP_host_map | GNTMAP_readonly,
+			  txp->gref, vif->domid);
+
+	memcpy(&vif->pending_tx_info[pending_idx].req, txp,
+	       sizeof(*txp));
+}
+
 static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
 					       struct sk_buff *skb,
 					       struct xen_netif_tx_request *txp,
@@ -1612,6 +1626,107 @@  static int xenvif_tx_submit(struct xenvif *vif)
 	return work_done;
 }
 
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
+{
+	unsigned long flags;
+	pending_ring_idx_t index;
+	u16 pending_idx = ubuf->desc;
+	struct pending_tx_info *temp =
+		container_of(ubuf, struct pending_tx_info, callback_struct);
+	struct xenvif *vif = container_of(temp - pending_idx,
+					  struct xenvif,
+					  pending_tx_info[0]);
+
+	spin_lock_irqsave(&vif->dealloc_lock, flags);
+	do {
+		pending_idx = ubuf->desc;
+		ubuf = (struct ubuf_info *) ubuf->ctx;
+		index = pending_index(vif->dealloc_prod);
+		vif->dealloc_ring[index] = pending_idx;
+		/* Sync with xenvif_tx_dealloc_action:
+		 * insert idx then incr producer.
+		 */
+		smp_wmb();
+		vif->dealloc_prod++;
+	} while (ubuf);
+	wake_up(&vif->dealloc_wq);
+	spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+}
+
+static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
+{
+	struct gnttab_unmap_grant_ref *gop;
+	pending_ring_idx_t dc, dp;
+	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
+	unsigned int i = 0;
+
+	dc = vif->dealloc_cons;
+	gop = vif->tx_unmap_ops;
+
+	/* Free up any grants we have finished using */
+	do {
+		dp = vif->dealloc_prod;
+
+		/* Ensure we see all indices enqueued by all
+		 * xenvif_zerocopy_callback().
+		 */
+		smp_rmb();
+
+		while (dc != dp) {
+			pending_idx =
+				vif->dealloc_ring[pending_index(dc++)];
+
+			/* Already unmapped? */
+			if (vif->grant_tx_handle[pending_idx] ==
+				NETBACK_INVALID_HANDLE) {
+				netdev_err(vif->dev,
+					   "Trying to unmap invalid handle! "
+					   "pending_idx: %x\n", pending_idx);
+				BUG();
+			}
+
+			pending_idx_release[gop-vif->tx_unmap_ops] =
+				pending_idx;
+			vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
+				vif->mmap_pages[pending_idx];
+			gnttab_set_unmap_op(gop,
+					    idx_to_kaddr(vif, pending_idx),
+					    GNTMAP_host_map,
+					    vif->grant_tx_handle[pending_idx]);
+			vif->grant_tx_handle[pending_idx] =
+				NETBACK_INVALID_HANDLE;
+			++gop;
+		}
+
+	} while (dp != vif->dealloc_prod);
+
+	vif->dealloc_cons = dc;
+
+	if (gop - vif->tx_unmap_ops > 0) {
+		int ret;
+		ret = gnttab_unmap_refs(vif->tx_unmap_ops,
+					vif->pages_to_unmap,
+					gop - vif->tx_unmap_ops);
+		if (ret) {
+			netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
+				   gop - vif->tx_unmap_ops, ret);
+			for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
+				netdev_err(vif->dev,
+					   " host_addr: %llx handle: %x status: %d\n",
+					   gop[i].host_addr,
+					   gop[i].handle,
+					   gop[i].status);
+			}
+			BUG();
+		}
+	}
+
+	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
+		xenvif_idx_release(vif, pending_idx_release[i],
+				   XEN_NETIF_RSP_OKAY);
+}
+
+
 /* Called after netfront has transmitted */
 int xenvif_tx_action(struct xenvif *vif, int budget)
 {
@@ -1678,6 +1793,25 @@  static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 	vif->mmap_pages[pending_idx] = NULL;
 }
 
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
+{
+	int ret;
+	struct gnttab_unmap_grant_ref tx_unmap_op;
+
+	if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
+		netdev_err(vif->dev,
+			   "Trying to unmap invalid handle! pending_idx: %x\n",
+			   pending_idx);
+		BUG();
+	}
+	gnttab_set_unmap_op(&tx_unmap_op,
+			    idx_to_kaddr(vif, pending_idx),
+			    GNTMAP_host_map,
+			    vif->grant_tx_handle[pending_idx]);
+	ret = gnttab_unmap_refs(&tx_unmap_op, &vif->mmap_pages[pending_idx], 1);
+	BUG_ON(ret);
+	vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
+}
 
 static void make_tx_response(struct xenvif *vif,
 			     struct xen_netif_tx_request *txp,
@@ -1740,6 +1874,11 @@  static inline int tx_work_todo(struct xenvif *vif)
 	return 0;
 }
 
+static inline bool tx_dealloc_work_todo(struct xenvif *vif)
+{
+	return vif->dealloc_cons != vif->dealloc_prod;
+}
+
 void xenvif_unmap_frontend_rings(struct xenvif *vif)
 {
 	if (vif->tx.sring)
@@ -1826,6 +1965,28 @@  int xenvif_kthread(void *data)
 	return 0;
 }
 
+int xenvif_dealloc_kthread(void *data)
+{
+	struct xenvif *vif = data;
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(vif->dealloc_wq,
+					 tx_dealloc_work_todo(vif) ||
+					 kthread_should_stop());
+		if (kthread_should_stop())
+			break;
+
+		xenvif_tx_dealloc_action(vif);
+		cond_resched();
+	}
+
+	/* Unmap anything remaining */
+	if (tx_dealloc_work_todo(vif))
+		xenvif_tx_dealloc_action(vif);
+
+	return 0;
+}
+
 static int __init netback_init(void)
 {
 	int rc = 0;