diff mbox

[net,v5,6/6] virtio_net: XDP support for adjust_head

Message ID 20170117222259.20280.24625.stgit@john-Precision-Tower-5810
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

John Fastabend Jan. 17, 2017, 10:22 p.m. UTC
Add support for XDP adjust head by allocating a 256B header region
that XDP programs can grow into. This is only enabled when a XDP
program is loaded.

In order to ensure that we do not have to unwind queue headroom push
queue setup below bpf_prog_add. It reads better to do a prog ref
unwind vs another queue setup call.

At the moment this code must do a full reset to ensure old buffers
without headroom on program add or with headroom on program removal
are not used incorrectly in the datapath. Ideally we would only
have to disable/enable the RX queues being updated but there is no
API to do this at the moment in virtio so use the big hammer. In
practice it is likely not that big of a problem as this will only
happen when XDP is enabled/disabled changing programs does not
require the reset. There is some risk that the driver may either
have an allocation failure or for some reason fail to correctly
negotiate with the underlying backend in this case the driver will
be left uninitialized. I have not seen this ever happen on my test
systems and for what its worth this same failure case can occur
from probe and other contexts in virtio framework.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/virtio_net.c |  149 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 125 insertions(+), 24 deletions(-)

Comments

Jason Wang Jan. 18, 2017, 3:35 a.m. UTC | #1
On 2017年01月18日 06:22, John Fastabend wrote:
>   
> +static int virtnet_reset(struct virtnet_info *vi)
> +{
> +	struct virtio_device *dev = vi->vdev;
> +	int ret;
> +
> +	virtio_config_disable(dev);
> +	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
> +	virtnet_freeze_down(dev);
> +	_remove_vq_common(vi);
> +
> +	dev->config->reset(dev);
> +	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> +	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
> +
> +	ret = virtio_finalize_features(dev);
> +	if (ret)
> +		goto err;
> +
> +	ret = virtnet_restore_up(dev);
> +	if (ret)
> +		goto err;
> +	ret = _virtnet_set_queues(vi, vi->curr_queue_pairs);
> +	if (ret)
> +		goto err;
> +
> +	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
> +	virtio_config_enable(dev);
> +	return 0;
> +err:
> +	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
> +	return ret;
> +}
> +

Hi John:

I still prefer not open code (part of) virtio_device_freeze() and 
virtio_device_restore() here. How about:

1) introduce __virtio_device_freeze/__virtio_device_restore which 
accepts a function pointer of free/restore
2) for virtio_device_freeze/virtio_device_restore just pass 
drv->freeze/drv->restore (locked version)
3) for virtnet_reset(), we can pass unlocked version of freeze and restore

Just my preference, if both Michael and you stick to this, I'm also fine.

Thanks
Michael S. Tsirkin Jan. 18, 2017, 3:15 p.m. UTC | #2
On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
> Add support for XDP adjust head by allocating a 256B header region
> that XDP programs can grow into. This is only enabled when a XDP
> program is loaded.
> 
> In order to ensure that we do not have to unwind queue headroom push
> queue setup below bpf_prog_add. It reads better to do a prog ref
> unwind vs another queue setup call.
> 
> At the moment this code must do a full reset to ensure old buffers
> without headroom on program add or with headroom on program removal
> are not used incorrectly in the datapath. Ideally we would only
> have to disable/enable the RX queues being updated but there is no
> API to do this at the moment in virtio so use the big hammer. In
> practice it is likely not that big of a problem as this will only
> happen when XDP is enabled/disabled changing programs does not
> require the reset. There is some risk that the driver may either
> have an allocation failure or for some reason fail to correctly
> negotiate with the underlying backend in this case the driver will
> be left uninitialized. I have not seen this ever happen on my test
> systems and for what its worth this same failure case can occur
> from probe and other contexts in virtio framework.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

I've been thinking about it - can't we drop
old buffers without the head room which were posted before
xdp attached?

Avoiding the reset would be much nicer.

Thoughts?

> ---
>  drivers/net/virtio_net.c |  149 +++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 125 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 62dbf4b..3b129b4 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -41,6 +41,9 @@
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>  #define GOOD_COPY_LEN	128
>  
> +/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
> +#define VIRTIO_XDP_HEADROOM 256
> +
>  /* RX packet size EWMA. The average packet size is used to determine the packet
>   * buffer size when refilling RX rings. As the entire RX ring may be refilled
>   * at once, the weight is chosen so that the EWMA will be insensitive to short-
> @@ -359,6 +362,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>  	}
>  
>  	if (vi->mergeable_rx_bufs) {
> +		xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
>  		/* Zero header and leave csum up to XDP layers */
>  		hdr = xdp->data;
>  		memset(hdr, 0, vi->hdr_len);
> @@ -375,7 +379,9 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>  		num_sg = 2;
>  		sg_init_table(sq->sg, 2);
>  		sg_set_buf(sq->sg, hdr, vi->hdr_len);
> -		skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
> +		skb_to_sgvec(skb, sq->sg + 1,
> +			     xdp->data - xdp->data_hard_start,
> +			     xdp->data_end - xdp->data);
>  	}
>  	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
>  				   data, GFP_ATOMIC);
> @@ -401,7 +407,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
>  	struct bpf_prog *xdp_prog;
>  
>  	len -= vi->hdr_len;
> -	skb_trim(skb, len);
>  
>  	rcu_read_lock();
>  	xdp_prog = rcu_dereference(rq->xdp_prog);
> @@ -413,11 +418,15 @@ static struct sk_buff *receive_small(struct net_device *dev,
>  		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
>  			goto err_xdp;
>  
> -		xdp.data = skb->data;
> +		xdp.data_hard_start = skb->data;
> +		xdp.data = skb->data + VIRTIO_XDP_HEADROOM;
>  		xdp.data_end = xdp.data + len;
>  		act = bpf_prog_run_xdp(xdp_prog, &xdp);
>  		switch (act) {
>  		case XDP_PASS:
> +			/* Recalculate length in case bpf program changed it */
> +			__skb_pull(skb, xdp.data - xdp.data_hard_start);
> +			len = xdp.data_end - xdp.data;
>  			break;
>  		case XDP_TX:
>  			virtnet_xdp_xmit(vi, rq, &xdp, skb);
> @@ -432,6 +441,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
>  	}
>  	rcu_read_unlock();
>  
> +	skb_trim(skb, len);
>  	return skb;
>  
>  err_xdp:
> @@ -480,7 +490,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>  				       unsigned int *len)
>  {
>  	struct page *page = alloc_page(GFP_ATOMIC);
> -	unsigned int page_off = 0;
> +	unsigned int page_off = VIRTIO_XDP_HEADROOM;
>  
>  	if (!page)
>  		return NULL;
> @@ -516,7 +526,8 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>  		put_page(p);
>  	}
>  
> -	*len = page_off;
> +	/* Headroom does not contribute to packet length */
> +	*len = page_off - VIRTIO_XDP_HEADROOM;
>  	return page;
>  err_buf:
>  	__free_pages(page, 0);
> @@ -555,7 +566,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  						      page, offset, &len);
>  			if (!xdp_page)
>  				goto err_xdp;
> -			offset = 0;
> +			offset = VIRTIO_XDP_HEADROOM;
>  		} else {
>  			xdp_page = page;
>  		}
> @@ -568,18 +579,29 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  		if (unlikely(hdr->hdr.gso_type))
>  			goto err_xdp;
>  
> +		/* Allow consuming headroom but reserve enough space to push
> +		 * the descriptor on if we get an XDP_TX return code.
> +		 */
>  		data = page_address(xdp_page) + offset;
> +		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
>  		xdp.data = data + vi->hdr_len;
>  		xdp.data_end = xdp.data + (len - vi->hdr_len);
>  		act = bpf_prog_run_xdp(xdp_prog, &xdp);
>  		switch (act) {
>  		case XDP_PASS:
> +			/* recalculate offset to account for any header
> +			 * adjustments. Note other cases do not build an
> +			 * skb and avoid using offset
> +			 */
> +			offset = xdp.data -
> +					page_address(xdp_page) - vi->hdr_len;
> +
>  			/* We can only create skb based on xdp_page. */
>  			if (unlikely(xdp_page != page)) {
>  				rcu_read_unlock();
>  				put_page(page);
>  				head_skb = page_to_skb(vi, rq, xdp_page,
> -						       0, len, PAGE_SIZE);
> +						       offset, len, PAGE_SIZE);
>  				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
>  				return head_skb;
>  			}
> @@ -744,23 +766,30 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
>  	dev_kfree_skb(skb);
>  }
>  
> +static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
> +{
> +	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
> +}
> +
>  static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>  			     gfp_t gfp)
>  {
> +	int headroom = GOOD_PACKET_LEN + virtnet_get_headroom(vi);
> +	unsigned int xdp_headroom = virtnet_get_headroom(vi);
>  	struct sk_buff *skb;
>  	struct virtio_net_hdr_mrg_rxbuf *hdr;
>  	int err;
>  
> -	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
> +	skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
>  	if (unlikely(!skb))
>  		return -ENOMEM;
>  
> -	skb_put(skb, GOOD_PACKET_LEN);
> +	skb_put(skb, headroom);
>  
>  	hdr = skb_vnet_hdr(skb);
>  	sg_init_table(rq->sg, 2);
>  	sg_set_buf(rq->sg, hdr, vi->hdr_len);
> -	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
> +	skb_to_sgvec(skb, rq->sg + 1, xdp_headroom, skb->len - xdp_headroom);
>  
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
>  	if (err < 0)
> @@ -828,24 +857,27 @@ static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
>  	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
>  }
>  
> -static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
> +static int add_recvbuf_mergeable(struct virtnet_info *vi,
> +				 struct receive_queue *rq, gfp_t gfp)
>  {
>  	struct page_frag *alloc_frag = &rq->alloc_frag;
> +	unsigned int headroom = virtnet_get_headroom(vi);
>  	char *buf;
>  	unsigned long ctx;
>  	int err;
>  	unsigned int len, hole;
>  
>  	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
> -	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
> +	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
>  		return -ENOMEM;
>  
>  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +	buf += headroom; /* advance address leaving hole at front of pkt */
>  	ctx = mergeable_buf_to_ctx(buf, len);
>  	get_page(alloc_frag->page);
> -	alloc_frag->offset += len;
> +	alloc_frag->offset += len + headroom;
>  	hole = alloc_frag->size - alloc_frag->offset;
> -	if (hole < len) {
> +	if (hole < len + headroom) {
>  		/* To avoid internal fragmentation, if there is very likely not
>  		 * enough space for another buffer, add the remaining space to
>  		 * the current buffer. This extra space is not included in
> @@ -879,7 +911,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
>  	gfp |= __GFP_COLD;
>  	do {
>  		if (vi->mergeable_rx_bufs)
> -			err = add_recvbuf_mergeable(rq, gfp);
> +			err = add_recvbuf_mergeable(vi, rq, gfp);
>  		else if (vi->big_packets)
>  			err = add_recvbuf_big(vi, rq, gfp);
>  		else
> @@ -1702,6 +1734,7 @@ static void virtnet_freeze_down(struct virtio_device *vdev)
>  }
>  
>  static int init_vqs(struct virtnet_info *vi);
> +static void _remove_vq_common(struct virtnet_info *vi);
>  
>  static int virtnet_restore_up(struct virtio_device *vdev)
>  {
> @@ -1727,12 +1760,45 @@ static int virtnet_restore_up(struct virtio_device *vdev)
>  	return err;
>  }
>  
> +static int virtnet_reset(struct virtnet_info *vi)
> +{
> +	struct virtio_device *dev = vi->vdev;
> +	int ret;
> +
> +	virtio_config_disable(dev);
> +	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
> +	virtnet_freeze_down(dev);
> +	_remove_vq_common(vi);
> +
> +	dev->config->reset(dev);
> +	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> +	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
> +
> +	ret = virtio_finalize_features(dev);
> +	if (ret)
> +		goto err;
> +
> +	ret = virtnet_restore_up(dev);
> +	if (ret)
> +		goto err;
> +	ret = _virtnet_set_queues(vi, vi->curr_queue_pairs);
> +	if (ret)
> +		goto err;
> +
> +	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
> +	virtio_config_enable(dev);
> +	return 0;
> +err:
> +	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
> +	return ret;
> +}
> +
>  static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>  {
>  	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	struct bpf_prog *old_prog;
> -	u16 xdp_qp = 0, curr_qp;
> +	u16 oxdp_qp, xdp_qp = 0, curr_qp;
>  	int i, err;
>  
>  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> @@ -1764,21 +1830,32 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>  		return -ENOMEM;
>  	}
>  
> +	if (prog) {
> +		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> +		if (IS_ERR(prog))
> +			return PTR_ERR(prog);
> +	}
> +
>  	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
>  	if (err) {
>  		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
> -		return err;
> +		goto virtio_queue_err;
>  	}
>  
> -	if (prog) {
> -		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> -		if (IS_ERR(prog)) {
> -			_virtnet_set_queues(vi, curr_qp);
> -			return PTR_ERR(prog);
> -		}
> +	oxdp_qp = vi->xdp_queue_pairs;
> +
> +	/* Changing the headroom in buffers is a disruptive operation because
> +	 * existing buffers must be flushed and reallocated. This will happen
> +	 * when a xdp program is initially added or xdp is disabled by removing
> +	 * the xdp program resulting in number of XDP queues changing.
> +	 */
> +	if (vi->xdp_queue_pairs != xdp_qp) {
> +		vi->xdp_queue_pairs = xdp_qp;
> +		err = virtnet_reset(vi);
> +		if (err)
> +			goto virtio_reset_err;
>  	}
>  
> -	vi->xdp_queue_pairs = xdp_qp;
>  	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
>  
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
> @@ -1789,6 +1866,21 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>  	}
>  
>  	return 0;
> +
> +virtio_reset_err:
> +	/* On reset error do our best to unwind XDP changes inflight and return
> +	 * error up to user space for resolution. The underlying reset hung on
> +	 * us so not much we can do here.
> +	 */
> +	dev_warn(&dev->dev, "XDP reset failure and queues unstable\n");
> +	vi->xdp_queue_pairs = oxdp_qp;
> +virtio_queue_err:
> +	/* On queue set error we can unwind bpf ref count and user space can
> +	 * retry this is most likely an allocation failure.
> +	 */
> +	if (prog)
> +		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
> +	return err;
>  }
>  
>  static bool virtnet_xdp_query(struct net_device *dev)
> @@ -2382,6 +2474,15 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	return err;
>  }
>  
> +static void _remove_vq_common(struct virtnet_info *vi)
> +{
> +	vi->vdev->config->reset(vi->vdev);
> +	free_unused_bufs(vi);
> +	_free_receive_bufs(vi);
> +	free_receive_page_frags(vi);
> +	virtnet_del_vqs(vi);
> +}
> +
>  static void remove_vq_common(struct virtnet_info *vi)
>  {
>  	vi->vdev->config->reset(vi->vdev);
Jason Wang Jan. 19, 2017, 3:05 a.m. UTC | #3
On 2017年01月18日 23:15, Michael S. Tsirkin wrote:
> On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
>> Add support for XDP adjust head by allocating a 256B header region
>> that XDP programs can grow into. This is only enabled when a XDP
>> program is loaded.
>>
>> In order to ensure that we do not have to unwind queue headroom push
>> queue setup below bpf_prog_add. It reads better to do a prog ref
>> unwind vs another queue setup call.
>>
>> At the moment this code must do a full reset to ensure old buffers
>> without headroom on program add or with headroom on program removal
>> are not used incorrectly in the datapath. Ideally we would only
>> have to disable/enable the RX queues being updated but there is no
>> API to do this at the moment in virtio so use the big hammer. In
>> practice it is likely not that big of a problem as this will only
>> happen when XDP is enabled/disabled changing programs does not
>> require the reset. There is some risk that the driver may either
>> have an allocation failure or for some reason fail to correctly
>> negotiate with the underlying backend in this case the driver will
>> be left uninitialized. I have not seen this ever happen on my test
>> systems and for what its worth this same failure case can occur
>> from probe and other contexts in virtio framework.
>>
>> Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
> I've been thinking about it - can't we drop
> old buffers without the head room which were posted before
> xdp attached?
>
> Avoiding the reset would be much nicer.
>
> Thoughts?
>

As been discussed before, device may use them in the same time so it's 
not safe. Or do you mean detect them after xdp were set and drop the 
buffer without head room, this looks sub-optimal.

Thanks
Michael S. Tsirkin Jan. 19, 2017, 9:11 p.m. UTC | #4
On Thu, Jan 19, 2017 at 11:05:40AM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月18日 23:15, Michael S. Tsirkin wrote:
> > On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
> > > Add support for XDP adjust head by allocating a 256B header region
> > > that XDP programs can grow into. This is only enabled when a XDP
> > > program is loaded.
> > > 
> > > In order to ensure that we do not have to unwind queue headroom push
> > > queue setup below bpf_prog_add. It reads better to do a prog ref
> > > unwind vs another queue setup call.
> > > 
> > > At the moment this code must do a full reset to ensure old buffers
> > > without headroom on program add or with headroom on program removal
> > > are not used incorrectly in the datapath. Ideally we would only
> > > have to disable/enable the RX queues being updated but there is no
> > > API to do this at the moment in virtio so use the big hammer. In
> > > practice it is likely not that big of a problem as this will only
> > > happen when XDP is enabled/disabled changing programs does not
> > > require the reset. There is some risk that the driver may either
> > > have an allocation failure or for some reason fail to correctly
> > > negotiate with the underlying backend in this case the driver will
> > > be left uninitialized. I have not seen this ever happen on my test
> > > systems and for what its worth this same failure case can occur
> > > from probe and other contexts in virtio framework.
> > > 
> > > Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
> > I've been thinking about it - can't we drop
> > old buffers without the head room which were posted before
> > xdp attached?
> > 
> > Avoiding the reset would be much nicer.
> > 
> > Thoughts?
> > 
> 
> As been discussed before, device may use them in the same time so it's not
> safe. Or do you mean detect them after xdp were set and drop the buffer
> without head room, this looks sub-optimal.
> 
> Thanks

Yes, this is what I mean.  Why is this suboptimal? It's a single branch
in code. Yes we might lose some packets but the big hammer of device
reset will likely lose more.
Jason Wang Jan. 20, 2017, 3:26 a.m. UTC | #5
On 2017年01月20日 05:11, Michael S. Tsirkin wrote:
> On Thu, Jan 19, 2017 at 11:05:40AM +0800, Jason Wang wrote:
>>
>> On 2017年01月18日 23:15, Michael S. Tsirkin wrote:
>>> On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
>>>> Add support for XDP adjust head by allocating a 256B header region
>>>> that XDP programs can grow into. This is only enabled when a XDP
>>>> program is loaded.
>>>>
>>>> In order to ensure that we do not have to unwind queue headroom push
>>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>>> unwind vs another queue setup call.
>>>>
>>>> At the moment this code must do a full reset to ensure old buffers
>>>> without headroom on program add or with headroom on program removal
>>>> are not used incorrectly in the datapath. Ideally we would only
>>>> have to disable/enable the RX queues being updated but there is no
>>>> API to do this at the moment in virtio so use the big hammer. In
>>>> practice it is likely not that big of a problem as this will only
>>>> happen when XDP is enabled/disabled changing programs does not
>>>> require the reset. There is some risk that the driver may either
>>>> have an allocation failure or for some reason fail to correctly
>>>> negotiate with the underlying backend in this case the driver will
>>>> be left uninitialized. I have not seen this ever happen on my test
>>>> systems and for what its worth this same failure case can occur
>>>> from probe and other contexts in virtio framework.
>>>>
>>>> Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
>>> I've been thinking about it - can't we drop
>>> old buffers without the head room which were posted before
>>> xdp attached?
>>>
>>> Avoiding the reset would be much nicer.
>>>
>>> Thoughts?
>>>
>> As been discussed before, device may use them in the same time so it's not
>> safe. Or do you mean detect them after xdp were set and drop the buffer
>> without head room, this looks sub-optimal.
>>
>> Thanks
> Yes, this is what I mean.  Why is this suboptimal? It's a single branch
> in code. Yes we might lose some packets but the big hammer of device
> reset will likely lose more.
>

Maybe I was wrong but I think driver should try their best to avoid 
dropping packets. (And look at mlx4, it did something similar to this 
patch).

Thanks
John Fastabend Jan. 20, 2017, 3:38 a.m. UTC | #6
On 17-01-19 01:11 PM, Michael S. Tsirkin wrote:
> On Thu, Jan 19, 2017 at 11:05:40AM +0800, Jason Wang wrote:
>>
>>
>> On 2017年01月18日 23:15, Michael S. Tsirkin wrote:
>>> On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
>>>> Add support for XDP adjust head by allocating a 256B header region
>>>> that XDP programs can grow into. This is only enabled when a XDP
>>>> program is loaded.
>>>>
>>>> In order to ensure that we do not have to unwind queue headroom push
>>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>>> unwind vs another queue setup call.
>>>>
>>>> At the moment this code must do a full reset to ensure old buffers
>>>> without headroom on program add or with headroom on program removal
>>>> are not used incorrectly in the datapath. Ideally we would only
>>>> have to disable/enable the RX queues being updated but there is no
>>>> API to do this at the moment in virtio so use the big hammer. In
>>>> practice it is likely not that big of a problem as this will only
>>>> happen when XDP is enabled/disabled changing programs does not
>>>> require the reset. There is some risk that the driver may either
>>>> have an allocation failure or for some reason fail to correctly
>>>> negotiate with the underlying backend in this case the driver will
>>>> be left uninitialized. I have not seen this ever happen on my test
>>>> systems and for what its worth this same failure case can occur
>>>> from probe and other contexts in virtio framework.
>>>>
>>>> Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
>>> I've been thinking about it - can't we drop
>>> old buffers without the head room which were posted before
>>> xdp attached?
>>>
>>> Avoiding the reset would be much nicer.
>>>
>>> Thoughts?
>>>
>>
>> As been discussed before, device may use them in the same time so it's not
>> safe. Or do you mean detect them after xdp were set and drop the buffer
>> without head room, this looks sub-optimal.
>>
>> Thanks
> 
> Yes, this is what I mean.  Why is this suboptimal? It's a single branch
> in code. Yes we might lose some packets but the big hammer of device
> reset will likely lose more.
> 

Maybe I'm not following, is the suggestion to drop the packets after XDP
is setup for all outstanding buffers until we have done a reallocation of
all the buffers? In this case we can't just detach the buffers we have to
wait until the backend retires them by using them correct?

But when XDP setup call returns we need to guarantee that buffers and
driver are setup. Otherwise the next n packets get dropped in the future.
If there is no traffic currently this could be at some undetermined point
in the future. This will be very buggy.

Did I miss something?

Thanks,
John
John Fastabend Jan. 20, 2017, 3:39 a.m. UTC | #7
On 17-01-19 07:26 PM, Jason Wang wrote:
> 
> 
> On 2017年01月20日 05:11, Michael S. Tsirkin wrote:
>> On Thu, Jan 19, 2017 at 11:05:40AM +0800, Jason Wang wrote:
>>>
>>> On 2017年01月18日 23:15, Michael S. Tsirkin wrote:
>>>> On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
>>>>> Add support for XDP adjust head by allocating a 256B header region
>>>>> that XDP programs can grow into. This is only enabled when a XDP
>>>>> program is loaded.
>>>>>
>>>>> In order to ensure that we do not have to unwind queue headroom push
>>>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>>>> unwind vs another queue setup call.
>>>>>
>>>>> At the moment this code must do a full reset to ensure old buffers
>>>>> without headroom on program add or with headroom on program removal
>>>>> are not used incorrectly in the datapath. Ideally we would only
>>>>> have to disable/enable the RX queues being updated but there is no
>>>>> API to do this at the moment in virtio so use the big hammer. In
>>>>> practice it is likely not that big of a problem as this will only
>>>>> happen when XDP is enabled/disabled changing programs does not
>>>>> require the reset. There is some risk that the driver may either
>>>>> have an allocation failure or for some reason fail to correctly
>>>>> negotiate with the underlying backend in this case the driver will
>>>>> be left uninitialized. I have not seen this ever happen on my test
>>>>> systems and for what its worth this same failure case can occur
>>>>> from probe and other contexts in virtio framework.
>>>>>
>>>>> Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
>>>> I've been thinking about it - can't we drop
>>>> old buffers without the head room which were posted before
>>>> xdp attached?
>>>>
>>>> Avoiding the reset would be much nicer.
>>>>
>>>> Thoughts?
>>>>
>>> As been discussed before, device may use them in the same time so it's not
>>> safe. Or do you mean detect them after xdp were set and drop the buffer
>>> without head room, this looks sub-optimal.
>>>
>>> Thanks
>> Yes, this is what I mean.  Why is this suboptimal? It's a single branch
>> in code. Yes we might lose some packets but the big hammer of device
>> reset will likely lose more.
>>
> 
> Maybe I was wrong but I think driver should try their best to avoid dropping
> packets. (And look at mlx4, it did something similar to this patch).
> 
> Thanks

+1 sorry didn't see your reply as I was typing mine. Bottom line when XDP
returns I believe the driver must be ready to accept packets or managing
XDP will be problematic.

.John
David Laight Jan. 20, 2017, 4:59 p.m. UTC | #8
From: Michael S. Tsirkin

> Sent: 19 January 2017 21:12

> > On 2017?01?18? 23:15, Michael S. Tsirkin wrote:

> > > On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:

> > > > Add support for XDP adjust head by allocating a 256B header region

> > > > that XDP programs can grow into. This is only enabled when a XDP

> > > > program is loaded.

> > > >

> > > > In order to ensure that we do not have to unwind queue headroom push

> > > > queue setup below bpf_prog_add. It reads better to do a prog ref

> > > > unwind vs another queue setup call.

> > > >

> > > > At the moment this code must do a full reset to ensure old buffers

> > > > without headroom on program add or with headroom on program removal

> > > > are not used incorrectly in the datapath. Ideally we would only

> > > > have to disable/enable the RX queues being updated but there is no

> > > > API to do this at the moment in virtio so use the big hammer. In

> > > > practice it is likely not that big of a problem as this will only

> > > > happen when XDP is enabled/disabled changing programs does not

> > > > require the reset. There is some risk that the driver may either

> > > > have an allocation failure or for some reason fail to correctly

> > > > negotiate with the underlying backend in this case the driver will

> > > > be left uninitialized. I have not seen this ever happen on my test

> > > > systems and for what its worth this same failure case can occur

> > > > from probe and other contexts in virtio framework.

> > > >

> > > > Signed-off-by: John Fastabend<john.r.fastabend@intel.com>

> > > I've been thinking about it - can't we drop

> > > old buffers without the head room which were posted before

> > > xdp attached?

> > >

> > > Avoiding the reset would be much nicer.

> > >

> > > Thoughts?

> > >

> >

> > As been discussed before, device may use them in the same time so it's not

> > safe. Or do you mean detect them after xdp were set and drop the buffer

> > without head room, this looks sub-optimal.

> >

> > Thanks

> 

> Yes, this is what I mean.  Why is this suboptimal? It's a single branch

> in code. Yes we might lose some packets but the big hammer of device

> reset will likely lose more.


Why not leave let the hardware receive into the 'small' buffer (without
headroom) and do a copy when a frame is received.
Replace the buffers with 'big' ones for the next receive.
A data copy on a ring full of buffers won't really be noticed.

	David
Michael S. Tsirkin Jan. 20, 2017, 5:48 p.m. UTC | #9
On Fri, Jan 20, 2017 at 04:59:11PM +0000, David Laight wrote:
> From: Michael S. Tsirkin
> > Sent: 19 January 2017 21:12
> > > On 2017?01?18? 23:15, Michael S. Tsirkin wrote:
> > > > On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
> > > > > Add support for XDP adjust head by allocating a 256B header region
> > > > > that XDP programs can grow into. This is only enabled when a XDP
> > > > > program is loaded.
> > > > >
> > > > > In order to ensure that we do not have to unwind queue headroom push
> > > > > queue setup below bpf_prog_add. It reads better to do a prog ref
> > > > > unwind vs another queue setup call.
> > > > >
> > > > > At the moment this code must do a full reset to ensure old buffers
> > > > > without headroom on program add or with headroom on program removal
> > > > > are not used incorrectly in the datapath. Ideally we would only
> > > > > have to disable/enable the RX queues being updated but there is no
> > > > > API to do this at the moment in virtio so use the big hammer. In
> > > > > practice it is likely not that big of a problem as this will only
> > > > > happen when XDP is enabled/disabled changing programs does not
> > > > > require the reset. There is some risk that the driver may either
> > > > > have an allocation failure or for some reason fail to correctly
> > > > > negotiate with the underlying backend in this case the driver will
> > > > > be left uninitialized. I have not seen this ever happen on my test
> > > > > systems and for what its worth this same failure case can occur
> > > > > from probe and other contexts in virtio framework.
> > > > >
> > > > > Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
> > > > I've been thinking about it - can't we drop
> > > > old buffers without the head room which were posted before
> > > > xdp attached?
> > > >
> > > > Avoiding the reset would be much nicer.
> > > >
> > > > Thoughts?
> > > >
> > >
> > > As been discussed before, device may use them in the same time so it's not
> > > safe. Or do you mean detect them after xdp were set and drop the buffer
> > > without head room, this looks sub-optimal.
> > >
> > > Thanks
> > 
> > Yes, this is what I mean.  Why is this suboptimal? It's a single branch
> > in code. Yes we might lose some packets but the big hammer of device
> > reset will likely lose more.
> 
> Why not leave let the hardware receive into the 'small' buffer (without
> headroom) and do a copy when a frame is received.
> Replace the buffers with 'big' ones for the next receive.
> A data copy on a ring full of buffers won't really be noticed.
> 
> 	David
> 

I like that. John?
Jason Wang Jan. 22, 2017, 2:51 a.m. UTC | #10
On 2017年01月21日 01:48, Michael S. Tsirkin wrote:
> On Fri, Jan 20, 2017 at 04:59:11PM +0000, David Laight wrote:
>> From: Michael S. Tsirkin
>>> Sent: 19 January 2017 21:12
>>>> On 2017?01?18? 23:15, Michael S. Tsirkin wrote:
>>>>> On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
>>>>>> Add support for XDP adjust head by allocating a 256B header region
>>>>>> that XDP programs can grow into. This is only enabled when a XDP
>>>>>> program is loaded.
>>>>>>
>>>>>> In order to ensure that we do not have to unwind queue headroom push
>>>>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>>>>> unwind vs another queue setup call.
>>>>>>
>>>>>> At the moment this code must do a full reset to ensure old buffers
>>>>>> without headroom on program add or with headroom on program removal
>>>>>> are not used incorrectly in the datapath. Ideally we would only
>>>>>> have to disable/enable the RX queues being updated but there is no
>>>>>> API to do this at the moment in virtio so use the big hammer. In
>>>>>> practice it is likely not that big of a problem as this will only
>>>>>> happen when XDP is enabled/disabled changing programs does not
>>>>>> require the reset. There is some risk that the driver may either
>>>>>> have an allocation failure or for some reason fail to correctly
>>>>>> negotiate with the underlying backend in this case the driver will
>>>>>> be left uninitialized. I have not seen this ever happen on my test
>>>>>> systems and for what its worth this same failure case can occur
>>>>>> from probe and other contexts in virtio framework.
>>>>>>
>>>>>> Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
>>>>> I've been thinking about it - can't we drop
>>>>> old buffers without the head room which were posted before
>>>>> xdp attached?
>>>>>
>>>>> Avoiding the reset would be much nicer.
>>>>>
>>>>> Thoughts?
>>>>>
>>>> As been discussed before, device may use them in the same time so it's not
>>>> safe. Or do you mean detect them after xdp were set and drop the buffer
>>>> without head room, this looks sub-optimal.
>>>>
>>>> Thanks
>>> Yes, this is what I mean.  Why is this suboptimal? It's a single branch
>>> in code. Yes we might lose some packets but the big hammer of device
>>> reset will likely lose more.
>> Why not leave let the hardware receive into the 'small' buffer (without
>> headroom) and do a copy when a frame is received.
>> Replace the buffers with 'big' ones for the next receive.
>> A data copy on a ring full of buffers won't really be noticed.
>>
>> 	David
>>
> I like that. John?
>

This works, I prefer this only if it uses simpler code (but I suspect) 
than reset.

Thanks
John Fastabend Jan. 22, 2017, 4:14 a.m. UTC | #11
On 17-01-21 06:51 PM, Jason Wang wrote:
> 
> 
> On 2017年01月21日 01:48, Michael S. Tsirkin wrote:
>> On Fri, Jan 20, 2017 at 04:59:11PM +0000, David Laight wrote:
>>> From: Michael S. Tsirkin
>>>> Sent: 19 January 2017 21:12
>>>>> On 2017?01?18? 23:15, Michael S. Tsirkin wrote:
>>>>>> On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
>>>>>>> Add support for XDP adjust head by allocating a 256B header region
>>>>>>> that XDP programs can grow into. This is only enabled when a XDP
>>>>>>> program is loaded.
>>>>>>>
>>>>>>> In order to ensure that we do not have to unwind queue headroom push
>>>>>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>>>>>> unwind vs another queue setup call.
>>>>>>>
>>>>>>> At the moment this code must do a full reset to ensure old buffers
>>>>>>> without headroom on program add or with headroom on program removal
>>>>>>> are not used incorrectly in the datapath. Ideally we would only
>>>>>>> have to disable/enable the RX queues being updated but there is no
>>>>>>> API to do this at the moment in virtio so use the big hammer. In
>>>>>>> practice it is likely not that big of a problem as this will only
>>>>>>> happen when XDP is enabled/disabled changing programs does not
>>>>>>> require the reset. There is some risk that the driver may either
>>>>>>> have an allocation failure or for some reason fail to correctly
>>>>>>> negotiate with the underlying backend in this case the driver will
>>>>>>> be left uninitialized. I have not seen this ever happen on my test
>>>>>>> systems and for what its worth this same failure case can occur
>>>>>>> from probe and other contexts in virtio framework.
>>>>>>>
>>>>>>> Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
>>>>>> I've been thinking about it - can't we drop
>>>>>> old buffers without the head room which were posted before
>>>>>> xdp attached?
>>>>>>
>>>>>> Avoiding the reset would be much nicer.
>>>>>>
>>>>>> Thoughts?
>>>>>>
>>>>> As been discussed before, device may use them in the same time so it's not
>>>>> safe. Or do you mean detect them after xdp were set and drop the buffer
>>>>> without head room, this looks sub-optimal.
>>>>>
>>>>> Thanks
>>>> Yes, this is what I mean.  Why is this suboptimal? It's a single branch
>>>> in code. Yes we might lose some packets but the big hammer of device
>>>> reset will likely lose more.
>>> Why not leave let the hardware receive into the 'small' buffer (without
>>> headroom) and do a copy when a frame is received.
>>> Replace the buffers with 'big' ones for the next receive.
>>> A data copy on a ring full of buffers won't really be noticed.
>>>
>>>     David
>>>
>> I like that. John?
>>
> 
> This works, I prefer this only if it uses simpler code (but I suspect) than reset.
> 
> Thanks

Before the reset path I looked at doing this but it seems to require tracking
if a buffer had headroom on a per buffer basis. I don't see a good spot to
put a bit like this? It could go in the inbuf 'ctx' added by virtqueue_add_inbuf
but I would need to change the current usage of ctx which in the mergeable case
at least is just a simple pointer today. I don't like this because it
complicates the normal path and the XDP hotpath.

Otherwise we could somehow mark the ring at the point where XDP is enabled so
that it can learn when a full iteration around the ring. But I can't see a
simple way to make this work either.

I don't know the reset look straight forward to me and although not ideal is
fairly common on hardware based drivers during configuration changes. I'm open
to any ideas on where to put the metadata to track headroom though.

Thanks,
John
Michael S. Tsirkin Jan. 23, 2017, 5:02 p.m. UTC | #12
On Sat, Jan 21, 2017 at 08:14:19PM -0800, John Fastabend wrote:
> On 17-01-21 06:51 PM, Jason Wang wrote:
> > 
> > 
> > On 2017年01月21日 01:48, Michael S. Tsirkin wrote:
> >> On Fri, Jan 20, 2017 at 04:59:11PM +0000, David Laight wrote:
> >>> From: Michael S. Tsirkin
> >>>> Sent: 19 January 2017 21:12
> >>>>> On 2017?01?18? 23:15, Michael S. Tsirkin wrote:
> >>>>>> On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
> >>>>>>> Add support for XDP adjust head by allocating a 256B header region
> >>>>>>> that XDP programs can grow into. This is only enabled when a XDP
> >>>>>>> program is loaded.
> >>>>>>>
> >>>>>>> In order to ensure that we do not have to unwind queue headroom push
> >>>>>>> queue setup below bpf_prog_add. It reads better to do a prog ref
> >>>>>>> unwind vs another queue setup call.
> >>>>>>>
> >>>>>>> At the moment this code must do a full reset to ensure old buffers
> >>>>>>> without headroom on program add or with headroom on program removal
> >>>>>>> are not used incorrectly in the datapath. Ideally we would only
> >>>>>>> have to disable/enable the RX queues being updated but there is no
> >>>>>>> API to do this at the moment in virtio so use the big hammer. In
> >>>>>>> practice it is likely not that big of a problem as this will only
> >>>>>>> happen when XDP is enabled/disabled changing programs does not
> >>>>>>> require the reset. There is some risk that the driver may either
> >>>>>>> have an allocation failure or for some reason fail to correctly
> >>>>>>> negotiate with the underlying backend in this case the driver will
> >>>>>>> be left uninitialized. I have not seen this ever happen on my test
> >>>>>>> systems and for what its worth this same failure case can occur
> >>>>>>> from probe and other contexts in virtio framework.
> >>>>>>>
> >>>>>>> Signed-off-by: John Fastabend<john.r.fastabend@intel.com>
> >>>>>> I've been thinking about it - can't we drop
> >>>>>> old buffers without the head room which were posted before
> >>>>>> xdp attached?
> >>>>>>
> >>>>>> Avoiding the reset would be much nicer.
> >>>>>>
> >>>>>> Thoughts?
> >>>>>>
> >>>>> As been discussed before, device may use them in the same time so it's not
> >>>>> safe. Or do you mean detect them after xdp were set and drop the buffer
> >>>>> without head room, this looks sub-optimal.
> >>>>>
> >>>>> Thanks
> >>>> Yes, this is what I mean.  Why is this suboptimal? It's a single branch
> >>>> in code. Yes we might lose some packets but the big hammer of device
> >>>> reset will likely lose more.
> >>> Why not leave let the hardware receive into the 'small' buffer (without
> >>> headroom) and do a copy when a frame is received.
> >>> Replace the buffers with 'big' ones for the next receive.
> >>> A data copy on a ring full of buffers won't really be noticed.
> >>>
> >>>     David
> >>>
> >> I like that. John?
> >>
> > 
> > This works, I prefer this only if it uses simpler code (but I suspect) than reset.
> > 
> > Thanks
> 
> Before the reset path I looked at doing this but it seems to require tracking
> if a buffer had headroom on a per buffer basis. I don't see a good spot to
> put a bit like this? It could go in the inbuf 'ctx' added by virtqueue_add_inbuf
> but I would need to change the current usage of ctx which in the mergeable case
> at least is just a simple pointer today. I don't like this because it
> complicates the normal path and the XDP hotpath.
> 
> Otherwise we could somehow mark the ring at the point where XDP is enabled so
> that it can learn when a full iteration around the ring. But I can't see a
> simple way to make this work either.
> 
> I don't know the reset look straight forward to me and although not ideal is
> fairly common on hardware based drivers during configuration changes. I'm open
> to any ideas on where to put the metadata to track headroom though.
> 
> Thanks,
> John


Well with 4K pages we actually have 4 spare bits to use.
In fact this means we could reduce the mergeable buffer alignment.
It starts getting strange with 64K pages where we are
out of space, and I just noticed that with bigger pages
virtio is actually broken.

So let me fix it up first of all, and on top - maybe we can just
increase the alignment for 64k pages and up?
Truesize alignment to 512 is still reasonable and presumably
these 64k page boxes have lots of memory.
Would it make sense to tweak SK_RMEM_MAX up for larger
page sizes?
Michael S. Tsirkin Jan. 23, 2017, 7:22 p.m. UTC | #13
On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 62dbf4b..3b129b4 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -41,6 +41,9 @@
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>  #define GOOD_COPY_LEN	128
>  
> +/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
> +#define VIRTIO_XDP_HEADROOM 256
> +
>  /* RX packet size EWMA. The average packet size is used to determine the packet
>   * buffer size when refilling RX rings. As the entire RX ring may be refilled
>   * at once, the weight is chosen so that the EWMA will be insensitive to short-

I wonder where does this number come from?  This is quite a lot and
means that using XDP_PASS will slow down any sockets on top of it.
Which in turn means people will try to remove XDP when not in use,
causing resets.  E.g. build_skb (which I have a patch to switch to) uses
a much more reasonable NET_SKB_PAD.
diff mbox

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 62dbf4b..3b129b4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -41,6 +41,9 @@ 
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
 
+/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
+#define VIRTIO_XDP_HEADROOM 256
+
 /* RX packet size EWMA. The average packet size is used to determine the packet
  * buffer size when refilling RX rings. As the entire RX ring may be refilled
  * at once, the weight is chosen so that the EWMA will be insensitive to short-
@@ -359,6 +362,7 @@  static void virtnet_xdp_xmit(struct virtnet_info *vi,
 	}
 
 	if (vi->mergeable_rx_bufs) {
+		xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
 		/* Zero header and leave csum up to XDP layers */
 		hdr = xdp->data;
 		memset(hdr, 0, vi->hdr_len);
@@ -375,7 +379,9 @@  static void virtnet_xdp_xmit(struct virtnet_info *vi,
 		num_sg = 2;
 		sg_init_table(sq->sg, 2);
 		sg_set_buf(sq->sg, hdr, vi->hdr_len);
-		skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
+		skb_to_sgvec(skb, sq->sg + 1,
+			     xdp->data - xdp->data_hard_start,
+			     xdp->data_end - xdp->data);
 	}
 	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
 				   data, GFP_ATOMIC);
@@ -401,7 +407,6 @@  static struct sk_buff *receive_small(struct net_device *dev,
 	struct bpf_prog *xdp_prog;
 
 	len -= vi->hdr_len;
-	skb_trim(skb, len);
 
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -413,11 +418,15 @@  static struct sk_buff *receive_small(struct net_device *dev,
 		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
 			goto err_xdp;
 
-		xdp.data = skb->data;
+		xdp.data_hard_start = skb->data;
+		xdp.data = skb->data + VIRTIO_XDP_HEADROOM;
 		xdp.data_end = xdp.data + len;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 		switch (act) {
 		case XDP_PASS:
+			/* Recalculate length in case bpf program changed it */
+			__skb_pull(skb, xdp.data - xdp.data_hard_start);
+			len = xdp.data_end - xdp.data;
 			break;
 		case XDP_TX:
 			virtnet_xdp_xmit(vi, rq, &xdp, skb);
@@ -432,6 +441,7 @@  static struct sk_buff *receive_small(struct net_device *dev,
 	}
 	rcu_read_unlock();
 
+	skb_trim(skb, len);
 	return skb;
 
 err_xdp:
@@ -480,7 +490,7 @@  static struct page *xdp_linearize_page(struct receive_queue *rq,
 				       unsigned int *len)
 {
 	struct page *page = alloc_page(GFP_ATOMIC);
-	unsigned int page_off = 0;
+	unsigned int page_off = VIRTIO_XDP_HEADROOM;
 
 	if (!page)
 		return NULL;
@@ -516,7 +526,8 @@  static struct page *xdp_linearize_page(struct receive_queue *rq,
 		put_page(p);
 	}
 
-	*len = page_off;
+	/* Headroom does not contribute to packet length */
+	*len = page_off - VIRTIO_XDP_HEADROOM;
 	return page;
 err_buf:
 	__free_pages(page, 0);
@@ -555,7 +566,7 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 						      page, offset, &len);
 			if (!xdp_page)
 				goto err_xdp;
-			offset = 0;
+			offset = VIRTIO_XDP_HEADROOM;
 		} else {
 			xdp_page = page;
 		}
@@ -568,18 +579,29 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 		if (unlikely(hdr->hdr.gso_type))
 			goto err_xdp;
 
+		/* Allow consuming headroom but reserve enough space to push
+		 * the descriptor on if we get an XDP_TX return code.
+		 */
 		data = page_address(xdp_page) + offset;
+		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 		xdp.data = data + vi->hdr_len;
 		xdp.data_end = xdp.data + (len - vi->hdr_len);
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 		switch (act) {
 		case XDP_PASS:
+			/* recalculate offset to account for any header
+			 * adjustments. Note other cases do not build an
+			 * skb and avoid using offset
+			 */
+			offset = xdp.data -
+					page_address(xdp_page) - vi->hdr_len;
+
 			/* We can only create skb based on xdp_page. */
 			if (unlikely(xdp_page != page)) {
 				rcu_read_unlock();
 				put_page(page);
 				head_skb = page_to_skb(vi, rq, xdp_page,
-						       0, len, PAGE_SIZE);
+						       offset, len, PAGE_SIZE);
 				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 				return head_skb;
 			}
@@ -744,23 +766,30 @@  static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	dev_kfree_skb(skb);
 }
 
+static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
+{
+	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
+}
+
 static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 			     gfp_t gfp)
 {
+	int headroom = GOOD_PACKET_LEN + virtnet_get_headroom(vi);
+	unsigned int xdp_headroom = virtnet_get_headroom(vi);
 	struct sk_buff *skb;
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	int err;
 
-	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
+	skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
 	if (unlikely(!skb))
 		return -ENOMEM;
 
-	skb_put(skb, GOOD_PACKET_LEN);
+	skb_put(skb, headroom);
 
 	hdr = skb_vnet_hdr(skb);
 	sg_init_table(rq->sg, 2);
 	sg_set_buf(rq->sg, hdr, vi->hdr_len);
-	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
+	skb_to_sgvec(skb, rq->sg + 1, xdp_headroom, skb->len - xdp_headroom);
 
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
 	if (err < 0)
@@ -828,24 +857,27 @@  static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
 	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
 }
 
-static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
+static int add_recvbuf_mergeable(struct virtnet_info *vi,
+				 struct receive_queue *rq, gfp_t gfp)
 {
 	struct page_frag *alloc_frag = &rq->alloc_frag;
+	unsigned int headroom = virtnet_get_headroom(vi);
 	char *buf;
 	unsigned long ctx;
 	int err;
 	unsigned int len, hole;
 
 	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
-	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
+	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
 		return -ENOMEM;
 
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	buf += headroom; /* advance address leaving hole at front of pkt */
 	ctx = mergeable_buf_to_ctx(buf, len);
 	get_page(alloc_frag->page);
-	alloc_frag->offset += len;
+	alloc_frag->offset += len + headroom;
 	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < len) {
+	if (hole < len + headroom) {
 		/* To avoid internal fragmentation, if there is very likely not
 		 * enough space for another buffer, add the remaining space to
 		 * the current buffer. This extra space is not included in
@@ -879,7 +911,7 @@  static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
 	gfp |= __GFP_COLD;
 	do {
 		if (vi->mergeable_rx_bufs)
-			err = add_recvbuf_mergeable(rq, gfp);
+			err = add_recvbuf_mergeable(vi, rq, gfp);
 		else if (vi->big_packets)
 			err = add_recvbuf_big(vi, rq, gfp);
 		else
@@ -1702,6 +1734,7 @@  static void virtnet_freeze_down(struct virtio_device *vdev)
 }
 
 static int init_vqs(struct virtnet_info *vi);
+static void _remove_vq_common(struct virtnet_info *vi);
 
 static int virtnet_restore_up(struct virtio_device *vdev)
 {
@@ -1727,12 +1760,45 @@  static int virtnet_restore_up(struct virtio_device *vdev)
 	return err;
 }
 
+static int virtnet_reset(struct virtnet_info *vi)
+{
+	struct virtio_device *dev = vi->vdev;
+	int ret;
+
+	virtio_config_disable(dev);
+	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
+	virtnet_freeze_down(dev);
+	_remove_vq_common(vi);
+
+	dev->config->reset(dev);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
+
+	ret = virtio_finalize_features(dev);
+	if (ret)
+		goto err;
+
+	ret = virtnet_restore_up(dev);
+	if (ret)
+		goto err;
+	ret = _virtnet_set_queues(vi, vi->curr_queue_pairs);
+	if (ret)
+		goto err;
+
+	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
+	virtio_config_enable(dev);
+	return 0;
+err:
+	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
+	return ret;
+}
+
 static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 {
 	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct bpf_prog *old_prog;
-	u16 xdp_qp = 0, curr_qp;
+	u16 oxdp_qp, xdp_qp = 0, curr_qp;
 	int i, err;
 
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1764,21 +1830,32 @@  static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 		return -ENOMEM;
 	}
 
+	if (prog) {
+		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+
 	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
 	if (err) {
 		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
-		return err;
+		goto virtio_queue_err;
 	}
 
-	if (prog) {
-		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
-		if (IS_ERR(prog)) {
-			_virtnet_set_queues(vi, curr_qp);
-			return PTR_ERR(prog);
-		}
+	oxdp_qp = vi->xdp_queue_pairs;
+
+	/* Changing the headroom in buffers is a disruptive operation because
+	 * existing buffers must be flushed and reallocated. This will happen
+	 * when a xdp program is initially added or xdp is disabled by removing
+	 * the xdp program resulting in number of XDP queues changing.
+	 */
+	if (vi->xdp_queue_pairs != xdp_qp) {
+		vi->xdp_queue_pairs = xdp_qp;
+		err = virtnet_reset(vi);
+		if (err)
+			goto virtio_reset_err;
 	}
 
-	vi->xdp_queue_pairs = xdp_qp;
 	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
 
 	for (i = 0; i < vi->max_queue_pairs; i++) {
@@ -1789,6 +1866,21 @@  static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 	}
 
 	return 0;
+
+virtio_reset_err:
+	/* On reset error do our best to unwind XDP changes inflight and return
+	 * error up to user space for resolution. The underlying reset hung on
+	 * us so not much we can do here.
+	 */
+	dev_warn(&dev->dev, "XDP reset failure and queues unstable\n");
+	vi->xdp_queue_pairs = oxdp_qp;
+virtio_queue_err:
+	/* On queue set error we can unwind bpf ref count and user space can
+	 * retry this is most likely an allocation failure.
+	 */
+	if (prog)
+		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
+	return err;
 }
 
 static bool virtnet_xdp_query(struct net_device *dev)
@@ -2382,6 +2474,15 @@  static int virtnet_probe(struct virtio_device *vdev)
 	return err;
 }
 
+static void _remove_vq_common(struct virtnet_info *vi)
+{
+	vi->vdev->config->reset(vi->vdev);
+	free_unused_bufs(vi);
+	_free_receive_bufs(vi);
+	free_receive_page_frags(vi);
+	virtnet_del_vqs(vi);
+}
+
 static void remove_vq_common(struct virtnet_info *vi)
 {
 	vi->vdev->config->reset(vi->vdev);