diff mbox series

[vhost,v2,19/19] virtio_net: sq support premapped mode

Message ID 20240223082726.52915-20-xuanzhuo@linux.alibaba.com
State Superseded
Headers show
Series virtio: drivers maintain dma info for premapped vq | expand

Commit Message

Xuan Zhuo Feb. 23, 2024, 8:27 a.m. UTC
If the xsk is enabling, the xsk tx will share the send queue.
But the xsk requires that the send queue use the premapped mode.
So the send queue must support premapped mode.

cmd:
    sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
        -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
CPU:
    Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz

Machine:
    ecs.g7.2xlarge(Aliyun)

before:              1600010.00
after(no-premapped): 1599966.00
after(premapped):    1600014.00

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 132 insertions(+), 4 deletions(-)

Comments

Michael S. Tsirkin Feb. 25, 2024, 8:38 a.m. UTC | #1
On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> If the xsk is enabling, the xsk tx will share the send queue.
> But the xsk requires that the send queue use the premapped mode.
> So the send queue must support premapped mode.
> 
> cmd:
>     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
>         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> CPU:
>     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> 
> Machine:
>     ecs.g7.2xlarge(Aliyun)
> 
> before:              1600010.00
> after(no-premapped): 1599966.00
> after(premapped):    1600014.00
> 
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
>  1 file changed, 132 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 7715bb7032ec..b83ef6afc4fb 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
>  	u16 need_sync;
>  };
>  
> +struct virtnet_sq_dma {
> +	union {
> +		struct virtnet_sq_dma *next;
> +		void *data;
> +	};
> +
> +	u32 num;
> +
> +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> +	u32 len[MAX_SKB_FRAGS + 2];
> +};
> +
> +struct virtnet_sq_dma_head {
> +	/* record for kfree */
> +	void *p;
> +
> +	struct virtnet_sq_dma *free;
> +};
> +
>  /* Internal representation of a send virtqueue */
>  struct send_queue {
>  	/* Virtqueue associated with this send _queue */
> @@ -165,6 +184,8 @@ struct send_queue {
>  
>  	/* Record whether sq is in reset state. */
>  	bool reset;
> +
> +	struct virtnet_sq_dma_head dmainfo;
>  };
>  
>  /* Internal representation of a receive virtqueue */
> @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
>  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
>  }
>  
> +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> +{
> +	struct virtnet_sq_dma *d;
> +	int i;
> +
> +	d = *data;
> +	*data = d->data;
> +
> +	for (i = 0; i < d->num; ++i)
> +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> +					       DMA_TO_DEVICE, 0);
> +
> +	d->next = sq->dmainfo.free;
> +	sq->dmainfo.free = d;
> +
> +	return d;
> +}
> +
> +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> +						int nents, void *data)
> +{
> +	struct virtnet_sq_dma *d;
> +	struct scatterlist *sg;
> +	int i;
> +
> +	if (!sq->dmainfo.free)
> +		return NULL;
> +
> +	d = sq->dmainfo.free;
> +	sq->dmainfo.free = d->next;
> +
> +	for_each_sg(sq->sg, sg, nents, i) {
> +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> +			goto err;
> +
> +		d->addr[i] = sg->dma_address;
> +		d->len[i] = sg->length;
> +	}
> +
> +	d->data = data;
> +	d->num = i;
> +	return d;
> +
> +err:
> +	d->num = i;
> +	virtnet_sq_unmap(sq, (void **)&d);
> +	return NULL;
> +}


Do I see a reimplementation of linux/llist.h here?


> +
> +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> +{
> +	int ret;
> +
> +	if (sq->vq->premapped) {
> +		data = virtnet_sq_map_sg(sq, num, data);
> +		if (!data)
> +			return -ENOMEM;
> +	}
> +
> +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> +	if (ret && sq->vq->premapped)
> +		virtnet_sq_unmap(sq, &data);
> +
> +	return ret;
> +}
> +
> +static int virtnet_sq_init_dma_mate(struct send_queue *sq)

Mate? The popular south african drink?

> +{
> +	struct virtnet_sq_dma *d;
> +	int num, i;
> +
> +	num = virtqueue_get_vring_size(sq->vq);
> +
> +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> +	if (!sq->dmainfo.free)
> +		return -ENOMEM;


This could be quite a bit of memory for a large queue.  And for a bunch
of common cases where unmap is a nop (e.g. iommu pt) this does nothing
useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
which is super common.

A while ago I proposed:
- extend DMA APIs so one can query whether unmap is a nop
  and whether sync is a nop
- virtio wrapper taking into account PLATFORM_ACCESS too

then we can save all this work and memory when not needed.



> +
> +	sq->dmainfo.p = sq->dmainfo.free;
> +
> +	for (i = 0; i < num; ++i) {
> +		d = &sq->dmainfo.free[i];
> +		d->next = d + 1;
> +	}
> +
> +	d->next = NULL;
> +
> +	return 0;
> +}
> +
>  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>  			    struct virtnet_sq_free_stats *stats)
>  {
> @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>  		++stats->packets;
>  
> +		if (sq->vq->premapped)
> +			virtnet_sq_unmap(sq, &ptr);
> +
>  		if (!is_xdp_frame(ptr)) {
>  			struct sk_buff *skb = ptr;
>  
> @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
>  			    skb_frag_size(frag), skb_frag_off(frag));
>  	}
>  
> -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
>  	if (unlikely(err))
>  		return -ENOSPC; /* Caller handle free/refcnt */
>  
> @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
>  			return num_sg;
>  		num_sg++;
>  	}
> -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> +	return virtnet_add_outbuf(sq, num_sg, skb);
>  }
>  
>  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		__netif_napi_del(&vi->rq[i].napi);
>  		__netif_napi_del(&vi->sq[i].napi);
> +
> +		kfree(vi->sq[i].dmainfo.p);
>  	}
>  
>  	/* We called __netif_napi_del(),
> @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
>  
>  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
>  {
> +	struct virtnet_info *vi = vq->vdev->priv;
> +	struct send_queue *sq;
> +	int i = vq2rxq(vq);
> +
> +	sq = &vi->sq[i];
> +
> +	if (sq->vq->premapped)
> +		virtnet_sq_unmap(sq, &buf);
> +
>  	if (!is_xdp_frame(buf))
>  		dev_kfree_skb(buf);
>  	else
> @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
>  		if (ctx)
>  			ctx[rxq2vq(i)] = true;
>  
> -		if (premapped)
> +		if (premapped) {
>  			premapped[rxq2vq(i)] = true;
> +			premapped[txq2vq(i)] = true;
> +		}
>  	}
>  
>  	cfg.nvqs      = total_vqs;
> @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
>  		vi->rq[i].vq = vqs[rxq2vq(i)];
>  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
>  		vi->sq[i].vq = vqs[txq2vq(i)];
> +
> +		if (vi->sq[i].vq->premapped)
> +			virtnet_sq_init_dma_mate(&vi->sq[i]);
>  	}
>  
>  	/* run here: ret == 0. */
> -- 
> 2.32.0.3.g01195cf9f
Xuan Zhuo Feb. 26, 2024, 6:11 a.m. UTC | #2
On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > If the xsk is enabling, the xsk tx will share the send queue.
> > But the xsk requires that the send queue use the premapped mode.
> > So the send queue must support premapped mode.
> >
> > cmd:
> >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > CPU:
> >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> >
> > Machine:
> >     ecs.g7.2xlarge(Aliyun)
> >
> > before:              1600010.00
> > after(no-premapped): 1599966.00
> > after(premapped):    1600014.00
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 132 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 7715bb7032ec..b83ef6afc4fb 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> >  	u16 need_sync;
> >  };
> >
> > +

[...]

> > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > +						int nents, void *data)
> > +{
> > +	struct virtnet_sq_dma *d;
> > +	struct scatterlist *sg;
> > +	int i;
> > +
> > +	if (!sq->dmainfo.free)
> > +		return NULL;
> > +
> > +	d = sq->dmainfo.free;
> > +	sq->dmainfo.free = d->next;
> > +
> > +	for_each_sg(sq->sg, sg, nents, i) {
> > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > +			goto err;
> > +
> > +		d->addr[i] = sg->dma_address;
> > +		d->len[i] = sg->length;
> > +	}
> > +
> > +	d->data = data;
> > +	d->num = i;
> > +	return d;
> > +
> > +err:
> > +	d->num = i;
> > +	virtnet_sq_unmap(sq, (void **)&d);
> > +	return NULL;
> > +}
>
>
> Do I see a reimplementation of linux/llist.h here?

YES. This can be done by the APIs of linux/lllist.h.

But now, there is not __llist_del_first() (That will be used by
virtnet_sq_map_sg()).
And that is simple and just two places may use the APIs, so I implement it
directly.

>
>
> > +
> > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > +{
> > +	int ret;
> > +
> > +	if (sq->vq->premapped) {
> > +		data = virtnet_sq_map_sg(sq, num, data);
> > +		if (!data)
> > +			return -ENOMEM;
> > +	}
> > +
> > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > +	if (ret && sq->vq->premapped)
> > +		virtnet_sq_unmap(sq, &data);
> > +
> > +	return ret;
> > +}
> > +
> > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
>
> Mate? The popular south african drink?

Sorry, should be meta, I mean metadata.

>
> > +{
> > +	struct virtnet_sq_dma *d;
> > +	int num, i;
> > +
> > +	num = virtqueue_get_vring_size(sq->vq);
> > +
> > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > +	if (!sq->dmainfo.free)
> > +		return -ENOMEM;
>
>
> This could be quite a bit of memory for a large queue.  And for a bunch
> of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> useful at all.

Then can we skip the unmap api, so pass a zero to the unmap api?

> And also, this does nothing useful if PLATFORM_ACCESS is off
> which is super common.

That is ok. That just work when PLATFORM_ACCESS is on.

Thanks.

>
> A while ago I proposed:
> - extend DMA APIs so one can query whether unmap is a nop
>   and whether sync is a nop
> - virtio wrapper taking into account PLATFORM_ACCESS too
>
> then we can save all this work and memory when not needed.
>
>
>
> > +
> > +	sq->dmainfo.p = sq->dmainfo.free;
> > +
> > +	for (i = 0; i < num; ++i) {
> > +		d = &sq->dmainfo.free[i];
> > +		d->next = d + 1;
> > +	}
> > +
> > +	d->next = NULL;
> > +
> > +	return 0;
> > +}
> > +
> >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> >  			    struct virtnet_sq_free_stats *stats)
> >  {
> > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> >  		++stats->packets;
> >
> > +		if (sq->vq->premapped)
> > +			virtnet_sq_unmap(sq, &ptr);
> > +
> >  		if (!is_xdp_frame(ptr)) {
> >  			struct sk_buff *skb = ptr;
> >
> > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> >  			    skb_frag_size(frag), skb_frag_off(frag));
> >  	}
> >
> > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> >  	if (unlikely(err))
> >  		return -ENOSPC; /* Caller handle free/refcnt */
> >
> > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> >  			return num_sg;
> >  		num_sg++;
> >  	}
> > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > +	return virtnet_add_outbuf(sq, num_sg, skb);
> >  }
> >
> >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> >  		__netif_napi_del(&vi->rq[i].napi);
> >  		__netif_napi_del(&vi->sq[i].napi);
> > +
> > +		kfree(vi->sq[i].dmainfo.p);
> >  	}
> >
> >  	/* We called __netif_napi_del(),
> > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> >
> >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> >  {
> > +	struct virtnet_info *vi = vq->vdev->priv;
> > +	struct send_queue *sq;
> > +	int i = vq2rxq(vq);
> > +
> > +	sq = &vi->sq[i];
> > +
> > +	if (sq->vq->premapped)
> > +		virtnet_sq_unmap(sq, &buf);
> > +
> >  	if (!is_xdp_frame(buf))
> >  		dev_kfree_skb(buf);
> >  	else
> > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >  		if (ctx)
> >  			ctx[rxq2vq(i)] = true;
> >
> > -		if (premapped)
> > +		if (premapped) {
> >  			premapped[rxq2vq(i)] = true;
> > +			premapped[txq2vq(i)] = true;
> > +		}
> >  	}
> >
> >  	cfg.nvqs      = total_vqs;
> > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > +
> > +		if (vi->sq[i].vq->premapped)
> > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> >  	}
> >
> >  	/* run here: ret == 0. */
> > --
> > 2.32.0.3.g01195cf9f
>
Xuan Zhuo Feb. 26, 2024, 7:44 a.m. UTC | #3
On Mon, 26 Feb 2024 14:11:01 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > If the xsk is enabling, the xsk tx will share the send queue.
> > > But the xsk requires that the send queue use the premapped mode.
> > > So the send queue must support premapped mode.
> > >
> > > cmd:
> > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > CPU:
> > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > >
> > > Machine:
> > >     ecs.g7.2xlarge(Aliyun)
> > >
> > > before:              1600010.00
> > > after(no-premapped): 1599966.00
> > > after(premapped):    1600014.00
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > >  	u16 need_sync;
> > >  };
> > >
> > > +
>
> [...]
>
> > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > +						int nents, void *data)
> > > +{
> > > +	struct virtnet_sq_dma *d;
> > > +	struct scatterlist *sg;
> > > +	int i;
> > > +
> > > +	if (!sq->dmainfo.free)
> > > +		return NULL;
> > > +
> > > +	d = sq->dmainfo.free;
> > > +	sq->dmainfo.free = d->next;
> > > +
> > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > +			goto err;
> > > +
> > > +		d->addr[i] = sg->dma_address;
> > > +		d->len[i] = sg->length;
> > > +	}
> > > +
> > > +	d->data = data;
> > > +	d->num = i;
> > > +	return d;
> > > +
> > > +err:
> > > +	d->num = i;
> > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > +	return NULL;
> > > +}
> >
> >
> > Do I see a reimplementation of linux/llist.h here?
>
> YES. This can be done by the APIs of linux/lllist.h.
>
> But now, there is not __llist_del_first() (That will be used by
> virtnet_sq_map_sg()).
> And that is simple and just two places may use the APIs, so I implement it
> directly.
>
> >
> >
> > > +
> > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > +{
> > > +	int ret;
> > > +
> > > +	if (sq->vq->premapped) {
> > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > +		if (!data)
> > > +			return -ENOMEM;
> > > +	}
> > > +
> > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > +	if (ret && sq->vq->premapped)
> > > +		virtnet_sq_unmap(sq, &data);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> >
> > Mate? The popular south african drink?
>
> Sorry, should be meta, I mean metadata.
>
> >
> > > +{
> > > +	struct virtnet_sq_dma *d;
> > > +	int num, i;
> > > +
> > > +	num = virtqueue_get_vring_size(sq->vq);
> > > +
> > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > +	if (!sq->dmainfo.free)
> > > +		return -ENOMEM;
> >
> >
> > This could be quite a bit of memory for a large queue.  And for a bunch
> > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > useful at all.
>
> Then can we skip the unmap api, so pass a zero to the unmap api?

Typo.

Then can we skip the unmap api, or pass a zero(because the dma address is not
recorded) to the unmap api?

Thanks



>
> > And also, this does nothing useful if PLATFORM_ACCESS is off
> > which is super common.
>
> That is ok. That just work when PLATFORM_ACCESS is on.
>
> Thanks.
>
> >
> > A while ago I proposed:
> > - extend DMA APIs so one can query whether unmap is a nop
> >   and whether sync is a nop
> > - virtio wrapper taking into account PLATFORM_ACCESS too
> >
> > then we can save all this work and memory when not needed.
> >
> >
> >
> > > +
> > > +	sq->dmainfo.p = sq->dmainfo.free;
> > > +
> > > +	for (i = 0; i < num; ++i) {
> > > +		d = &sq->dmainfo.free[i];
> > > +		d->next = d + 1;
> > > +	}
> > > +
> > > +	d->next = NULL;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > >  			    struct virtnet_sq_free_stats *stats)
> > >  {
> > > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > >  		++stats->packets;
> > >
> > > +		if (sq->vq->premapped)
> > > +			virtnet_sq_unmap(sq, &ptr);
> > > +
> > >  		if (!is_xdp_frame(ptr)) {
> > >  			struct sk_buff *skb = ptr;
> > >
> > > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > >  			    skb_frag_size(frag), skb_frag_off(frag));
> > >  	}
> > >
> > > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> > >  	if (unlikely(err))
> > >  		return -ENOSPC; /* Caller handle free/refcnt */
> > >
> > > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> > >  			return num_sg;
> > >  		num_sg++;
> > >  	}
> > > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > > +	return virtnet_add_outbuf(sq, num_sg, skb);
> > >  }
> > >
> > >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > >  		__netif_napi_del(&vi->rq[i].napi);
> > >  		__netif_napi_del(&vi->sq[i].napi);
> > > +
> > > +		kfree(vi->sq[i].dmainfo.p);
> > >  	}
> > >
> > >  	/* We called __netif_napi_del(),
> > > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> > >
> > >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > >  {
> > > +	struct virtnet_info *vi = vq->vdev->priv;
> > > +	struct send_queue *sq;
> > > +	int i = vq2rxq(vq);
> > > +
> > > +	sq = &vi->sq[i];
> > > +
> > > +	if (sq->vq->premapped)
> > > +		virtnet_sq_unmap(sq, &buf);
> > > +
> > >  	if (!is_xdp_frame(buf))
> > >  		dev_kfree_skb(buf);
> > >  	else
> > > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > >  		if (ctx)
> > >  			ctx[rxq2vq(i)] = true;
> > >
> > > -		if (premapped)
> > > +		if (premapped) {
> > >  			premapped[rxq2vq(i)] = true;
> > > +			premapped[txq2vq(i)] = true;
> > > +		}
> > >  	}
> > >
> > >  	cfg.nvqs      = total_vqs;
> > > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> > >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > > +
> > > +		if (vi->sq[i].vq->premapped)
> > > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> > >  	}
> > >
> > >  	/* run here: ret == 0. */
> > > --
> > > 2.32.0.3.g01195cf9f
> >
>
Xuan Zhuo Feb. 26, 2024, 9:24 a.m. UTC | #4
On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > If the xsk is enabling, the xsk tx will share the send queue.
> > But the xsk requires that the send queue use the premapped mode.
> > So the send queue must support premapped mode.
> >
> > cmd:
> >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > CPU:
> >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> >
> > Machine:
> >     ecs.g7.2xlarge(Aliyun)
> >
> > before:              1600010.00
> > after(no-premapped): 1599966.00
> > after(premapped):    1600014.00
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 132 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 7715bb7032ec..b83ef6afc4fb 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> >  	u16 need_sync;
> >  };
> >
> > +struct virtnet_sq_dma {
> > +	union {
> > +		struct virtnet_sq_dma *next;
> > +		void *data;
> > +	};
> > +
> > +	u32 num;
> > +
> > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > +	u32 len[MAX_SKB_FRAGS + 2];
> > +};
> > +
> > +struct virtnet_sq_dma_head {
> > +	/* record for kfree */
> > +	void *p;
> > +
> > +	struct virtnet_sq_dma *free;
> > +};
> > +
> >  /* Internal representation of a send virtqueue */
> >  struct send_queue {
> >  	/* Virtqueue associated with this send _queue */
> > @@ -165,6 +184,8 @@ struct send_queue {
> >
> >  	/* Record whether sq is in reset state. */
> >  	bool reset;
> > +
> > +	struct virtnet_sq_dma_head dmainfo;
> >  };
> >
> >  /* Internal representation of a receive virtqueue */
> > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> >  }
> >
> > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > +{
> > +	struct virtnet_sq_dma *d;
> > +	int i;
> > +
> > +	d = *data;
> > +	*data = d->data;
> > +
> > +	for (i = 0; i < d->num; ++i)
> > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > +					       DMA_TO_DEVICE, 0);
> > +
> > +	d->next = sq->dmainfo.free;
> > +	sq->dmainfo.free = d;
> > +
> > +	return d;
> > +}
> > +
> > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > +						int nents, void *data)
> > +{
> > +	struct virtnet_sq_dma *d;
> > +	struct scatterlist *sg;
> > +	int i;
> > +
> > +	if (!sq->dmainfo.free)
> > +		return NULL;
> > +
> > +	d = sq->dmainfo.free;
> > +	sq->dmainfo.free = d->next;
> > +
> > +	for_each_sg(sq->sg, sg, nents, i) {
> > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > +			goto err;
> > +
> > +		d->addr[i] = sg->dma_address;
> > +		d->len[i] = sg->length;
> > +	}
> > +
> > +	d->data = data;
> > +	d->num = i;
> > +	return d;
> > +
> > +err:
> > +	d->num = i;
> > +	virtnet_sq_unmap(sq, (void **)&d);
> > +	return NULL;
> > +}
>
>
> Do I see a reimplementation of linux/llist.h here?
>
>
> > +
> > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > +{
> > +	int ret;
> > +
> > +	if (sq->vq->premapped) {
> > +		data = virtnet_sq_map_sg(sq, num, data);
> > +		if (!data)
> > +			return -ENOMEM;
> > +	}
> > +
> > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > +	if (ret && sq->vq->premapped)
> > +		virtnet_sq_unmap(sq, &data);
> > +
> > +	return ret;
> > +}
> > +
> > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
>
> Mate? The popular south african drink?
>
> > +{
> > +	struct virtnet_sq_dma *d;
> > +	int num, i;
> > +
> > +	num = virtqueue_get_vring_size(sq->vq);
> > +
> > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > +	if (!sq->dmainfo.free)
> > +		return -ENOMEM;
>
>
> This could be quite a bit of memory for a large queue.  And for a bunch
> of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> which is super common.
>
> A while ago I proposed:
> - extend DMA APIs so one can query whether unmap is a nop


We may have trouble for this.

dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
		size_t offset, size_t size, enum dma_data_direction dir,
		unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);
	dma_addr_t addr;

	BUG_ON(!valid_dma_direction(dir));

	if (WARN_ON_ONCE(!dev->dma_mask))
		return DMA_MAPPING_ERROR;

	if (dma_map_direct(dev, ops) ||
	    arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
	else
		addr = ops->map_page(dev, page, offset, size, dir, attrs);
	kmsan_handle_dma(page, offset, size, dir);
	debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);

	return addr;
}

arch_dma_map_page_direct will check the dma address.
So we can not judge by the API in advance.

Thanks.




>   and whether sync is a nop
> - virtio wrapper taking into account PLATFORM_ACCESS too
>
> then we can save all this work and memory when not needed.
>
>
>
> > +
> > +	sq->dmainfo.p = sq->dmainfo.free;
> > +
> > +	for (i = 0; i < num; ++i) {
> > +		d = &sq->dmainfo.free[i];
> > +		d->next = d + 1;
> > +	}
> > +
> > +	d->next = NULL;
> > +
> > +	return 0;
> > +}
> > +
> >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> >  			    struct virtnet_sq_free_stats *stats)
> >  {
> > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> >  		++stats->packets;
> >
> > +		if (sq->vq->premapped)
> > +			virtnet_sq_unmap(sq, &ptr);
> > +
> >  		if (!is_xdp_frame(ptr)) {
> >  			struct sk_buff *skb = ptr;
> >
> > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> >  			    skb_frag_size(frag), skb_frag_off(frag));
> >  	}
> >
> > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> >  	if (unlikely(err))
> >  		return -ENOSPC; /* Caller handle free/refcnt */
> >
> > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> >  			return num_sg;
> >  		num_sg++;
> >  	}
> > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > +	return virtnet_add_outbuf(sq, num_sg, skb);
> >  }
> >
> >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> >  		__netif_napi_del(&vi->rq[i].napi);
> >  		__netif_napi_del(&vi->sq[i].napi);
> > +
> > +		kfree(vi->sq[i].dmainfo.p);
> >  	}
> >
> >  	/* We called __netif_napi_del(),
> > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> >
> >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> >  {
> > +	struct virtnet_info *vi = vq->vdev->priv;
> > +	struct send_queue *sq;
> > +	int i = vq2rxq(vq);
> > +
> > +	sq = &vi->sq[i];
> > +
> > +	if (sq->vq->premapped)
> > +		virtnet_sq_unmap(sq, &buf);
> > +
> >  	if (!is_xdp_frame(buf))
> >  		dev_kfree_skb(buf);
> >  	else
> > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >  		if (ctx)
> >  			ctx[rxq2vq(i)] = true;
> >
> > -		if (premapped)
> > +		if (premapped) {
> >  			premapped[rxq2vq(i)] = true;
> > +			premapped[txq2vq(i)] = true;
> > +		}
> >  	}
> >
> >  	cfg.nvqs      = total_vqs;
> > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > +
> > +		if (vi->sq[i].vq->premapped)
> > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> >  	}
> >
> >  	/* run here: ret == 0. */
> > --
> > 2.32.0.3.g01195cf9f
>
Xuan Zhuo Feb. 26, 2024, 11:20 a.m. UTC | #5
On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > If the xsk is enabling, the xsk tx will share the send queue.
> > But the xsk requires that the send queue use the premapped mode.
> > So the send queue must support premapped mode.
> >
> > cmd:
> >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > CPU:
> >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> >
> > Machine:
> >     ecs.g7.2xlarge(Aliyun)
> >
> > before:              1600010.00
> > after(no-premapped): 1599966.00
> > after(premapped):    1600014.00
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 132 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 7715bb7032ec..b83ef6afc4fb 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> >  	u16 need_sync;
> >  };
> >
> > +struct virtnet_sq_dma {
> > +	union {
> > +		struct virtnet_sq_dma *next;
> > +		void *data;
> > +	};
> > +
> > +	u32 num;
> > +
> > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > +	u32 len[MAX_SKB_FRAGS + 2];
> > +};
> > +
> > +struct virtnet_sq_dma_head {
> > +	/* record for kfree */
> > +	void *p;
> > +
> > +	struct virtnet_sq_dma *free;
> > +};
> > +
> >  /* Internal representation of a send virtqueue */
> >  struct send_queue {
> >  	/* Virtqueue associated with this send _queue */
> > @@ -165,6 +184,8 @@ struct send_queue {
> >
> >  	/* Record whether sq is in reset state. */
> >  	bool reset;
> > +
> > +	struct virtnet_sq_dma_head dmainfo;
> >  };
> >
> >  /* Internal representation of a receive virtqueue */
> > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> >  }
> >
> > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > +{
> > +	struct virtnet_sq_dma *d;
> > +	int i;
> > +
> > +	d = *data;
> > +	*data = d->data;
> > +
> > +	for (i = 0; i < d->num; ++i)
> > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > +					       DMA_TO_DEVICE, 0);
> > +
> > +	d->next = sq->dmainfo.free;
> > +	sq->dmainfo.free = d;
> > +
> > +	return d;
> > +}
> > +
> > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > +						int nents, void *data)
> > +{
> > +	struct virtnet_sq_dma *d;
> > +	struct scatterlist *sg;
> > +	int i;
> > +
> > +	if (!sq->dmainfo.free)
> > +		return NULL;
> > +
> > +	d = sq->dmainfo.free;
> > +	sq->dmainfo.free = d->next;
> > +
> > +	for_each_sg(sq->sg, sg, nents, i) {
> > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > +			goto err;
> > +
> > +		d->addr[i] = sg->dma_address;
> > +		d->len[i] = sg->length;
> > +	}
> > +
> > +	d->data = data;
> > +	d->num = i;
> > +	return d;
> > +
> > +err:
> > +	d->num = i;
> > +	virtnet_sq_unmap(sq, (void **)&d);
> > +	return NULL;
> > +}
>
>
> Do I see a reimplementation of linux/llist.h here?
>
>
> > +
> > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > +{
> > +	int ret;
> > +
> > +	if (sq->vq->premapped) {
> > +		data = virtnet_sq_map_sg(sq, num, data);
> > +		if (!data)
> > +			return -ENOMEM;
> > +	}
> > +
> > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > +	if (ret && sq->vq->premapped)
> > +		virtnet_sq_unmap(sq, &data);
> > +
> > +	return ret;
> > +}
> > +
> > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
>
> Mate? The popular south african drink?
>
> > +{
> > +	struct virtnet_sq_dma *d;
> > +	int num, i;
> > +
> > +	num = virtqueue_get_vring_size(sq->vq);
> > +
> > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > +	if (!sq->dmainfo.free)
> > +		return -ENOMEM;
>
>
> This could be quite a bit of memory for a large queue.  And for a bunch
> of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> which is super common.
>
> A while ago I proposed:
> - extend DMA APIs so one can query whether unmap is a nop

I think such code is ok:

bool dma_is_direct(struct device *dev)
{
	if (!dma_map_direct(dev, ops))
		return false;

	if (is_swiotlb_force_bounce(dev))
		return false;

	return true;
}

@Christoph Hellwig

Thanks.


>   and whether sync is a nop
> - virtio wrapper taking into account PLATFORM_ACCESS too
>
> then we can save all this work and memory when not needed.
>
>
>
> > +
> > +	sq->dmainfo.p = sq->dmainfo.free;
> > +
> > +	for (i = 0; i < num; ++i) {
> > +		d = &sq->dmainfo.free[i];
> > +		d->next = d + 1;
> > +	}
> > +
> > +	d->next = NULL;
> > +
> > +	return 0;
> > +}
> > +
> >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> >  			    struct virtnet_sq_free_stats *stats)
> >  {
> > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> >  		++stats->packets;
> >
> > +		if (sq->vq->premapped)
> > +			virtnet_sq_unmap(sq, &ptr);
> > +
> >  		if (!is_xdp_frame(ptr)) {
> >  			struct sk_buff *skb = ptr;
> >
> > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> >  			    skb_frag_size(frag), skb_frag_off(frag));
> >  	}
> >
> > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> >  	if (unlikely(err))
> >  		return -ENOSPC; /* Caller handle free/refcnt */
> >
> > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> >  			return num_sg;
> >  		num_sg++;
> >  	}
> > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > +	return virtnet_add_outbuf(sq, num_sg, skb);
> >  }
> >
> >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> >  		__netif_napi_del(&vi->rq[i].napi);
> >  		__netif_napi_del(&vi->sq[i].napi);
> > +
> > +		kfree(vi->sq[i].dmainfo.p);
> >  	}
> >
> >  	/* We called __netif_napi_del(),
> > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> >
> >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> >  {
> > +	struct virtnet_info *vi = vq->vdev->priv;
> > +	struct send_queue *sq;
> > +	int i = vq2rxq(vq);
> > +
> > +	sq = &vi->sq[i];
> > +
> > +	if (sq->vq->premapped)
> > +		virtnet_sq_unmap(sq, &buf);
> > +
> >  	if (!is_xdp_frame(buf))
> >  		dev_kfree_skb(buf);
> >  	else
> > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >  		if (ctx)
> >  			ctx[rxq2vq(i)] = true;
> >
> > -		if (premapped)
> > +		if (premapped) {
> >  			premapped[rxq2vq(i)] = true;
> > +			premapped[txq2vq(i)] = true;
> > +		}
> >  	}
> >
> >  	cfg.nvqs      = total_vqs;
> > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > +
> > +		if (vi->sq[i].vq->premapped)
> > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> >  	}
> >
> >  	/* run here: ret == 0. */
> > --
> > 2.32.0.3.g01195cf9f
>
Michael S. Tsirkin Feb. 26, 2024, 11:31 a.m. UTC | #6
On Mon, Feb 26, 2024 at 07:20:40PM +0800, Xuan Zhuo wrote:
> On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > If the xsk is enabling, the xsk tx will share the send queue.
> > > But the xsk requires that the send queue use the premapped mode.
> > > So the send queue must support premapped mode.
> > >
> > > cmd:
> > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > CPU:
> > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > >
> > > Machine:
> > >     ecs.g7.2xlarge(Aliyun)
> > >
> > > before:              1600010.00
> > > after(no-premapped): 1599966.00
> > > after(premapped):    1600014.00
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > >  	u16 need_sync;
> > >  };
> > >
> > > +struct virtnet_sq_dma {
> > > +	union {
> > > +		struct virtnet_sq_dma *next;
> > > +		void *data;
> > > +	};
> > > +
> > > +	u32 num;
> > > +
> > > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > > +	u32 len[MAX_SKB_FRAGS + 2];
> > > +};
> > > +
> > > +struct virtnet_sq_dma_head {
> > > +	/* record for kfree */
> > > +	void *p;
> > > +
> > > +	struct virtnet_sq_dma *free;
> > > +};
> > > +
> > >  /* Internal representation of a send virtqueue */
> > >  struct send_queue {
> > >  	/* Virtqueue associated with this send _queue */
> > > @@ -165,6 +184,8 @@ struct send_queue {
> > >
> > >  	/* Record whether sq is in reset state. */
> > >  	bool reset;
> > > +
> > > +	struct virtnet_sq_dma_head dmainfo;
> > >  };
> > >
> > >  /* Internal representation of a receive virtqueue */
> > > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> > >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > >  }
> > >
> > > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > > +{
> > > +	struct virtnet_sq_dma *d;
> > > +	int i;
> > > +
> > > +	d = *data;
> > > +	*data = d->data;
> > > +
> > > +	for (i = 0; i < d->num; ++i)
> > > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > > +					       DMA_TO_DEVICE, 0);
> > > +
> > > +	d->next = sq->dmainfo.free;
> > > +	sq->dmainfo.free = d;
> > > +
> > > +	return d;
> > > +}
> > > +
> > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > +						int nents, void *data)
> > > +{
> > > +	struct virtnet_sq_dma *d;
> > > +	struct scatterlist *sg;
> > > +	int i;
> > > +
> > > +	if (!sq->dmainfo.free)
> > > +		return NULL;
> > > +
> > > +	d = sq->dmainfo.free;
> > > +	sq->dmainfo.free = d->next;
> > > +
> > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > +			goto err;
> > > +
> > > +		d->addr[i] = sg->dma_address;
> > > +		d->len[i] = sg->length;
> > > +	}
> > > +
> > > +	d->data = data;
> > > +	d->num = i;
> > > +	return d;
> > > +
> > > +err:
> > > +	d->num = i;
> > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > +	return NULL;
> > > +}
> >
> >
> > Do I see a reimplementation of linux/llist.h here?
> >
> >
> > > +
> > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > +{
> > > +	int ret;
> > > +
> > > +	if (sq->vq->premapped) {
> > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > +		if (!data)
> > > +			return -ENOMEM;
> > > +	}
> > > +
> > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > +	if (ret && sq->vq->premapped)
> > > +		virtnet_sq_unmap(sq, &data);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> >
> > Mate? The popular south african drink?
> >
> > > +{
> > > +	struct virtnet_sq_dma *d;
> > > +	int num, i;
> > > +
> > > +	num = virtqueue_get_vring_size(sq->vq);
> > > +
> > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > +	if (!sq->dmainfo.free)
> > > +		return -ENOMEM;
> >
> >
> > This could be quite a bit of memory for a large queue.  And for a bunch
> > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> > which is super common.
> >
> > A while ago I proposed:
> > - extend DMA APIs so one can query whether unmap is a nop
> 
> I think such code is ok:
> 
> bool dma_is_direct(struct device *dev)
> {
> 	if (!dma_map_direct(dev, ops))
> 		return false;

what is dma_map_direct? can't find it in the tree.

> 	if (is_swiotlb_force_bounce(dev))
> 		return false;
> 
> 	return true;
> }
> 
> @Christoph Hellwig
> 
> Thanks.
> 
> 
> >   and whether sync is a nop
> > - virtio wrapper taking into account PLATFORM_ACCESS too
> >
> > then we can save all this work and memory when not needed.
> >
> >
> >
> > > +
> > > +	sq->dmainfo.p = sq->dmainfo.free;
> > > +
> > > +	for (i = 0; i < num; ++i) {
> > > +		d = &sq->dmainfo.free[i];
> > > +		d->next = d + 1;
> > > +	}
> > > +
> > > +	d->next = NULL;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > >  			    struct virtnet_sq_free_stats *stats)
> > >  {
> > > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > >  		++stats->packets;
> > >
> > > +		if (sq->vq->premapped)
> > > +			virtnet_sq_unmap(sq, &ptr);
> > > +
> > >  		if (!is_xdp_frame(ptr)) {
> > >  			struct sk_buff *skb = ptr;
> > >
> > > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > >  			    skb_frag_size(frag), skb_frag_off(frag));
> > >  	}
> > >
> > > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> > >  	if (unlikely(err))
> > >  		return -ENOSPC; /* Caller handle free/refcnt */
> > >
> > > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> > >  			return num_sg;
> > >  		num_sg++;
> > >  	}
> > > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > > +	return virtnet_add_outbuf(sq, num_sg, skb);
> > >  }
> > >
> > >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > >  		__netif_napi_del(&vi->rq[i].napi);
> > >  		__netif_napi_del(&vi->sq[i].napi);
> > > +
> > > +		kfree(vi->sq[i].dmainfo.p);
> > >  	}
> > >
> > >  	/* We called __netif_napi_del(),
> > > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> > >
> > >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > >  {
> > > +	struct virtnet_info *vi = vq->vdev->priv;
> > > +	struct send_queue *sq;
> > > +	int i = vq2rxq(vq);
> > > +
> > > +	sq = &vi->sq[i];
> > > +
> > > +	if (sq->vq->premapped)
> > > +		virtnet_sq_unmap(sq, &buf);
> > > +
> > >  	if (!is_xdp_frame(buf))
> > >  		dev_kfree_skb(buf);
> > >  	else
> > > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > >  		if (ctx)
> > >  			ctx[rxq2vq(i)] = true;
> > >
> > > -		if (premapped)
> > > +		if (premapped) {
> > >  			premapped[rxq2vq(i)] = true;
> > > +			premapped[txq2vq(i)] = true;
> > > +		}
> > >  	}
> > >
> > >  	cfg.nvqs      = total_vqs;
> > > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> > >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > > +
> > > +		if (vi->sq[i].vq->premapped)
> > > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> > >  	}
> > >
> > >  	/* run here: ret == 0. */
> > > --
> > > 2.32.0.3.g01195cf9f
> >
Xuan Zhuo Feb. 26, 2024, 11:33 a.m. UTC | #7
On Mon, 26 Feb 2024 06:31:49 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Feb 26, 2024 at 07:20:40PM +0800, Xuan Zhuo wrote:
> > On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > > If the xsk is enabling, the xsk tx will share the send queue.
> > > > But the xsk requires that the send queue use the premapped mode.
> > > > So the send queue must support premapped mode.
> > > >
> > > > cmd:
> > > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > > CPU:
> > > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > > >
> > > > Machine:
> > > >     ecs.g7.2xlarge(Aliyun)
> > > >
> > > > before:              1600010.00
> > > > after(no-premapped): 1599966.00
> > > > after(premapped):    1600014.00
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > ---
> > > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > > >  	u16 need_sync;
> > > >  };
> > > >
> > > > +struct virtnet_sq_dma {
> > > > +	union {
> > > > +		struct virtnet_sq_dma *next;
> > > > +		void *data;
> > > > +	};
> > > > +
> > > > +	u32 num;
> > > > +
> > > > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > > > +	u32 len[MAX_SKB_FRAGS + 2];
> > > > +};
> > > > +
> > > > +struct virtnet_sq_dma_head {
> > > > +	/* record for kfree */
> > > > +	void *p;
> > > > +
> > > > +	struct virtnet_sq_dma *free;
> > > > +};
> > > > +
> > > >  /* Internal representation of a send virtqueue */
> > > >  struct send_queue {
> > > >  	/* Virtqueue associated with this send _queue */
> > > > @@ -165,6 +184,8 @@ struct send_queue {
> > > >
> > > >  	/* Record whether sq is in reset state. */
> > > >  	bool reset;
> > > > +
> > > > +	struct virtnet_sq_dma_head dmainfo;
> > > >  };
> > > >
> > > >  /* Internal representation of a receive virtqueue */
> > > > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> > > >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > > >  }
> > > >
> > > > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > > > +{
> > > > +	struct virtnet_sq_dma *d;
> > > > +	int i;
> > > > +
> > > > +	d = *data;
> > > > +	*data = d->data;
> > > > +
> > > > +	for (i = 0; i < d->num; ++i)
> > > > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > > > +					       DMA_TO_DEVICE, 0);
> > > > +
> > > > +	d->next = sq->dmainfo.free;
> > > > +	sq->dmainfo.free = d;
> > > > +
> > > > +	return d;
> > > > +}
> > > > +
> > > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > > +						int nents, void *data)
> > > > +{
> > > > +	struct virtnet_sq_dma *d;
> > > > +	struct scatterlist *sg;
> > > > +	int i;
> > > > +
> > > > +	if (!sq->dmainfo.free)
> > > > +		return NULL;
> > > > +
> > > > +	d = sq->dmainfo.free;
> > > > +	sq->dmainfo.free = d->next;
> > > > +
> > > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > > +			goto err;
> > > > +
> > > > +		d->addr[i] = sg->dma_address;
> > > > +		d->len[i] = sg->length;
> > > > +	}
> > > > +
> > > > +	d->data = data;
> > > > +	d->num = i;
> > > > +	return d;
> > > > +
> > > > +err:
> > > > +	d->num = i;
> > > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > > +	return NULL;
> > > > +}
> > >
> > >
> > > Do I see a reimplementation of linux/llist.h here?
> > >
> > >
> > > > +
> > > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	if (sq->vq->premapped) {
> > > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > > +		if (!data)
> > > > +			return -ENOMEM;
> > > > +	}
> > > > +
> > > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > > +	if (ret && sq->vq->premapped)
> > > > +		virtnet_sq_unmap(sq, &data);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> > >
> > > Mate? The popular south african drink?
> > >
> > > > +{
> > > > +	struct virtnet_sq_dma *d;
> > > > +	int num, i;
> > > > +
> > > > +	num = virtqueue_get_vring_size(sq->vq);
> > > > +
> > > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > > +	if (!sq->dmainfo.free)
> > > > +		return -ENOMEM;
> > >
> > >
> > > This could be quite a bit of memory for a large queue.  And for a bunch
> > > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > > useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> > > which is super common.
> > >
> > > A while ago I proposed:
> > > - extend DMA APIs so one can query whether unmap is a nop
> >
> > I think such code is ok:
> >
> > bool dma_is_direct(struct device *dev)
> > {
> > 	if (!dma_map_direct(dev, ops))
> > 		return false;
>
> what is dma_map_direct? can't find it in the tree.

YES.


diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58db8fd70471..5a8f7a927aa1 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -144,6 +144,18 @@ static inline bool dma_map_direct(struct device *dev,
        return dma_go_direct(dev, *dev->dma_mask, ops);
 }

+bool dma_is_direct(struct device *dev)
+{
+       if (!dma_map_direct(dev, ops))
+               return false;
+
+       if (is_swiotlb_force_bounce(dev))
+               return false;
+
+       return true;
+}
+EXPORT_SYMBOL(dma_unmap_page_attrs);
+

Thanks.

>
> > 	if (is_swiotlb_force_bounce(dev))
> > 		return false;
> >
> > 	return true;
> > }
> >
> > @Christoph Hellwig
> >
> > Thanks.
> >
> >
> > >   and whether sync is a nop
> > > - virtio wrapper taking into account PLATFORM_ACCESS too
> > >
> > > then we can save all this work and memory when not needed.
> > >
> > >
> > >
> > > > +
> > > > +	sq->dmainfo.p = sq->dmainfo.free;
> > > > +
> > > > +	for (i = 0; i < num; ++i) {
> > > > +		d = &sq->dmainfo.free[i];
> > > > +		d->next = d + 1;
> > > > +	}
> > > > +
> > > > +	d->next = NULL;
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > > >  			    struct virtnet_sq_free_stats *stats)
> > > >  {
> > > > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > > >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > > >  		++stats->packets;
> > > >
> > > > +		if (sq->vq->premapped)
> > > > +			virtnet_sq_unmap(sq, &ptr);
> > > > +
> > > >  		if (!is_xdp_frame(ptr)) {
> > > >  			struct sk_buff *skb = ptr;
> > > >
> > > > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > > >  			    skb_frag_size(frag), skb_frag_off(frag));
> > > >  	}
> > > >
> > > > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > > > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > > > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> > > >  	if (unlikely(err))
> > > >  		return -ENOSPC; /* Caller handle free/refcnt */
> > > >
> > > > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> > > >  			return num_sg;
> > > >  		num_sg++;
> > > >  	}
> > > > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > > > +	return virtnet_add_outbuf(sq, num_sg, skb);
> > > >  }
> > > >
> > > >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > >  		__netif_napi_del(&vi->rq[i].napi);
> > > >  		__netif_napi_del(&vi->sq[i].napi);
> > > > +
> > > > +		kfree(vi->sq[i].dmainfo.p);
> > > >  	}
> > > >
> > > >  	/* We called __netif_napi_del(),
> > > > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> > > >
> > > >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > >  {
> > > > +	struct virtnet_info *vi = vq->vdev->priv;
> > > > +	struct send_queue *sq;
> > > > +	int i = vq2rxq(vq);
> > > > +
> > > > +	sq = &vi->sq[i];
> > > > +
> > > > +	if (sq->vq->premapped)
> > > > +		virtnet_sq_unmap(sq, &buf);
> > > > +
> > > >  	if (!is_xdp_frame(buf))
> > > >  		dev_kfree_skb(buf);
> > > >  	else
> > > > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > >  		if (ctx)
> > > >  			ctx[rxq2vq(i)] = true;
> > > >
> > > > -		if (premapped)
> > > > +		if (premapped) {
> > > >  			premapped[rxq2vq(i)] = true;
> > > > +			premapped[txq2vq(i)] = true;
> > > > +		}
> > > >  	}
> > > >
> > > >  	cfg.nvqs      = total_vqs;
> > > > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> > > >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > > > +
> > > > +		if (vi->sq[i].vq->premapped)
> > > > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> > > >  	}
> > > >
> > > >  	/* run here: ret == 0. */
> > > > --
> > > > 2.32.0.3.g01195cf9f
> > >
>
Michael S. Tsirkin Feb. 26, 2024, 11:36 a.m. UTC | #8
On Mon, Feb 26, 2024 at 07:33:29PM +0800, Xuan Zhuo wrote:
> > what is dma_map_direct? can't find it in the tree.
> 
> YES.
> 
> 
> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> index 58db8fd70471..5a8f7a927aa1 100644
> --- a/kernel/dma/mapping.c
> +++ b/kernel/dma/mapping.c
> @@ -144,6 +144,18 @@ static inline bool dma_map_direct(struct device *dev,
>         return dma_go_direct(dev, *dev->dma_mask, ops);
>  }
> 
> +bool dma_is_direct(struct device *dev)
> +{
> +       if (!dma_map_direct(dev, ops))
> +               return false;
> +
> +       if (is_swiotlb_force_bounce(dev))
> +               return false;
> +
> +       return true;
> +}
> +EXPORT_SYMBOL(dma_unmap_page_attrs);
> +
> 
> Thanks.


where is it? linux-next?
Michael S. Tsirkin Feb. 26, 2024, 11:37 a.m. UTC | #9
On Mon, Feb 26, 2024 at 03:44:12PM +0800, Xuan Zhuo wrote:
> On Mon, 26 Feb 2024 14:11:01 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > > If the xsk is enabling, the xsk tx will share the send queue.
> > > > But the xsk requires that the send queue use the premapped mode.
> > > > So the send queue must support premapped mode.
> > > >
> > > > cmd:
> > > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > > CPU:
> > > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > > >
> > > > Machine:
> > > >     ecs.g7.2xlarge(Aliyun)
> > > >
> > > > before:              1600010.00
> > > > after(no-premapped): 1599966.00
> > > > after(premapped):    1600014.00
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > ---
> > > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > > >  	u16 need_sync;
> > > >  };
> > > >
> > > > +
> >
> > [...]
> >
> > > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > > +						int nents, void *data)
> > > > +{
> > > > +	struct virtnet_sq_dma *d;
> > > > +	struct scatterlist *sg;
> > > > +	int i;
> > > > +
> > > > +	if (!sq->dmainfo.free)
> > > > +		return NULL;
> > > > +
> > > > +	d = sq->dmainfo.free;
> > > > +	sq->dmainfo.free = d->next;
> > > > +
> > > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > > +			goto err;
> > > > +
> > > > +		d->addr[i] = sg->dma_address;
> > > > +		d->len[i] = sg->length;
> > > > +	}
> > > > +
> > > > +	d->data = data;
> > > > +	d->num = i;
> > > > +	return d;
> > > > +
> > > > +err:
> > > > +	d->num = i;
> > > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > > +	return NULL;
> > > > +}
> > >
> > >
> > > Do I see a reimplementation of linux/llist.h here?
> >
> > YES. This can be done by the APIs of linux/lllist.h.
> >
> > But now, there is not __llist_del_first() (That will be used by
> > virtnet_sq_map_sg()).
> > And that is simple and just two places may use the APIs, so I implement it
> > directly.
> >
> > >
> > >
> > > > +
> > > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	if (sq->vq->premapped) {
> > > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > > +		if (!data)
> > > > +			return -ENOMEM;
> > > > +	}
> > > > +
> > > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > > +	if (ret && sq->vq->premapped)
> > > > +		virtnet_sq_unmap(sq, &data);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> > >
> > > Mate? The popular south african drink?
> >
> > Sorry, should be meta, I mean metadata.
> >
> > >
> > > > +{
> > > > +	struct virtnet_sq_dma *d;
> > > > +	int num, i;
> > > > +
> > > > +	num = virtqueue_get_vring_size(sq->vq);
> > > > +
> > > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > > +	if (!sq->dmainfo.free)
> > > > +		return -ENOMEM;
> > >
> > >
> > > This could be quite a bit of memory for a large queue.  And for a bunch
> > > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > > useful at all.
> >
> > Then can we skip the unmap api, so pass a zero to the unmap api?
> 
> Typo.
> 
> Then can we skip the unmap api, or pass a zero(because the dma address is not
> recorded) to the unmap api?
> 
> Thanks


That's the idea.

> 
> 
> >
> > > And also, this does nothing useful if PLATFORM_ACCESS is off
> > > which is super common.
> >
> > That is ok. That just work when PLATFORM_ACCESS is on.
> >
> > Thanks.
> >
> > >
> > > A while ago I proposed:
> > > - extend DMA APIs so one can query whether unmap is a nop
> > >   and whether sync is a nop
> > > - virtio wrapper taking into account PLATFORM_ACCESS too
> > >
> > > then we can save all this work and memory when not needed.
> > >
> > >
> > >
> > > > +
> > > > +	sq->dmainfo.p = sq->dmainfo.free;
> > > > +
> > > > +	for (i = 0; i < num; ++i) {
> > > > +		d = &sq->dmainfo.free[i];
> > > > +		d->next = d + 1;
> > > > +	}
> > > > +
> > > > +	d->next = NULL;
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > > >  			    struct virtnet_sq_free_stats *stats)
> > > >  {
> > > > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > > >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > > >  		++stats->packets;
> > > >
> > > > +		if (sq->vq->premapped)
> > > > +			virtnet_sq_unmap(sq, &ptr);
> > > > +
> > > >  		if (!is_xdp_frame(ptr)) {
> > > >  			struct sk_buff *skb = ptr;
> > > >
> > > > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > > >  			    skb_frag_size(frag), skb_frag_off(frag));
> > > >  	}
> > > >
> > > > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > > > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > > > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> > > >  	if (unlikely(err))
> > > >  		return -ENOSPC; /* Caller handle free/refcnt */
> > > >
> > > > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> > > >  			return num_sg;
> > > >  		num_sg++;
> > > >  	}
> > > > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > > > +	return virtnet_add_outbuf(sq, num_sg, skb);
> > > >  }
> > > >
> > > >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > >  		__netif_napi_del(&vi->rq[i].napi);
> > > >  		__netif_napi_del(&vi->sq[i].napi);
> > > > +
> > > > +		kfree(vi->sq[i].dmainfo.p);
> > > >  	}
> > > >
> > > >  	/* We called __netif_napi_del(),
> > > > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> > > >
> > > >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > >  {
> > > > +	struct virtnet_info *vi = vq->vdev->priv;
> > > > +	struct send_queue *sq;
> > > > +	int i = vq2rxq(vq);
> > > > +
> > > > +	sq = &vi->sq[i];
> > > > +
> > > > +	if (sq->vq->premapped)
> > > > +		virtnet_sq_unmap(sq, &buf);
> > > > +
> > > >  	if (!is_xdp_frame(buf))
> > > >  		dev_kfree_skb(buf);
> > > >  	else
> > > > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > >  		if (ctx)
> > > >  			ctx[rxq2vq(i)] = true;
> > > >
> > > > -		if (premapped)
> > > > +		if (premapped) {
> > > >  			premapped[rxq2vq(i)] = true;
> > > > +			premapped[txq2vq(i)] = true;
> > > > +		}
> > > >  	}
> > > >
> > > >  	cfg.nvqs      = total_vqs;
> > > > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> > > >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > > > +
> > > > +		if (vi->sq[i].vq->premapped)
> > > > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> > > >  	}
> > > >
> > > >  	/* run here: ret == 0. */
> > > > --
> > > > 2.32.0.3.g01195cf9f
> > >
> >
Xuan Zhuo Feb. 26, 2024, 11:39 a.m. UTC | #10
On Mon, 26 Feb 2024 06:36:53 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Feb 26, 2024 at 07:33:29PM +0800, Xuan Zhuo wrote:
> > > what is dma_map_direct? can't find it in the tree.
> >
> > YES.
> >
> >
> > diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> > index 58db8fd70471..5a8f7a927aa1 100644
> > --- a/kernel/dma/mapping.c
> > +++ b/kernel/dma/mapping.c
> > @@ -144,6 +144,18 @@ static inline bool dma_map_direct(struct device *dev,
> >         return dma_go_direct(dev, *dev->dma_mask, ops);
> >  }
> >
> > +bool dma_is_direct(struct device *dev)
> > +{
> > +       if (!dma_map_direct(dev, ops))
> > +               return false;
> > +
> > +       if (is_swiotlb_force_bounce(dev))
> > +               return false;
> > +
> > +       return true;
> > +}
> > +EXPORT_SYMBOL(dma_unmap_page_attrs);
> > +
> >
> > Thanks.
>
>
> where is it? linux-next?


I see it in the vhost branch kernel/dma/mapping.c.

Maybe you miss it.

Thanks.


>
> --
> MST
>
Michael S. Tsirkin Feb. 26, 2024, 11:39 a.m. UTC | #11
On Mon, Feb 26, 2024 at 05:24:11PM +0800, Xuan Zhuo wrote:
> On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > If the xsk is enabling, the xsk tx will share the send queue.
> > > But the xsk requires that the send queue use the premapped mode.
> > > So the send queue must support premapped mode.
> > >
> > > cmd:
> > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > CPU:
> > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > >
> > > Machine:
> > >     ecs.g7.2xlarge(Aliyun)
> > >
> > > before:              1600010.00
> > > after(no-premapped): 1599966.00
> > > after(premapped):    1600014.00
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > >  	u16 need_sync;
> > >  };
> > >
> > > +struct virtnet_sq_dma {
> > > +	union {
> > > +		struct virtnet_sq_dma *next;
> > > +		void *data;
> > > +	};
> > > +
> > > +	u32 num;
> > > +
> > > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > > +	u32 len[MAX_SKB_FRAGS + 2];
> > > +};
> > > +
> > > +struct virtnet_sq_dma_head {
> > > +	/* record for kfree */
> > > +	void *p;
> > > +
> > > +	struct virtnet_sq_dma *free;
> > > +};
> > > +
> > >  /* Internal representation of a send virtqueue */
> > >  struct send_queue {
> > >  	/* Virtqueue associated with this send _queue */
> > > @@ -165,6 +184,8 @@ struct send_queue {
> > >
> > >  	/* Record whether sq is in reset state. */
> > >  	bool reset;
> > > +
> > > +	struct virtnet_sq_dma_head dmainfo;
> > >  };
> > >
> > >  /* Internal representation of a receive virtqueue */
> > > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> > >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > >  }
> > >
> > > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > > +{
> > > +	struct virtnet_sq_dma *d;
> > > +	int i;
> > > +
> > > +	d = *data;
> > > +	*data = d->data;
> > > +
> > > +	for (i = 0; i < d->num; ++i)
> > > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > > +					       DMA_TO_DEVICE, 0);
> > > +
> > > +	d->next = sq->dmainfo.free;
> > > +	sq->dmainfo.free = d;
> > > +
> > > +	return d;
> > > +}
> > > +
> > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > +						int nents, void *data)
> > > +{
> > > +	struct virtnet_sq_dma *d;
> > > +	struct scatterlist *sg;
> > > +	int i;
> > > +
> > > +	if (!sq->dmainfo.free)
> > > +		return NULL;
> > > +
> > > +	d = sq->dmainfo.free;
> > > +	sq->dmainfo.free = d->next;
> > > +
> > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > +			goto err;
> > > +
> > > +		d->addr[i] = sg->dma_address;
> > > +		d->len[i] = sg->length;
> > > +	}
> > > +
> > > +	d->data = data;
> > > +	d->num = i;
> > > +	return d;
> > > +
> > > +err:
> > > +	d->num = i;
> > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > +	return NULL;
> > > +}
> >
> >
> > Do I see a reimplementation of linux/llist.h here?
> >
> >
> > > +
> > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > +{
> > > +	int ret;
> > > +
> > > +	if (sq->vq->premapped) {
> > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > +		if (!data)
> > > +			return -ENOMEM;
> > > +	}
> > > +
> > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > +	if (ret && sq->vq->premapped)
> > > +		virtnet_sq_unmap(sq, &data);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> >
> > Mate? The popular south african drink?
> >
> > > +{
> > > +	struct virtnet_sq_dma *d;
> > > +	int num, i;
> > > +
> > > +	num = virtqueue_get_vring_size(sq->vq);
> > > +
> > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > +	if (!sq->dmainfo.free)
> > > +		return -ENOMEM;
> >
> >
> > This could be quite a bit of memory for a large queue.  And for a bunch
> > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> > which is super common.
> >
> > A while ago I proposed:
> > - extend DMA APIs so one can query whether unmap is a nop
> 
> 
> We may have trouble for this.
> 
> dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
> 		size_t offset, size_t size, enum dma_data_direction dir,
> 		unsigned long attrs)
> {
> 	const struct dma_map_ops *ops = get_dma_ops(dev);
> 	dma_addr_t addr;
> 
> 	BUG_ON(!valid_dma_direction(dir));
> 
> 	if (WARN_ON_ONCE(!dev->dma_mask))
> 		return DMA_MAPPING_ERROR;
> 
> 	if (dma_map_direct(dev, ops) ||
> 	    arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
> 		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
> 	else
> 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
> 	kmsan_handle_dma(page, offset, size, dir);
> 	debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
> 
> 	return addr;
> }
> 
> arch_dma_map_page_direct will check the dma address.
> So we can not judge by the API in advance.
> 
> Thanks.

So if dma_map_direct is false we'll still waste some memory.
So be it.


> 
> 
> 
> >   and whether sync is a nop
> > - virtio wrapper taking into account PLATFORM_ACCESS too
> >
> > then we can save all this work and memory when not needed.
> >
> >
> >
> > > +
> > > +	sq->dmainfo.p = sq->dmainfo.free;
> > > +
> > > +	for (i = 0; i < num; ++i) {
> > > +		d = &sq->dmainfo.free[i];
> > > +		d->next = d + 1;
> > > +	}
> > > +
> > > +	d->next = NULL;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > >  			    struct virtnet_sq_free_stats *stats)
> > >  {
> > > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > >  		++stats->packets;
> > >
> > > +		if (sq->vq->premapped)
> > > +			virtnet_sq_unmap(sq, &ptr);
> > > +
> > >  		if (!is_xdp_frame(ptr)) {
> > >  			struct sk_buff *skb = ptr;
> > >
> > > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > >  			    skb_frag_size(frag), skb_frag_off(frag));
> > >  	}
> > >
> > > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> > >  	if (unlikely(err))
> > >  		return -ENOSPC; /* Caller handle free/refcnt */
> > >
> > > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> > >  			return num_sg;
> > >  		num_sg++;
> > >  	}
> > > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > > +	return virtnet_add_outbuf(sq, num_sg, skb);
> > >  }
> > >
> > >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > >  		__netif_napi_del(&vi->rq[i].napi);
> > >  		__netif_napi_del(&vi->sq[i].napi);
> > > +
> > > +		kfree(vi->sq[i].dmainfo.p);
> > >  	}
> > >
> > >  	/* We called __netif_napi_del(),
> > > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> > >
> > >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > >  {
> > > +	struct virtnet_info *vi = vq->vdev->priv;
> > > +	struct send_queue *sq;
> > > +	int i = vq2rxq(vq);
> > > +
> > > +	sq = &vi->sq[i];
> > > +
> > > +	if (sq->vq->premapped)
> > > +		virtnet_sq_unmap(sq, &buf);
> > > +
> > >  	if (!is_xdp_frame(buf))
> > >  		dev_kfree_skb(buf);
> > >  	else
> > > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > >  		if (ctx)
> > >  			ctx[rxq2vq(i)] = true;
> > >
> > > -		if (premapped)
> > > +		if (premapped) {
> > >  			premapped[rxq2vq(i)] = true;
> > > +			premapped[txq2vq(i)] = true;
> > > +		}
> > >  	}
> > >
> > >  	cfg.nvqs      = total_vqs;
> > > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> > >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > > +
> > > +		if (vi->sq[i].vq->premapped)
> > > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> > >  	}
> > >
> > >  	/* run here: ret == 0. */
> > > --
> > > 2.32.0.3.g01195cf9f
> >
Xuan Zhuo Feb. 26, 2024, 11:41 a.m. UTC | #12
On Mon, 26 Feb 2024 06:39:51 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Feb 26, 2024 at 05:24:11PM +0800, Xuan Zhuo wrote:
> > On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > > If the xsk is enabling, the xsk tx will share the send queue.
> > > > But the xsk requires that the send queue use the premapped mode.
> > > > So the send queue must support premapped mode.
> > > >
> > > > cmd:
> > > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > > CPU:
> > > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > > >
> > > > Machine:
> > > >     ecs.g7.2xlarge(Aliyun)
> > > >
> > > > before:              1600010.00
> > > > after(no-premapped): 1599966.00
> > > > after(premapped):    1600014.00
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > ---
> > > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > > >  	u16 need_sync;
> > > >  };
> > > >
> > > > +struct virtnet_sq_dma {
> > > > +	union {
> > > > +		struct virtnet_sq_dma *next;
> > > > +		void *data;
> > > > +	};
> > > > +
> > > > +	u32 num;
> > > > +
> > > > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > > > +	u32 len[MAX_SKB_FRAGS + 2];
> > > > +};
> > > > +
> > > > +struct virtnet_sq_dma_head {
> > > > +	/* record for kfree */
> > > > +	void *p;
> > > > +
> > > > +	struct virtnet_sq_dma *free;
> > > > +};
> > > > +
> > > >  /* Internal representation of a send virtqueue */
> > > >  struct send_queue {
> > > >  	/* Virtqueue associated with this send _queue */
> > > > @@ -165,6 +184,8 @@ struct send_queue {
> > > >
> > > >  	/* Record whether sq is in reset state. */
> > > >  	bool reset;
> > > > +
> > > > +	struct virtnet_sq_dma_head dmainfo;
> > > >  };
> > > >
> > > >  /* Internal representation of a receive virtqueue */
> > > > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> > > >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > > >  }
> > > >
> > > > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > > > +{
> > > > +	struct virtnet_sq_dma *d;
> > > > +	int i;
> > > > +
> > > > +	d = *data;
> > > > +	*data = d->data;
> > > > +
> > > > +	for (i = 0; i < d->num; ++i)
> > > > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > > > +					       DMA_TO_DEVICE, 0);
> > > > +
> > > > +	d->next = sq->dmainfo.free;
> > > > +	sq->dmainfo.free = d;
> > > > +
> > > > +	return d;
> > > > +}
> > > > +
> > > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > > +						int nents, void *data)
> > > > +{
> > > > +	struct virtnet_sq_dma *d;
> > > > +	struct scatterlist *sg;
> > > > +	int i;
> > > > +
> > > > +	if (!sq->dmainfo.free)
> > > > +		return NULL;
> > > > +
> > > > +	d = sq->dmainfo.free;
> > > > +	sq->dmainfo.free = d->next;
> > > > +
> > > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > > +			goto err;
> > > > +
> > > > +		d->addr[i] = sg->dma_address;
> > > > +		d->len[i] = sg->length;
> > > > +	}
> > > > +
> > > > +	d->data = data;
> > > > +	d->num = i;
> > > > +	return d;
> > > > +
> > > > +err:
> > > > +	d->num = i;
> > > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > > +	return NULL;
> > > > +}
> > >
> > >
> > > Do I see a reimplementation of linux/llist.h here?
> > >
> > >
> > > > +
> > > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	if (sq->vq->premapped) {
> > > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > > +		if (!data)
> > > > +			return -ENOMEM;
> > > > +	}
> > > > +
> > > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > > +	if (ret && sq->vq->premapped)
> > > > +		virtnet_sq_unmap(sq, &data);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> > >
> > > Mate? The popular south african drink?
> > >
> > > > +{
> > > > +	struct virtnet_sq_dma *d;
> > > > +	int num, i;
> > > > +
> > > > +	num = virtqueue_get_vring_size(sq->vq);
> > > > +
> > > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > > +	if (!sq->dmainfo.free)
> > > > +		return -ENOMEM;
> > >
> > >
> > > This could be quite a bit of memory for a large queue.  And for a bunch
> > > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > > useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> > > which is super common.
> > >
> > > A while ago I proposed:
> > > - extend DMA APIs so one can query whether unmap is a nop
> >
> >
> > We may have trouble for this.
> >
> > dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
> > 		size_t offset, size_t size, enum dma_data_direction dir,
> > 		unsigned long attrs)
> > {
> > 	const struct dma_map_ops *ops = get_dma_ops(dev);
> > 	dma_addr_t addr;
> >
> > 	BUG_ON(!valid_dma_direction(dir));
> >
> > 	if (WARN_ON_ONCE(!dev->dma_mask))
> > 		return DMA_MAPPING_ERROR;
> >
> > 	if (dma_map_direct(dev, ops) ||
> > 	    arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
> > 		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
> > 	else
> > 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
> > 	kmsan_handle_dma(page, offset, size, dir);
> > 	debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
> >
> > 	return addr;
> > }
> >
> > arch_dma_map_page_direct will check the dma address.
> > So we can not judge by the API in advance.
> >
> > Thanks.
>
> So if dma_map_direct is false we'll still waste some memory.
> So be it.

arch_dma_map_page_direct default is marco (false), just for powerpc
it is a function. So I think we can skip it.

If the dma_map_direct is false, I think should save the dma info.

Thanks.

>
>
> >
> >
> >
> > >   and whether sync is a nop
> > > - virtio wrapper taking into account PLATFORM_ACCESS too
> > >
> > > then we can save all this work and memory when not needed.
> > >
> > >
> > >
> > > > +
> > > > +	sq->dmainfo.p = sq->dmainfo.free;
> > > > +
> > > > +	for (i = 0; i < num; ++i) {
> > > > +		d = &sq->dmainfo.free[i];
> > > > +		d->next = d + 1;
> > > > +	}
> > > > +
> > > > +	d->next = NULL;
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > >  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > > >  			    struct virtnet_sq_free_stats *stats)
> > > >  {
> > > > @@ -377,6 +487,9 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > > >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > > >  		++stats->packets;
> > > >
> > > > +		if (sq->vq->premapped)
> > > > +			virtnet_sq_unmap(sq, &ptr);
> > > > +
> > > >  		if (!is_xdp_frame(ptr)) {
> > > >  			struct sk_buff *skb = ptr;
> > > >
> > > > @@ -890,8 +1003,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> > > >  			    skb_frag_size(frag), skb_frag_off(frag));
> > > >  	}
> > > >
> > > > -	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
> > > > -				   xdp_to_ptr(xdpf), GFP_ATOMIC);
> > > > +	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
> > > >  	if (unlikely(err))
> > > >  		return -ENOSPC; /* Caller handle free/refcnt */
> > > >
> > > > @@ -2357,7 +2469,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> > > >  			return num_sg;
> > > >  		num_sg++;
> > > >  	}
> > > > -	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> > > > +	return virtnet_add_outbuf(sq, num_sg, skb);
> > > >  }
> > > >
> > > >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > > @@ -4166,6 +4278,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > >  		__netif_napi_del(&vi->rq[i].napi);
> > > >  		__netif_napi_del(&vi->sq[i].napi);
> > > > +
> > > > +		kfree(vi->sq[i].dmainfo.p);
> > > >  	}
> > > >
> > > >  	/* We called __netif_napi_del(),
> > > > @@ -4214,6 +4328,15 @@ static void free_receive_page_frags(struct virtnet_info *vi)
> > > >
> > > >  static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > >  {
> > > > +	struct virtnet_info *vi = vq->vdev->priv;
> > > > +	struct send_queue *sq;
> > > > +	int i = vq2rxq(vq);
> > > > +
> > > > +	sq = &vi->sq[i];
> > > > +
> > > > +	if (sq->vq->premapped)
> > > > +		virtnet_sq_unmap(sq, &buf);
> > > > +
> > > >  	if (!is_xdp_frame(buf))
> > > >  		dev_kfree_skb(buf);
> > > >  	else
> > > > @@ -4327,8 +4450,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > >  		if (ctx)
> > > >  			ctx[rxq2vq(i)] = true;
> > > >
> > > > -		if (premapped)
> > > > +		if (premapped) {
> > > >  			premapped[rxq2vq(i)] = true;
> > > > +			premapped[txq2vq(i)] = true;
> > > > +		}
> > > >  	}
> > > >
> > > >  	cfg.nvqs      = total_vqs;
> > > > @@ -4352,6 +4477,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > >  		vi->rq[i].vq = vqs[rxq2vq(i)];
> > > >  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > >  		vi->sq[i].vq = vqs[txq2vq(i)];
> > > > +
> > > > +		if (vi->sq[i].vq->premapped)
> > > > +			virtnet_sq_init_dma_mate(&vi->sq[i]);
> > > >  	}
> > > >
> > > >  	/* run here: ret == 0. */
> > > > --
> > > > 2.32.0.3.g01195cf9f
> > >
>
Michael S. Tsirkin Feb. 26, 2024, 11:57 a.m. UTC | #13
On Mon, Feb 26, 2024 at 07:39:09PM +0800, Xuan Zhuo wrote:
> On Mon, 26 Feb 2024 06:36:53 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Mon, Feb 26, 2024 at 07:33:29PM +0800, Xuan Zhuo wrote:
> > > > what is dma_map_direct? can't find it in the tree.
> > >
> > > YES.
> > >
> > >
> > > diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> > > index 58db8fd70471..5a8f7a927aa1 100644
> > > --- a/kernel/dma/mapping.c
> > > +++ b/kernel/dma/mapping.c
> > > @@ -144,6 +144,18 @@ static inline bool dma_map_direct(struct device *dev,
> > >         return dma_go_direct(dev, *dev->dma_mask, ops);
> > >  }
> > >
> > > +bool dma_is_direct(struct device *dev)
> > > +{
> > > +       if (!dma_map_direct(dev, ops))
> > > +               return false;
> > > +
> > > +       if (is_swiotlb_force_bounce(dev))
> > > +               return false;
> > > +
> > > +       return true;
> > > +}
> > > +EXPORT_SYMBOL(dma_unmap_page_attrs);
> > > +
> > >
> > > Thanks.
> >
> >
> > where is it? linux-next?
> 
> 
> I see it in the vhost branch kernel/dma/mapping.c.
> 
> Maybe you miss it.
> 
> Thanks.
> 

which hash?

> >
> > --
> > MST
> >
Michael S. Tsirkin Feb. 26, 2024, noon UTC | #14
On Mon, Feb 26, 2024 at 07:41:20PM +0800, Xuan Zhuo wrote:
> On Mon, 26 Feb 2024 06:39:51 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Mon, Feb 26, 2024 at 05:24:11PM +0800, Xuan Zhuo wrote:
> > > On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > > > If the xsk is enabling, the xsk tx will share the send queue.
> > > > > But the xsk requires that the send queue use the premapped mode.
> > > > > So the send queue must support premapped mode.
> > > > >
> > > > > cmd:
> > > > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > > > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > > > CPU:
> > > > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > > > >
> > > > > Machine:
> > > > >     ecs.g7.2xlarge(Aliyun)
> > > > >
> > > > > before:              1600010.00
> > > > > after(no-premapped): 1599966.00
> > > > > after(premapped):    1600014.00
> > > > >
> > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > ---
> > > > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > > > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > > > --- a/drivers/net/virtio_net.c
> > > > > +++ b/drivers/net/virtio_net.c
> > > > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > > > >  	u16 need_sync;
> > > > >  };
> > > > >
> > > > > +struct virtnet_sq_dma {
> > > > > +	union {
> > > > > +		struct virtnet_sq_dma *next;
> > > > > +		void *data;
> > > > > +	};
> > > > > +
> > > > > +	u32 num;
> > > > > +
> > > > > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > > > > +	u32 len[MAX_SKB_FRAGS + 2];
> > > > > +};
> > > > > +
> > > > > +struct virtnet_sq_dma_head {
> > > > > +	/* record for kfree */
> > > > > +	void *p;
> > > > > +
> > > > > +	struct virtnet_sq_dma *free;
> > > > > +};
> > > > > +
> > > > >  /* Internal representation of a send virtqueue */
> > > > >  struct send_queue {
> > > > >  	/* Virtqueue associated with this send _queue */
> > > > > @@ -165,6 +184,8 @@ struct send_queue {
> > > > >
> > > > >  	/* Record whether sq is in reset state. */
> > > > >  	bool reset;
> > > > > +
> > > > > +	struct virtnet_sq_dma_head dmainfo;
> > > > >  };
> > > > >
> > > > >  /* Internal representation of a receive virtqueue */
> > > > > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> > > > >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > > > >  }
> > > > >
> > > > > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > > > > +{
> > > > > +	struct virtnet_sq_dma *d;
> > > > > +	int i;
> > > > > +
> > > > > +	d = *data;
> > > > > +	*data = d->data;
> > > > > +
> > > > > +	for (i = 0; i < d->num; ++i)
> > > > > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > > > > +					       DMA_TO_DEVICE, 0);
> > > > > +
> > > > > +	d->next = sq->dmainfo.free;
> > > > > +	sq->dmainfo.free = d;
> > > > > +
> > > > > +	return d;
> > > > > +}
> > > > > +
> > > > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > > > +						int nents, void *data)
> > > > > +{
> > > > > +	struct virtnet_sq_dma *d;
> > > > > +	struct scatterlist *sg;
> > > > > +	int i;
> > > > > +
> > > > > +	if (!sq->dmainfo.free)
> > > > > +		return NULL;
> > > > > +
> > > > > +	d = sq->dmainfo.free;
> > > > > +	sq->dmainfo.free = d->next;
> > > > > +
> > > > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > > > +			goto err;
> > > > > +
> > > > > +		d->addr[i] = sg->dma_address;
> > > > > +		d->len[i] = sg->length;
> > > > > +	}
> > > > > +
> > > > > +	d->data = data;
> > > > > +	d->num = i;
> > > > > +	return d;
> > > > > +
> > > > > +err:
> > > > > +	d->num = i;
> > > > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > > > +	return NULL;
> > > > > +}
> > > >
> > > >
> > > > Do I see a reimplementation of linux/llist.h here?
> > > >
> > > >
> > > > > +
> > > > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > > > +{
> > > > > +	int ret;
> > > > > +
> > > > > +	if (sq->vq->premapped) {
> > > > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > > > +		if (!data)
> > > > > +			return -ENOMEM;
> > > > > +	}
> > > > > +
> > > > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > > > +	if (ret && sq->vq->premapped)
> > > > > +		virtnet_sq_unmap(sq, &data);
> > > > > +
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> > > >
> > > > Mate? The popular south african drink?
> > > >
> > > > > +{
> > > > > +	struct virtnet_sq_dma *d;
> > > > > +	int num, i;
> > > > > +
> > > > > +	num = virtqueue_get_vring_size(sq->vq);
> > > > > +
> > > > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > > > +	if (!sq->dmainfo.free)
> > > > > +		return -ENOMEM;
> > > >
> > > >
> > > > This could be quite a bit of memory for a large queue.  And for a bunch
> > > > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > > > useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> > > > which is super common.
> > > >
> > > > A while ago I proposed:
> > > > - extend DMA APIs so one can query whether unmap is a nop
> > >
> > >
> > > We may have trouble for this.
> > >
> > > dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
> > > 		size_t offset, size_t size, enum dma_data_direction dir,
> > > 		unsigned long attrs)
> > > {
> > > 	const struct dma_map_ops *ops = get_dma_ops(dev);
> > > 	dma_addr_t addr;
> > >
> > > 	BUG_ON(!valid_dma_direction(dir));
> > >
> > > 	if (WARN_ON_ONCE(!dev->dma_mask))
> > > 		return DMA_MAPPING_ERROR;
> > >
> > > 	if (dma_map_direct(dev, ops) ||
> > > 	    arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
> > > 		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
> > > 	else
> > > 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
> > > 	kmsan_handle_dma(page, offset, size, dir);
> > > 	debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
> > >
> > > 	return addr;
> > > }
> > >
> > > arch_dma_map_page_direct will check the dma address.
> > > So we can not judge by the API in advance.
> > >
> > > Thanks.
> >
> > So if dma_map_direct is false we'll still waste some memory.
> > So be it.
> 
> arch_dma_map_page_direct default is marco (false), just for powerpc
> it is a function. So I think we can skip it.
> 
> If the dma_map_direct is false, I think should save the dma info.
> 
> Thanks.


Would already be an improvement.
But can we have better names?

I'd prefer:

dma_can_skip_unmap
dma_can_skip_sync

Because we do not know for sure if it's direct unless
we have the page.
Michael S. Tsirkin Feb. 26, 2024, 12:01 p.m. UTC | #15
On Mon, Feb 26, 2024 at 07:00:09AM -0500, Michael S. Tsirkin wrote:
> On Mon, Feb 26, 2024 at 07:41:20PM +0800, Xuan Zhuo wrote:
> > On Mon, 26 Feb 2024 06:39:51 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Mon, Feb 26, 2024 at 05:24:11PM +0800, Xuan Zhuo wrote:
> > > > On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > > > > If the xsk is enabling, the xsk tx will share the send queue.
> > > > > > But the xsk requires that the send queue use the premapped mode.
> > > > > > So the send queue must support premapped mode.
> > > > > >
> > > > > > cmd:
> > > > > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > > > > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > > > > CPU:
> > > > > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > > > > >
> > > > > > Machine:
> > > > > >     ecs.g7.2xlarge(Aliyun)
> > > > > >
> > > > > > before:              1600010.00
> > > > > > after(no-premapped): 1599966.00
> > > > > > after(premapped):    1600014.00
> > > > > >
> > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > ---
> > > > > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > > > > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > > > > --- a/drivers/net/virtio_net.c
> > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > > > > >  	u16 need_sync;
> > > > > >  };
> > > > > >
> > > > > > +struct virtnet_sq_dma {
> > > > > > +	union {
> > > > > > +		struct virtnet_sq_dma *next;
> > > > > > +		void *data;
> > > > > > +	};
> > > > > > +
> > > > > > +	u32 num;
> > > > > > +
> > > > > > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > > > > > +	u32 len[MAX_SKB_FRAGS + 2];
> > > > > > +};
> > > > > > +
> > > > > > +struct virtnet_sq_dma_head {
> > > > > > +	/* record for kfree */
> > > > > > +	void *p;
> > > > > > +
> > > > > > +	struct virtnet_sq_dma *free;
> > > > > > +};
> > > > > > +
> > > > > >  /* Internal representation of a send virtqueue */
> > > > > >  struct send_queue {
> > > > > >  	/* Virtqueue associated with this send _queue */
> > > > > > @@ -165,6 +184,8 @@ struct send_queue {
> > > > > >
> > > > > >  	/* Record whether sq is in reset state. */
> > > > > >  	bool reset;
> > > > > > +
> > > > > > +	struct virtnet_sq_dma_head dmainfo;
> > > > > >  };
> > > > > >
> > > > > >  /* Internal representation of a receive virtqueue */
> > > > > > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> > > > > >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > > > > >  }
> > > > > >
> > > > > > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > > > > > +{
> > > > > > +	struct virtnet_sq_dma *d;
> > > > > > +	int i;
> > > > > > +
> > > > > > +	d = *data;
> > > > > > +	*data = d->data;
> > > > > > +
> > > > > > +	for (i = 0; i < d->num; ++i)
> > > > > > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > > > > > +					       DMA_TO_DEVICE, 0);
> > > > > > +
> > > > > > +	d->next = sq->dmainfo.free;
> > > > > > +	sq->dmainfo.free = d;
> > > > > > +
> > > > > > +	return d;
> > > > > > +}
> > > > > > +
> > > > > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > > > > +						int nents, void *data)
> > > > > > +{
> > > > > > +	struct virtnet_sq_dma *d;
> > > > > > +	struct scatterlist *sg;
> > > > > > +	int i;
> > > > > > +
> > > > > > +	if (!sq->dmainfo.free)
> > > > > > +		return NULL;
> > > > > > +
> > > > > > +	d = sq->dmainfo.free;
> > > > > > +	sq->dmainfo.free = d->next;
> > > > > > +
> > > > > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > > > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > > > > +			goto err;
> > > > > > +
> > > > > > +		d->addr[i] = sg->dma_address;
> > > > > > +		d->len[i] = sg->length;
> > > > > > +	}
> > > > > > +
> > > > > > +	d->data = data;
> > > > > > +	d->num = i;
> > > > > > +	return d;
> > > > > > +
> > > > > > +err:
> > > > > > +	d->num = i;
> > > > > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > > > > +	return NULL;
> > > > > > +}
> > > > >
> > > > >
> > > > > Do I see a reimplementation of linux/llist.h here?
> > > > >
> > > > >
> > > > > > +
> > > > > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > > > > +{
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	if (sq->vq->premapped) {
> > > > > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > > > > +		if (!data)
> > > > > > +			return -ENOMEM;
> > > > > > +	}
> > > > > > +
> > > > > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > > > > +	if (ret && sq->vq->premapped)
> > > > > > +		virtnet_sq_unmap(sq, &data);
> > > > > > +
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> > > > >
> > > > > Mate? The popular south african drink?
> > > > >
> > > > > > +{
> > > > > > +	struct virtnet_sq_dma *d;
> > > > > > +	int num, i;
> > > > > > +
> > > > > > +	num = virtqueue_get_vring_size(sq->vq);
> > > > > > +
> > > > > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > > > > +	if (!sq->dmainfo.free)
> > > > > > +		return -ENOMEM;
> > > > >
> > > > >
> > > > > This could be quite a bit of memory for a large queue.  And for a bunch
> > > > > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > > > > useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> > > > > which is super common.
> > > > >
> > > > > A while ago I proposed:
> > > > > - extend DMA APIs so one can query whether unmap is a nop
> > > >
> > > >
> > > > We may have trouble for this.
> > > >
> > > > dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
> > > > 		size_t offset, size_t size, enum dma_data_direction dir,
> > > > 		unsigned long attrs)
> > > > {
> > > > 	const struct dma_map_ops *ops = get_dma_ops(dev);
> > > > 	dma_addr_t addr;
> > > >
> > > > 	BUG_ON(!valid_dma_direction(dir));
> > > >
> > > > 	if (WARN_ON_ONCE(!dev->dma_mask))
> > > > 		return DMA_MAPPING_ERROR;
> > > >
> > > > 	if (dma_map_direct(dev, ops) ||
> > > > 	    arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
> > > > 		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
> > > > 	else
> > > > 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
> > > > 	kmsan_handle_dma(page, offset, size, dir);
> > > > 	debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
> > > >
> > > > 	return addr;
> > > > }
> > > >
> > > > arch_dma_map_page_direct will check the dma address.
> > > > So we can not judge by the API in advance.
> > > >
> > > > Thanks.
> > >
> > > So if dma_map_direct is false we'll still waste some memory.
> > > So be it.
> > 
> > arch_dma_map_page_direct default is marco (false), just for powerpc
> > it is a function. So I think we can skip it.
> > 
> > If the dma_map_direct is false, I think should save the dma info.
> > 
> > Thanks.
> 
> 
> Would already be an improvement.
> But can we have better names?
> 
> I'd prefer:
> 
> dma_can_skip_unmap
> dma_can_skip_sync
> 
> Because we do not know for sure if it's direct unless
> we have the page.

we might need to add these callbacks to dma ops too, I think
with iommu pt that is the case and it is pretty common,
right?

> -- 
> MST
Xuan Zhuo Feb. 26, 2024, 12:06 p.m. UTC | #16
On Mon, 26 Feb 2024 06:57:17 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Feb 26, 2024 at 07:39:09PM +0800, Xuan Zhuo wrote:
> > On Mon, 26 Feb 2024 06:36:53 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Mon, Feb 26, 2024 at 07:33:29PM +0800, Xuan Zhuo wrote:
> > > > > what is dma_map_direct? can't find it in the tree.
> > > >
> > > > YES.
> > > >
> > > >
> > > > diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> > > > index 58db8fd70471..5a8f7a927aa1 100644
> > > > --- a/kernel/dma/mapping.c
> > > > +++ b/kernel/dma/mapping.c
> > > > @@ -144,6 +144,18 @@ static inline bool dma_map_direct(struct device *dev,
> > > >         return dma_go_direct(dev, *dev->dma_mask, ops);
> > > >  }
> > > >
> > > > +bool dma_is_direct(struct device *dev)
> > > > +{
> > > > +       if (!dma_map_direct(dev, ops))
> > > > +               return false;
> > > > +
> > > > +       if (is_swiotlb_force_bounce(dev))
> > > > +               return false;
> > > > +
> > > > +       return true;
> > > > +}
> > > > +EXPORT_SYMBOL(dma_unmap_page_attrs);
> > > > +
> > > >
> > > > Thanks.
> > >
> > >
> > > where is it? linux-next?
> >
> >
> > I see it in the vhost branch kernel/dma/mapping.c.
> >
> > Maybe you miss it.
> >
> > Thanks.
> >
>
> which hash?

fd0b29af02bb75acc94eb08c06e2c10cbce2ea67

>
> > >
> > > --
> > > MST
> > >
>
Michael S. Tsirkin Feb. 26, 2024, 12:12 p.m. UTC | #17
On Mon, Feb 26, 2024 at 08:06:23PM +0800, Xuan Zhuo wrote:
> On Mon, 26 Feb 2024 06:57:17 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Mon, Feb 26, 2024 at 07:39:09PM +0800, Xuan Zhuo wrote:
> > > On Mon, 26 Feb 2024 06:36:53 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Mon, Feb 26, 2024 at 07:33:29PM +0800, Xuan Zhuo wrote:
> > > > > > what is dma_map_direct? can't find it in the tree.
> > > > >
> > > > > YES.
> > > > >
> > > > >
> > > > > diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> > > > > index 58db8fd70471..5a8f7a927aa1 100644
> > > > > --- a/kernel/dma/mapping.c
> > > > > +++ b/kernel/dma/mapping.c
> > > > > @@ -144,6 +144,18 @@ static inline bool dma_map_direct(struct device *dev,
> > > > >         return dma_go_direct(dev, *dev->dma_mask, ops);
> > > > >  }
> > > > >
> > > > > +bool dma_is_direct(struct device *dev)
> > > > > +{
> > > > > +       if (!dma_map_direct(dev, ops))
> > > > > +               return false;
> > > > > +
> > > > > +       if (is_swiotlb_force_bounce(dev))
> > > > > +               return false;
> > > > > +
> > > > > +       return true;
> > > > > +}
> > > > > +EXPORT_SYMBOL(dma_unmap_page_attrs);
> > > > > +
> > > > >
> > > > > Thanks.
> > > >
> > > >
> > > > where is it? linux-next?
> > >
> > >
> > > I see it in the vhost branch kernel/dma/mapping.c.
> > >
> > > Maybe you miss it.
> > >
> > > Thanks.
> > >
> >
> > which hash?
> 
> fd0b29af02bb75acc94eb08c06e2c10cbce2ea67


Thanks. Now I see it donnu what was wrong with me.


> >
> > > >
> > > > --
> > > > MST
> > > >
> >
Xuan Zhuo Feb. 27, 2024, 1:32 a.m. UTC | #18
On Mon, 26 Feb 2024 07:01:11 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Feb 26, 2024 at 07:00:09AM -0500, Michael S. Tsirkin wrote:
> > On Mon, Feb 26, 2024 at 07:41:20PM +0800, Xuan Zhuo wrote:
> > > On Mon, 26 Feb 2024 06:39:51 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Mon, Feb 26, 2024 at 05:24:11PM +0800, Xuan Zhuo wrote:
> > > > > On Sun, 25 Feb 2024 03:38:48 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > On Fri, Feb 23, 2024 at 04:27:26PM +0800, Xuan Zhuo wrote:
> > > > > > > If the xsk is enabling, the xsk tx will share the send queue.
> > > > > > > But the xsk requires that the send queue use the premapped mode.
> > > > > > > So the send queue must support premapped mode.
> > > > > > >
> > > > > > > cmd:
> > > > > > >     sh samples/pktgen/pktgen_sample01_simple.sh -i eth0 \
> > > > > > >         -s 16 -d 10.0.0.128 -m 00:16:3e:2c:c8:2e -n 0 -p 100
> > > > > > > CPU:
> > > > > > >     Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz
> > > > > > >
> > > > > > > Machine:
> > > > > > >     ecs.g7.2xlarge(Aliyun)
> > > > > > >
> > > > > > > before:              1600010.00
> > > > > > > after(no-premapped): 1599966.00
> > > > > > > after(premapped):    1600014.00
> > > > > > >
> > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > ---
> > > > > > >  drivers/net/virtio_net.c | 136 +++++++++++++++++++++++++++++++++++++--
> > > > > > >  1 file changed, 132 insertions(+), 4 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > index 7715bb7032ec..b83ef6afc4fb 100644
> > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > @@ -146,6 +146,25 @@ struct virtnet_rq_dma {
> > > > > > >  	u16 need_sync;
> > > > > > >  };
> > > > > > >
> > > > > > > +struct virtnet_sq_dma {
> > > > > > > +	union {
> > > > > > > +		struct virtnet_sq_dma *next;
> > > > > > > +		void *data;
> > > > > > > +	};
> > > > > > > +
> > > > > > > +	u32 num;
> > > > > > > +
> > > > > > > +	dma_addr_t addr[MAX_SKB_FRAGS + 2];
> > > > > > > +	u32 len[MAX_SKB_FRAGS + 2];
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct virtnet_sq_dma_head {
> > > > > > > +	/* record for kfree */
> > > > > > > +	void *p;
> > > > > > > +
> > > > > > > +	struct virtnet_sq_dma *free;
> > > > > > > +};
> > > > > > > +
> > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > >  struct send_queue {
> > > > > > >  	/* Virtqueue associated with this send _queue */
> > > > > > > @@ -165,6 +184,8 @@ struct send_queue {
> > > > > > >
> > > > > > >  	/* Record whether sq is in reset state. */
> > > > > > >  	bool reset;
> > > > > > > +
> > > > > > > +	struct virtnet_sq_dma_head dmainfo;
> > > > > > >  };
> > > > > > >
> > > > > > >  /* Internal representation of a receive virtqueue */
> > > > > > > @@ -368,6 +389,95 @@ static struct xdp_frame *ptr_to_xdp(void *ptr)
> > > > > > >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > > > > > >  }
> > > > > > >
> > > > > > > +static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
> > > > > > > +{
> > > > > > > +	struct virtnet_sq_dma *d;
> > > > > > > +	int i;
> > > > > > > +
> > > > > > > +	d = *data;
> > > > > > > +	*data = d->data;
> > > > > > > +
> > > > > > > +	for (i = 0; i < d->num; ++i)
> > > > > > > +		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
> > > > > > > +					       DMA_TO_DEVICE, 0);
> > > > > > > +
> > > > > > > +	d->next = sq->dmainfo.free;
> > > > > > > +	sq->dmainfo.free = d;
> > > > > > > +
> > > > > > > +	return d;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
> > > > > > > +						int nents, void *data)
> > > > > > > +{
> > > > > > > +	struct virtnet_sq_dma *d;
> > > > > > > +	struct scatterlist *sg;
> > > > > > > +	int i;
> > > > > > > +
> > > > > > > +	if (!sq->dmainfo.free)
> > > > > > > +		return NULL;
> > > > > > > +
> > > > > > > +	d = sq->dmainfo.free;
> > > > > > > +	sq->dmainfo.free = d->next;
> > > > > > > +
> > > > > > > +	for_each_sg(sq->sg, sg, nents, i) {
> > > > > > > +		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
> > > > > > > +			goto err;
> > > > > > > +
> > > > > > > +		d->addr[i] = sg->dma_address;
> > > > > > > +		d->len[i] = sg->length;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	d->data = data;
> > > > > > > +	d->num = i;
> > > > > > > +	return d;
> > > > > > > +
> > > > > > > +err:
> > > > > > > +	d->num = i;
> > > > > > > +	virtnet_sq_unmap(sq, (void **)&d);
> > > > > > > +	return NULL;
> > > > > > > +}
> > > > > >
> > > > > >
> > > > > > Do I see a reimplementation of linux/llist.h here?
> > > > > >
> > > > > >
> > > > > > > +
> > > > > > > +static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
> > > > > > > +{
> > > > > > > +	int ret;
> > > > > > > +
> > > > > > > +	if (sq->vq->premapped) {
> > > > > > > +		data = virtnet_sq_map_sg(sq, num, data);
> > > > > > > +		if (!data)
> > > > > > > +			return -ENOMEM;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
> > > > > > > +	if (ret && sq->vq->premapped)
> > > > > > > +		virtnet_sq_unmap(sq, &data);
> > > > > > > +
> > > > > > > +	return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int virtnet_sq_init_dma_mate(struct send_queue *sq)
> > > > > >
> > > > > > Mate? The popular south african drink?
> > > > > >
> > > > > > > +{
> > > > > > > +	struct virtnet_sq_dma *d;
> > > > > > > +	int num, i;
> > > > > > > +
> > > > > > > +	num = virtqueue_get_vring_size(sq->vq);
> > > > > > > +
> > > > > > > +	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
> > > > > > > +	if (!sq->dmainfo.free)
> > > > > > > +		return -ENOMEM;
> > > > > >
> > > > > >
> > > > > > This could be quite a bit of memory for a large queue.  And for a bunch
> > > > > > of common cases where unmap is a nop (e.g. iommu pt) this does nothing
> > > > > > useful at all.  And also, this does nothing useful if PLATFORM_ACCESS is off
> > > > > > which is super common.
> > > > > >
> > > > > > A while ago I proposed:
> > > > > > - extend DMA APIs so one can query whether unmap is a nop
> > > > >
> > > > >
> > > > > We may have trouble for this.
> > > > >
> > > > > dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
> > > > > 		size_t offset, size_t size, enum dma_data_direction dir,
> > > > > 		unsigned long attrs)
> > > > > {
> > > > > 	const struct dma_map_ops *ops = get_dma_ops(dev);
> > > > > 	dma_addr_t addr;
> > > > >
> > > > > 	BUG_ON(!valid_dma_direction(dir));
> > > > >
> > > > > 	if (WARN_ON_ONCE(!dev->dma_mask))
> > > > > 		return DMA_MAPPING_ERROR;
> > > > >
> > > > > 	if (dma_map_direct(dev, ops) ||
> > > > > 	    arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
> > > > > 		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
> > > > > 	else
> > > > > 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
> > > > > 	kmsan_handle_dma(page, offset, size, dir);
> > > > > 	debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
> > > > >
> > > > > 	return addr;
> > > > > }
> > > > >
> > > > > arch_dma_map_page_direct will check the dma address.
> > > > > So we can not judge by the API in advance.
> > > > >
> > > > > Thanks.
> > > >
> > > > So if dma_map_direct is false we'll still waste some memory.
> > > > So be it.
> > >
> > > arch_dma_map_page_direct default is marco (false), just for powerpc
> > > it is a function. So I think we can skip it.
> > >
> > > If the dma_map_direct is false, I think should save the dma info.
> > >
> > > Thanks.
> >
> >
> > Would already be an improvement.
> > But can we have better names?
> >
> > I'd prefer:
> >
> > dma_can_skip_unmap
> > dma_can_skip_sync
> >
> > Because we do not know for sure if it's direct unless
> > we have the page.
>
> we might need to add these callbacks to dma ops too, I think
> with iommu pt that is the case and it is pretty common,
> right?


I agree.

But this patch set has 19 commits, I hope to do this on a new patchset on the
top of this.

If we need to check the iommu pt, I need to study the iommu first.

Thanks.


>
> > --
> > MST
>
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7715bb7032ec..b83ef6afc4fb 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -146,6 +146,25 @@  struct virtnet_rq_dma {
 	u16 need_sync;
 };
 
+struct virtnet_sq_dma {
+	union {
+		struct virtnet_sq_dma *next;
+		void *data;
+	};
+
+	u32 num;
+
+	dma_addr_t addr[MAX_SKB_FRAGS + 2];
+	u32 len[MAX_SKB_FRAGS + 2];
+};
+
+struct virtnet_sq_dma_head {
+	/* record for kfree */
+	void *p;
+
+	struct virtnet_sq_dma *free;
+};
+
 /* Internal representation of a send virtqueue */
 struct send_queue {
 	/* Virtqueue associated with this send _queue */
@@ -165,6 +184,8 @@  struct send_queue {
 
 	/* Record whether sq is in reset state. */
 	bool reset;
+
+	struct virtnet_sq_dma_head dmainfo;
 };
 
 /* Internal representation of a receive virtqueue */
@@ -368,6 +389,95 @@  static struct xdp_frame *ptr_to_xdp(void *ptr)
 	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
 }
 
+static struct virtnet_sq_dma *virtnet_sq_unmap(struct send_queue *sq, void **data)
+{
+	struct virtnet_sq_dma *d;
+	int i;
+
+	d = *data;
+	*data = d->data;
+
+	for (i = 0; i < d->num; ++i)
+		virtqueue_dma_unmap_page_attrs(sq->vq, d->addr[i], d->len[i],
+					       DMA_TO_DEVICE, 0);
+
+	d->next = sq->dmainfo.free;
+	sq->dmainfo.free = d;
+
+	return d;
+}
+
+static struct virtnet_sq_dma *virtnet_sq_map_sg(struct send_queue *sq,
+						int nents, void *data)
+{
+	struct virtnet_sq_dma *d;
+	struct scatterlist *sg;
+	int i;
+
+	if (!sq->dmainfo.free)
+		return NULL;
+
+	d = sq->dmainfo.free;
+	sq->dmainfo.free = d->next;
+
+	for_each_sg(sq->sg, sg, nents, i) {
+		if (virtqueue_dma_map_sg_attrs(sq->vq, sg, DMA_TO_DEVICE, 0))
+			goto err;
+
+		d->addr[i] = sg->dma_address;
+		d->len[i] = sg->length;
+	}
+
+	d->data = data;
+	d->num = i;
+	return d;
+
+err:
+	d->num = i;
+	virtnet_sq_unmap(sq, (void **)&d);
+	return NULL;
+}
+
+static int virtnet_add_outbuf(struct send_queue *sq, u32 num, void *data)
+{
+	int ret;
+
+	if (sq->vq->premapped) {
+		data = virtnet_sq_map_sg(sq, num, data);
+		if (!data)
+			return -ENOMEM;
+	}
+
+	ret = virtqueue_add_outbuf(sq->vq, sq->sg, num, data, GFP_ATOMIC);
+	if (ret && sq->vq->premapped)
+		virtnet_sq_unmap(sq, &data);
+
+	return ret;
+}
+
+static int virtnet_sq_init_dma_mate(struct send_queue *sq)
+{
+	struct virtnet_sq_dma *d;
+	int num, i;
+
+	num = virtqueue_get_vring_size(sq->vq);
+
+	sq->dmainfo.free = kcalloc(num, sizeof(*sq->dmainfo.free), GFP_KERNEL);
+	if (!sq->dmainfo.free)
+		return -ENOMEM;
+
+	sq->dmainfo.p = sq->dmainfo.free;
+
+	for (i = 0; i < num; ++i) {
+		d = &sq->dmainfo.free[i];
+		d->next = d + 1;
+	}
+
+	d->next = NULL;
+
+	return 0;
+}
+
 static void __free_old_xmit(struct send_queue *sq, bool in_napi,
 			    struct virtnet_sq_free_stats *stats)
 {
@@ -377,6 +487,9 @@  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
 	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 		++stats->packets;
 
+		if (sq->vq->premapped)
+			virtnet_sq_unmap(sq, &ptr);
+
 		if (!is_xdp_frame(ptr)) {
 			struct sk_buff *skb = ptr;
 
@@ -890,8 +1003,7 @@  static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
 			    skb_frag_size(frag), skb_frag_off(frag));
 	}
 
-	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
-				   xdp_to_ptr(xdpf), GFP_ATOMIC);
+	err = virtnet_add_outbuf(sq, nr_frags + 1, xdp_to_ptr(xdpf));
 	if (unlikely(err))
 		return -ENOSPC; /* Caller handle free/refcnt */
 
@@ -2357,7 +2469,7 @@  static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 			return num_sg;
 		num_sg++;
 	}
-	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
+	return virtnet_add_outbuf(sq, num_sg, skb);
 }
 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -4166,6 +4278,8 @@  static void virtnet_free_queues(struct virtnet_info *vi)
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		__netif_napi_del(&vi->rq[i].napi);
 		__netif_napi_del(&vi->sq[i].napi);
+
+		kfree(vi->sq[i].dmainfo.p);
 	}
 
 	/* We called __netif_napi_del(),
@@ -4214,6 +4328,15 @@  static void free_receive_page_frags(struct virtnet_info *vi)
 
 static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
 {
+	struct virtnet_info *vi = vq->vdev->priv;
+	struct send_queue *sq;
+	int i = vq2rxq(vq);
+
+	sq = &vi->sq[i];
+
+	if (sq->vq->premapped)
+		virtnet_sq_unmap(sq, &buf);
+
 	if (!is_xdp_frame(buf))
 		dev_kfree_skb(buf);
 	else
@@ -4327,8 +4450,10 @@  static int virtnet_find_vqs(struct virtnet_info *vi)
 		if (ctx)
 			ctx[rxq2vq(i)] = true;
 
-		if (premapped)
+		if (premapped) {
 			premapped[rxq2vq(i)] = true;
+			premapped[txq2vq(i)] = true;
+		}
 	}
 
 	cfg.nvqs      = total_vqs;
@@ -4352,6 +4477,9 @@  static int virtnet_find_vqs(struct virtnet_info *vi)
 		vi->rq[i].vq = vqs[rxq2vq(i)];
 		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
 		vi->sq[i].vq = vqs[txq2vq(i)];
+
+		if (vi->sq[i].vq->premapped)
+			virtnet_sq_init_dma_mate(&vi->sq[i]);
 	}
 
 	/* run here: ret == 0. */