diff mbox series

[bpf-next,v4,08/14] xsk: enable sharing of dma mappings

Message ID 1595307848-20719-9-git-send-email-magnus.karlsson@intel.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series xsk: support shared umems between devices and queues | expand

Commit Message

Magnus Karlsson July 21, 2020, 5:04 a.m. UTC
Enable the sharing of dma mappings by moving them out from the buffer
pool. Instead we put each dma mapped umem region in a list in the umem
structure. If dma has already been mapped for this umem and device, it
is not mapped again and the existing dma mappings are reused.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 include/net/xdp_sock.h      |   1 +
 include/net/xsk_buff_pool.h |   7 +++
 net/xdp/xdp_umem.c          |   1 +
 net/xdp/xsk_buff_pool.c     | 112 ++++++++++++++++++++++++++++++++++++--------
 4 files changed, 102 insertions(+), 19 deletions(-)

Comments

Björn Töpel July 28, 2020, 7:14 a.m. UTC | #1
On 2020-07-21 07:04, Magnus Karlsson wrote:
> Enable the sharing of dma mappings by moving them out from the buffer
> pool. Instead we put each dma mapped umem region in a list in the umem
> structure. If dma has already been mapped for this umem and device, it
> is not mapped again and the existing dma mappings are reused.
> 
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>

Acked-by: Björn Töpel <bjorn.topel@intel.com>


> ---
>   include/net/xdp_sock.h      |   1 +
>   include/net/xsk_buff_pool.h |   7 +++
>   net/xdp/xdp_umem.c          |   1 +
>   net/xdp/xsk_buff_pool.c     | 112 ++++++++++++++++++++++++++++++++++++--------
>   4 files changed, 102 insertions(+), 19 deletions(-)
> 
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 126d243..282aeba 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -30,6 +30,7 @@ struct xdp_umem {
>   	u8 flags;
>   	int id;
>   	bool zc;
> +	struct list_head xsk_dma_list;
>   };
>   
>   struct xsk_map {
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index 83f100c..8f1dc4c 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -28,6 +28,13 @@ struct xdp_buff_xsk {
>   	struct list_head free_list_node;
>   };
>   
> +struct xsk_dma_map {
> +	dma_addr_t *dma_pages;
> +	struct net_device *dev;
> +	refcount_t users;
> +	struct list_head list; /* Protected by the RTNL_LOCK */
> +};
> +
>   struct xsk_buff_pool {
>   	struct xsk_queue *fq;
>   	struct xsk_queue *cq;
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 372998d..cf27249 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -199,6 +199,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>   	umem->user = NULL;
>   	umem->flags = mr->flags;
>   
> +	INIT_LIST_HEAD(&umem->xsk_dma_list);
>   	refcount_set(&umem->users, 1);
>   
>   	err = xdp_umem_account_pages(umem);
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index c563874..ca74a3e 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -104,6 +104,25 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
>   }
>   EXPORT_SYMBOL(xp_set_rxq_info);
>   
> +static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
> +{
> +	struct netdev_bpf bpf;
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (pool->umem->zc) {
> +		bpf.command = XDP_SETUP_XSK_POOL;
> +		bpf.xsk.pool = NULL;
> +		bpf.xsk.queue_id = pool->queue_id;
> +
> +		err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
> +
> +		if (err)
> +			WARN(1, "Failed to disable zero-copy!\n");
> +	}
> +}
> +
>   int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
>   		  u16 queue_id, u16 flags)
>   {
> @@ -122,6 +141,8 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
>   	if (xsk_get_pool_from_qid(netdev, queue_id))
>   		return -EBUSY;
>   
> +	pool->netdev = netdev;
> +	pool->queue_id = queue_id;
>   	err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
>   	if (err)
>   		return err;
> @@ -155,11 +176,15 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
>   	if (err)
>   		goto err_unreg_pool;
>   
> -	pool->netdev = netdev;
> -	pool->queue_id = queue_id;
> +	if (!pool->dma_pages) {
> +		WARN(1, "Driver did not DMA map zero-copy buffers");
> +		goto err_unreg_xsk;
> +	}
>   	pool->umem->zc = true;
>   	return 0;
>   
> +err_unreg_xsk:
> +	xp_disable_drv_zc(pool);
>   err_unreg_pool:
>   	if (!force_zc)
>   		err = 0; /* fallback to copy mode */
> @@ -170,25 +195,10 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
>   
>   void xp_clear_dev(struct xsk_buff_pool *pool)
>   {
> -	struct netdev_bpf bpf;
> -	int err;
> -
> -	ASSERT_RTNL();
> -
>   	if (!pool->netdev)
>   		return;
>   
> -	if (pool->umem->zc) {
> -		bpf.command = XDP_SETUP_XSK_POOL;
> -		bpf.xsk.pool = NULL;
> -		bpf.xsk.queue_id = pool->queue_id;
> -
> -		err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
> -
> -		if (err)
> -			WARN(1, "Failed to disable zero-copy!\n");
> -	}
> -
> +	xp_disable_drv_zc(pool);
>   	xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
>   	dev_put(pool->netdev);
>   	pool->netdev = NULL;
> @@ -233,14 +243,61 @@ void xp_put_pool(struct xsk_buff_pool *pool)
>   	}
>   }
>   
> +static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
> +{
> +	struct xsk_dma_map *dma_map;
> +
> +	list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) {
> +		if (dma_map->dev == pool->netdev)
> +			return dma_map;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
> +{
> +	list_del(&dma_map->list);
> +	kfree(dma_map);
> +}
> +
> +static void xp_put_dma_map(struct xsk_dma_map *dma_map)
> +{
> +	if (!refcount_dec_and_test(&dma_map->users))
> +		return;
> +
> +	xp_destroy_dma_map(dma_map);
> +}
> +
> +static struct xsk_dma_map *xp_create_dma_map(struct xsk_buff_pool *pool)
> +{
> +	struct xsk_dma_map *dma_map;
> +
> +	dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
> +	if (!dma_map)
> +		return NULL;
> +
> +	dma_map->dev = pool->netdev;
> +	refcount_set(&dma_map->users, 1);
> +	list_add(&dma_map->list, &pool->umem->xsk_dma_list);
> +	return dma_map;
> +}
> +
>   void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
>   {
> +	struct xsk_dma_map *dma_map;
>   	dma_addr_t *dma;
>   	u32 i;
>   
>   	if (pool->dma_pages_cnt == 0)
>   		return;
>   
> +	dma_map = xp_find_dma_map(pool);
> +	if (!dma_map) {
> +		WARN(1, "Could not find dma_map for device");
> +		return;
> +	}
> +
>   	for (i = 0; i < pool->dma_pages_cnt; i++) {
>   		dma = &pool->dma_pages[i];
>   		if (*dma) {
> @@ -250,6 +307,7 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
>   		}
>   	}
>   
> +	xp_put_dma_map(dma_map);
>   	kvfree(pool->dma_pages);
>   	pool->dma_pages_cnt = 0;
>   	pool->dev = NULL;
> @@ -271,14 +329,29 @@ static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
>   int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
>   	       unsigned long attrs, struct page **pages, u32 nr_pages)
>   {
> +	struct xsk_dma_map *dma_map;
>   	dma_addr_t dma;
>   	u32 i;
>   
> +	dma_map = xp_find_dma_map(pool);
> +	if (dma_map) {
> +		pool->dma_pages = dma_map->dma_pages;
> +		refcount_inc(&dma_map->users);
> +		return 0;
> +	}
> +
> +	dma_map = xp_create_dma_map(pool);
> +	if (!dma_map)
> +		return -ENOMEM;
> +
>   	pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
>   				   GFP_KERNEL);
> -	if (!pool->dma_pages)
> +	if (!pool->dma_pages) {
> +		xp_destroy_dma_map(dma_map);
>   		return -ENOMEM;
> +	}
>   
> +	dma_map->dma_pages = pool->dma_pages;
>   	pool->dev = dev;
>   	pool->dma_pages_cnt = nr_pages;
>   	pool->dma_need_sync = false;
> @@ -288,6 +361,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
>   					 DMA_BIDIRECTIONAL, attrs);
>   		if (dma_mapping_error(dev, dma)) {
>   			xp_dma_unmap(pool, attrs);
> +			xp_destroy_dma_map(dma_map);
>   			return -ENOMEM;
>   		}
>   		if (dma_need_sync(dev, dma))
>
Maxim Mikityanskiy July 28, 2020, 8:59 a.m. UTC | #2
On 2020-07-21 08:04, Magnus Karlsson wrote:
> Enable the sharing of dma mappings by moving them out from the buffer
> pool. Instead we put each dma mapped umem region in a list in the umem
> structure. If dma has already been mapped for this umem and device, it
> is not mapped again and the existing dma mappings are reused.
> 
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> ---
>   include/net/xdp_sock.h      |   1 +
>   include/net/xsk_buff_pool.h |   7 +++
>   net/xdp/xdp_umem.c          |   1 +
>   net/xdp/xsk_buff_pool.c     | 112 ++++++++++++++++++++++++++++++++++++--------
>   4 files changed, 102 insertions(+), 19 deletions(-)
> 
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 126d243..282aeba 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -30,6 +30,7 @@ struct xdp_umem {
>   	u8 flags;
>   	int id;
>   	bool zc;
> +	struct list_head xsk_dma_list;
>   };
>   
>   struct xsk_map {
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index 83f100c..8f1dc4c 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -28,6 +28,13 @@ struct xdp_buff_xsk {
>   	struct list_head free_list_node;
>   };
>   
> +struct xsk_dma_map {
> +	dma_addr_t *dma_pages;
> +	struct net_device *dev;
> +	refcount_t users;
> +	struct list_head list; /* Protected by the RTNL_LOCK */
> +};
> +
>   struct xsk_buff_pool {
>   	struct xsk_queue *fq;
>   	struct xsk_queue *cq;
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 372998d..cf27249 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -199,6 +199,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>   	umem->user = NULL;
>   	umem->flags = mr->flags;
>   
> +	INIT_LIST_HEAD(&umem->xsk_dma_list);
>   	refcount_set(&umem->users, 1);
>   
>   	err = xdp_umem_account_pages(umem);
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index c563874..ca74a3e 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -104,6 +104,25 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
>   }
>   EXPORT_SYMBOL(xp_set_rxq_info);
>   
> +static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
> +{
> +	struct netdev_bpf bpf;
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (pool->umem->zc) {
> +		bpf.command = XDP_SETUP_XSK_POOL;
> +		bpf.xsk.pool = NULL;
> +		bpf.xsk.queue_id = pool->queue_id;
> +
> +		err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
> +
> +		if (err)
> +			WARN(1, "Failed to disable zero-copy!\n");
> +	}
> +}
> +
>   int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
>   		  u16 queue_id, u16 flags)
>   {
> @@ -122,6 +141,8 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
>   	if (xsk_get_pool_from_qid(netdev, queue_id))
>   		return -EBUSY;
>   
> +	pool->netdev = netdev;
> +	pool->queue_id = queue_id;
>   	err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
>   	if (err)
>   		return err;
> @@ -155,11 +176,15 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
>   	if (err)
>   		goto err_unreg_pool;
>   
> -	pool->netdev = netdev;
> -	pool->queue_id = queue_id;
> +	if (!pool->dma_pages) {
> +		WARN(1, "Driver did not DMA map zero-copy buffers");
> +		goto err_unreg_xsk;
> +	}
>   	pool->umem->zc = true;
>   	return 0;
>   
> +err_unreg_xsk:
> +	xp_disable_drv_zc(pool);
>   err_unreg_pool:
>   	if (!force_zc)
>   		err = 0; /* fallback to copy mode */
> @@ -170,25 +195,10 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
>   
>   void xp_clear_dev(struct xsk_buff_pool *pool)
>   {
> -	struct netdev_bpf bpf;
> -	int err;
> -
> -	ASSERT_RTNL();
> -
>   	if (!pool->netdev)
>   		return;
>   
> -	if (pool->umem->zc) {
> -		bpf.command = XDP_SETUP_XSK_POOL;
> -		bpf.xsk.pool = NULL;
> -		bpf.xsk.queue_id = pool->queue_id;
> -
> -		err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
> -
> -		if (err)
> -			WARN(1, "Failed to disable zero-copy!\n");
> -	}
> -
> +	xp_disable_drv_zc(pool);
>   	xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
>   	dev_put(pool->netdev);
>   	pool->netdev = NULL;
> @@ -233,14 +243,61 @@ void xp_put_pool(struct xsk_buff_pool *pool)
>   	}
>   }
>   
> +static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
> +{
> +	struct xsk_dma_map *dma_map;
> +
> +	list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) {
> +		if (dma_map->dev == pool->netdev)
> +			return dma_map;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
> +{
> +	list_del(&dma_map->list);
> +	kfree(dma_map);
> +}
> +
> +static void xp_put_dma_map(struct xsk_dma_map *dma_map)
> +{
> +	if (!refcount_dec_and_test(&dma_map->users))
> +		return;
> +
> +	xp_destroy_dma_map(dma_map);
> +}
> +
> +static struct xsk_dma_map *xp_create_dma_map(struct xsk_buff_pool *pool)
> +{
> +	struct xsk_dma_map *dma_map;
> +
> +	dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
> +	if (!dma_map)
> +		return NULL;
> +
> +	dma_map->dev = pool->netdev;
> +	refcount_set(&dma_map->users, 1);
> +	list_add(&dma_map->list, &pool->umem->xsk_dma_list);
> +	return dma_map;
> +}
> +
>   void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
>   {
> +	struct xsk_dma_map *dma_map;
>   	dma_addr_t *dma;
>   	u32 i;
>   
>   	if (pool->dma_pages_cnt == 0)
>   		return;
>   
> +	dma_map = xp_find_dma_map(pool);
> +	if (!dma_map) {
> +		WARN(1, "Could not find dma_map for device");
> +		return;
> +	}
> +
>   	for (i = 0; i < pool->dma_pages_cnt; i++) {
>   		dma = &pool->dma_pages[i];
>   		if (*dma) {
> @@ -250,6 +307,7 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
>   		}
>   	}
>   
> +	xp_put_dma_map(dma_map);

I believe that the logic in this function is not correct. Basically, the 
driver calls xp_dma_[un]map when a socket is enabled/disabled on a given 
queue of a given netdev. On xp_dma_map, if the UMEM is already mapped 
for that netdev, we skip all the logic, but on xp_dma_unmap you unmap 
the pages unconditionally, only after that you check the refcount. This 
is not symmetric, and the pages will be unmapped when the first socket 
is closed, rendering the rest of sockets unusable.

>   	kvfree(pool->dma_pages);
>   	pool->dma_pages_cnt = 0;
>   	pool->dev = NULL;
> @@ -271,14 +329,29 @@ static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
>   int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
>   	       unsigned long attrs, struct page **pages, u32 nr_pages)
>   {
> +	struct xsk_dma_map *dma_map;
>   	dma_addr_t dma;
>   	u32 i;
>   
> +	dma_map = xp_find_dma_map(pool);
> +	if (dma_map) {
> +		pool->dma_pages = dma_map->dma_pages;
> +		refcount_inc(&dma_map->users);
> +		return 0;
> +	}
> +
> +	dma_map = xp_create_dma_map(pool);
> +	if (!dma_map)
> +		return -ENOMEM;
> +
>   	pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
>   				   GFP_KERNEL);
> -	if (!pool->dma_pages)
> +	if (!pool->dma_pages) {
> +		xp_destroy_dma_map(dma_map);
>   		return -ENOMEM;
> +	}
>   
> +	dma_map->dma_pages = pool->dma_pages;
>   	pool->dev = dev;
>   	pool->dma_pages_cnt = nr_pages;
>   	pool->dma_need_sync = false;
> @@ -288,6 +361,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
>   					 DMA_BIDIRECTIONAL, attrs);
>   		if (dma_mapping_error(dev, dma)) {
>   			xp_dma_unmap(pool, attrs);
> +			xp_destroy_dma_map(dma_map);
>   			return -ENOMEM;
>   		}
>   		if (dma_need_sync(dev, dma))
>
Magnus Karlsson July 29, 2020, 1:22 p.m. UTC | #3
On Tue, Jul 28, 2020 at 11:00 AM Maxim Mikityanskiy
<maximmi@mellanox.com> wrote:
>
> On 2020-07-21 08:04, Magnus Karlsson wrote:
> > Enable the sharing of dma mappings by moving them out from the buffer
> > pool. Instead we put each dma mapped umem region in a list in the umem
> > structure. If dma has already been mapped for this umem and device, it
> > is not mapped again and the existing dma mappings are reused.
> >
> > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> > ---
> >   include/net/xdp_sock.h      |   1 +
> >   include/net/xsk_buff_pool.h |   7 +++
> >   net/xdp/xdp_umem.c          |   1 +
> >   net/xdp/xsk_buff_pool.c     | 112 ++++++++++++++++++++++++++++++++++++--------
> >   4 files changed, 102 insertions(+), 19 deletions(-)
> >
> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> > index 126d243..282aeba 100644
> > --- a/include/net/xdp_sock.h
> > +++ b/include/net/xdp_sock.h
> > @@ -30,6 +30,7 @@ struct xdp_umem {
> >       u8 flags;
> >       int id;
> >       bool zc;
> > +     struct list_head xsk_dma_list;
> >   };
> >
> >   struct xsk_map {
> > diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> > index 83f100c..8f1dc4c 100644
> > --- a/include/net/xsk_buff_pool.h
> > +++ b/include/net/xsk_buff_pool.h
> > @@ -28,6 +28,13 @@ struct xdp_buff_xsk {
> >       struct list_head free_list_node;
> >   };
> >
> > +struct xsk_dma_map {
> > +     dma_addr_t *dma_pages;
> > +     struct net_device *dev;
> > +     refcount_t users;
> > +     struct list_head list; /* Protected by the RTNL_LOCK */
> > +};
> > +
> >   struct xsk_buff_pool {
> >       struct xsk_queue *fq;
> >       struct xsk_queue *cq;
> > diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> > index 372998d..cf27249 100644
> > --- a/net/xdp/xdp_umem.c
> > +++ b/net/xdp/xdp_umem.c
> > @@ -199,6 +199,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> >       umem->user = NULL;
> >       umem->flags = mr->flags;
> >
> > +     INIT_LIST_HEAD(&umem->xsk_dma_list);
> >       refcount_set(&umem->users, 1);
> >
> >       err = xdp_umem_account_pages(umem);
> > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > index c563874..ca74a3e 100644
> > --- a/net/xdp/xsk_buff_pool.c
> > +++ b/net/xdp/xsk_buff_pool.c
> > @@ -104,6 +104,25 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
> >   }
> >   EXPORT_SYMBOL(xp_set_rxq_info);
> >
> > +static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
> > +{
> > +     struct netdev_bpf bpf;
> > +     int err;
> > +
> > +     ASSERT_RTNL();
> > +
> > +     if (pool->umem->zc) {
> > +             bpf.command = XDP_SETUP_XSK_POOL;
> > +             bpf.xsk.pool = NULL;
> > +             bpf.xsk.queue_id = pool->queue_id;
> > +
> > +             err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
> > +
> > +             if (err)
> > +                     WARN(1, "Failed to disable zero-copy!\n");
> > +     }
> > +}
> > +
> >   int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
> >                 u16 queue_id, u16 flags)
> >   {
> > @@ -122,6 +141,8 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
> >       if (xsk_get_pool_from_qid(netdev, queue_id))
> >               return -EBUSY;
> >
> > +     pool->netdev = netdev;
> > +     pool->queue_id = queue_id;
> >       err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
> >       if (err)
> >               return err;
> > @@ -155,11 +176,15 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
> >       if (err)
> >               goto err_unreg_pool;
> >
> > -     pool->netdev = netdev;
> > -     pool->queue_id = queue_id;
> > +     if (!pool->dma_pages) {
> > +             WARN(1, "Driver did not DMA map zero-copy buffers");
> > +             goto err_unreg_xsk;
> > +     }
> >       pool->umem->zc = true;
> >       return 0;
> >
> > +err_unreg_xsk:
> > +     xp_disable_drv_zc(pool);
> >   err_unreg_pool:
> >       if (!force_zc)
> >               err = 0; /* fallback to copy mode */
> > @@ -170,25 +195,10 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
> >
> >   void xp_clear_dev(struct xsk_buff_pool *pool)
> >   {
> > -     struct netdev_bpf bpf;
> > -     int err;
> > -
> > -     ASSERT_RTNL();
> > -
> >       if (!pool->netdev)
> >               return;
> >
> > -     if (pool->umem->zc) {
> > -             bpf.command = XDP_SETUP_XSK_POOL;
> > -             bpf.xsk.pool = NULL;
> > -             bpf.xsk.queue_id = pool->queue_id;
> > -
> > -             err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
> > -
> > -             if (err)
> > -                     WARN(1, "Failed to disable zero-copy!\n");
> > -     }
> > -
> > +     xp_disable_drv_zc(pool);
> >       xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
> >       dev_put(pool->netdev);
> >       pool->netdev = NULL;
> > @@ -233,14 +243,61 @@ void xp_put_pool(struct xsk_buff_pool *pool)
> >       }
> >   }
> >
> > +static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
> > +{
> > +     struct xsk_dma_map *dma_map;
> > +
> > +     list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) {
> > +             if (dma_map->dev == pool->netdev)
> > +                     return dma_map;
> > +     }
> > +
> > +     return NULL;
> > +}
> > +
> > +static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
> > +{
> > +     list_del(&dma_map->list);
> > +     kfree(dma_map);
> > +}
> > +
> > +static void xp_put_dma_map(struct xsk_dma_map *dma_map)
> > +{
> > +     if (!refcount_dec_and_test(&dma_map->users))
> > +             return;
> > +
> > +     xp_destroy_dma_map(dma_map);
> > +}
> > +
> > +static struct xsk_dma_map *xp_create_dma_map(struct xsk_buff_pool *pool)
> > +{
> > +     struct xsk_dma_map *dma_map;
> > +
> > +     dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
> > +     if (!dma_map)
> > +             return NULL;
> > +
> > +     dma_map->dev = pool->netdev;
> > +     refcount_set(&dma_map->users, 1);
> > +     list_add(&dma_map->list, &pool->umem->xsk_dma_list);
> > +     return dma_map;
> > +}
> > +
> >   void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
> >   {
> > +     struct xsk_dma_map *dma_map;
> >       dma_addr_t *dma;
> >       u32 i;
> >
> >       if (pool->dma_pages_cnt == 0)
> >               return;
> >
> > +     dma_map = xp_find_dma_map(pool);
> > +     if (!dma_map) {
> > +             WARN(1, "Could not find dma_map for device");
> > +             return;
> > +     }
> > +
> >       for (i = 0; i < pool->dma_pages_cnt; i++) {
> >               dma = &pool->dma_pages[i];
> >               if (*dma) {
> > @@ -250,6 +307,7 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
> >               }
> >       }
> >
> > +     xp_put_dma_map(dma_map);
>
> I believe that the logic in this function is not correct. Basically, the
> driver calls xp_dma_[un]map when a socket is enabled/disabled on a given
> queue of a given netdev. On xp_dma_map, if the UMEM is already mapped
> for that netdev, we skip all the logic, but on xp_dma_unmap you unmap
> the pages unconditionally, only after that you check the refcount. This
> is not symmetric, and the pages will be unmapped when the first socket
> is closed, rendering the rest of sockets unusable.

Thanks for spotting this. Will fix.

/Magnus

> >       kvfree(pool->dma_pages);
> >       pool->dma_pages_cnt = 0;
> >       pool->dev = NULL;
> > @@ -271,14 +329,29 @@ static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
> >   int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
> >              unsigned long attrs, struct page **pages, u32 nr_pages)
> >   {
> > +     struct xsk_dma_map *dma_map;
> >       dma_addr_t dma;
> >       u32 i;
> >
> > +     dma_map = xp_find_dma_map(pool);
> > +     if (dma_map) {
> > +             pool->dma_pages = dma_map->dma_pages;
> > +             refcount_inc(&dma_map->users);
> > +             return 0;
> > +     }
> > +
> > +     dma_map = xp_create_dma_map(pool);
> > +     if (!dma_map)
> > +             return -ENOMEM;
> > +
> >       pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
> >                                  GFP_KERNEL);
> > -     if (!pool->dma_pages)
> > +     if (!pool->dma_pages) {
> > +             xp_destroy_dma_map(dma_map);
> >               return -ENOMEM;
> > +     }
> >
> > +     dma_map->dma_pages = pool->dma_pages;
> >       pool->dev = dev;
> >       pool->dma_pages_cnt = nr_pages;
> >       pool->dma_need_sync = false;
> > @@ -288,6 +361,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
> >                                        DMA_BIDIRECTIONAL, attrs);
> >               if (dma_mapping_error(dev, dma)) {
> >                       xp_dma_unmap(pool, attrs);
> > +                     xp_destroy_dma_map(dma_map);
> >                       return -ENOMEM;
> >               }
> >               if (dma_need_sync(dev, dma))
> >
>
diff mbox series

Patch

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 126d243..282aeba 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -30,6 +30,7 @@  struct xdp_umem {
 	u8 flags;
 	int id;
 	bool zc;
+	struct list_head xsk_dma_list;
 };
 
 struct xsk_map {
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 83f100c..8f1dc4c 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -28,6 +28,13 @@  struct xdp_buff_xsk {
 	struct list_head free_list_node;
 };
 
+struct xsk_dma_map {
+	dma_addr_t *dma_pages;
+	struct net_device *dev;
+	refcount_t users;
+	struct list_head list; /* Protected by the RTNL_LOCK */
+};
+
 struct xsk_buff_pool {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 372998d..cf27249 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -199,6 +199,7 @@  static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	umem->user = NULL;
 	umem->flags = mr->flags;
 
+	INIT_LIST_HEAD(&umem->xsk_dma_list);
 	refcount_set(&umem->users, 1);
 
 	err = xdp_umem_account_pages(umem);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index c563874..ca74a3e 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -104,6 +104,25 @@  void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
 }
 EXPORT_SYMBOL(xp_set_rxq_info);
 
+static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
+{
+	struct netdev_bpf bpf;
+	int err;
+
+	ASSERT_RTNL();
+
+	if (pool->umem->zc) {
+		bpf.command = XDP_SETUP_XSK_POOL;
+		bpf.xsk.pool = NULL;
+		bpf.xsk.queue_id = pool->queue_id;
+
+		err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
+
+		if (err)
+			WARN(1, "Failed to disable zero-copy!\n");
+	}
+}
+
 int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
 		  u16 queue_id, u16 flags)
 {
@@ -122,6 +141,8 @@  int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
 	if (xsk_get_pool_from_qid(netdev, queue_id))
 		return -EBUSY;
 
+	pool->netdev = netdev;
+	pool->queue_id = queue_id;
 	err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
 	if (err)
 		return err;
@@ -155,11 +176,15 @@  int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
 	if (err)
 		goto err_unreg_pool;
 
-	pool->netdev = netdev;
-	pool->queue_id = queue_id;
+	if (!pool->dma_pages) {
+		WARN(1, "Driver did not DMA map zero-copy buffers");
+		goto err_unreg_xsk;
+	}
 	pool->umem->zc = true;
 	return 0;
 
+err_unreg_xsk:
+	xp_disable_drv_zc(pool);
 err_unreg_pool:
 	if (!force_zc)
 		err = 0; /* fallback to copy mode */
@@ -170,25 +195,10 @@  int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev,
 
 void xp_clear_dev(struct xsk_buff_pool *pool)
 {
-	struct netdev_bpf bpf;
-	int err;
-
-	ASSERT_RTNL();
-
 	if (!pool->netdev)
 		return;
 
-	if (pool->umem->zc) {
-		bpf.command = XDP_SETUP_XSK_POOL;
-		bpf.xsk.pool = NULL;
-		bpf.xsk.queue_id = pool->queue_id;
-
-		err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
-
-		if (err)
-			WARN(1, "Failed to disable zero-copy!\n");
-	}
-
+	xp_disable_drv_zc(pool);
 	xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
 	dev_put(pool->netdev);
 	pool->netdev = NULL;
@@ -233,14 +243,61 @@  void xp_put_pool(struct xsk_buff_pool *pool)
 	}
 }
 
+static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
+{
+	struct xsk_dma_map *dma_map;
+
+	list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) {
+		if (dma_map->dev == pool->netdev)
+			return dma_map;
+	}
+
+	return NULL;
+}
+
+static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
+{
+	list_del(&dma_map->list);
+	kfree(dma_map);
+}
+
+static void xp_put_dma_map(struct xsk_dma_map *dma_map)
+{
+	if (!refcount_dec_and_test(&dma_map->users))
+		return;
+
+	xp_destroy_dma_map(dma_map);
+}
+
+static struct xsk_dma_map *xp_create_dma_map(struct xsk_buff_pool *pool)
+{
+	struct xsk_dma_map *dma_map;
+
+	dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
+	if (!dma_map)
+		return NULL;
+
+	dma_map->dev = pool->netdev;
+	refcount_set(&dma_map->users, 1);
+	list_add(&dma_map->list, &pool->umem->xsk_dma_list);
+	return dma_map;
+}
+
 void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
 {
+	struct xsk_dma_map *dma_map;
 	dma_addr_t *dma;
 	u32 i;
 
 	if (pool->dma_pages_cnt == 0)
 		return;
 
+	dma_map = xp_find_dma_map(pool);
+	if (!dma_map) {
+		WARN(1, "Could not find dma_map for device");
+		return;
+	}
+
 	for (i = 0; i < pool->dma_pages_cnt; i++) {
 		dma = &pool->dma_pages[i];
 		if (*dma) {
@@ -250,6 +307,7 @@  void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
 		}
 	}
 
+	xp_put_dma_map(dma_map);
 	kvfree(pool->dma_pages);
 	pool->dma_pages_cnt = 0;
 	pool->dev = NULL;
@@ -271,14 +329,29 @@  static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
 int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 	       unsigned long attrs, struct page **pages, u32 nr_pages)
 {
+	struct xsk_dma_map *dma_map;
 	dma_addr_t dma;
 	u32 i;
 
+	dma_map = xp_find_dma_map(pool);
+	if (dma_map) {
+		pool->dma_pages = dma_map->dma_pages;
+		refcount_inc(&dma_map->users);
+		return 0;
+	}
+
+	dma_map = xp_create_dma_map(pool);
+	if (!dma_map)
+		return -ENOMEM;
+
 	pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
 				   GFP_KERNEL);
-	if (!pool->dma_pages)
+	if (!pool->dma_pages) {
+		xp_destroy_dma_map(dma_map);
 		return -ENOMEM;
+	}
 
+	dma_map->dma_pages = pool->dma_pages;
 	pool->dev = dev;
 	pool->dma_pages_cnt = nr_pages;
 	pool->dma_need_sync = false;
@@ -288,6 +361,7 @@  int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 					 DMA_BIDIRECTIONAL, attrs);
 		if (dma_mapping_error(dev, dma)) {
 			xp_dma_unmap(pool, attrs);
+			xp_destroy_dma_map(dma_map);
 			return -ENOMEM;
 		}
 		if (dma_need_sync(dev, dma))