
[net-next,2/3] net: page_pool: add the possibility to sync DMA memory for non-coherent devices

Message ID 68229f90060d01c1457ac945b2f6524e2aa27d05.1573383212.git.lorenzo@kernel.org
State: Changes Requested
Delegated to: David Miller
Series: add DMA sync capability to page_pool API

Commit Message

Lorenzo Bianconi Nov. 10, 2019, 12:09 p.m. UTC
Introduce the following parameters in order to add the possibility to sync
DMA memory area before putting allocated buffers in the page_pool caches:
- sync: set to 1 if device is non cache-coherent and needs to flush DMA
  area
- offset: DMA address offset where the DMA engine starts copying rx data
- max_len: maximum DMA memory size page_pool is allowed to flush. This
  is currently used in __page_pool_alloc_pages_slow routine when pages
  are allocated from page allocator
These parameters are supposed to be set by device drivers

Tested-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/net/page_pool.h | 11 +++++++----
 net/core/page_pool.c    | 39 +++++++++++++++++++++++++++++++++------
 2 files changed, 40 insertions(+), 10 deletions(-)

Comments

Jesper Dangaard Brouer Nov. 11, 2019, 4:48 p.m. UTC | #1
On Sun, 10 Nov 2019 14:09:09 +0200
Lorenzo Bianconi <lorenzo@kernel.org> wrote:

> Introduce the following parameters in order to add the possibility to sync
> DMA memory area before putting allocated buffers in the page_pool caches:

> - sync: set to 1 if device is non cache-coherent and needs to flush DMA area

I don't agree that this is only for non cache-coherent devices.

This change is generally for all device drivers.  By setting 'sync'
(which I prefer to rename 'dma_sync') the driver requests that page_pool
take over doing the DMA-sync-for-device. (Very important: DMA-sync-for-CPU
is still the driver's responsibility.)  Drivers can benefit from removing
their calls to dma_sync_single_for_device().

We need to define the meaning/semantics of this setting (my definition):
- This means that all pages the driver gets from page_pool will be
  DMA-synced-for-device.
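
To make the intended driver-side usage concrete, here is a minimal sketch
(the values are illustrative only, and 'struct drv_priv' is a made-up
driver context, not something from this patch):

/* Sketch: a driver opting in to page_pool-managed DMA-sync-for-device */
static struct page_pool *drv_create_page_pool(struct drv_priv *priv)
{
	struct page_pool_params pp_params = {
		.order		= 0,
		.flags		= PP_FLAG_DMA_MAP,
		.pool_size	= 1024,
		.nid		= NUMA_NO_NODE,
		.dev		= priv->dev,
		.dma_dir	= DMA_FROM_DEVICE,
		/* fields added by this patch */
		.offset		= XDP_PACKET_HEADROOM,
		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
		.sync		= 1,
	};

	return page_pool_create(&pp_params);
}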

> - offset: DMA address offset where the DMA engine starts copying rx data

> - max_len: maximum DMA memory size page_pool is allowed to flush. This
>   is currently used in __page_pool_alloc_pages_slow routine when pages
>   are allocated from page allocator

Implementation-wise (you did as I suggested offlist), this does the
DMA-sync-for-device at return time, i.e. page_pool_put_page() time,
because we (often) know the length that was/can be touched by the CPU.
Knowing this length is key to the optimization.

I also think you/we need to explain why this optimization is correct;
my attempt:

This optimization reduces the length of the DMA-sync-for-device.  The
optimization is valid because the page is initially DMA-synced-for-device,
as defined via max_len.  At driver RX time, the driver will do a
DMA-sync-for-CPU on the memory for the packet length.  What matters is
the memory occupied by the packet payload, because this is the memory the
CPU is allowed to read and modify.  If the CPU has not written into a
cache-line, then we know that the CPU will not be flushing it, thus it
doesn't need a DMA-sync-for-device.  As we don't track which cache-lines
were written into, simply use the full packet length as dma_sync_size at
page_pool recycle time.  This also takes into account any tail-extend.
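
As a rough sketch of what that looks like from the driver side (the
function and 'rxq' fields below are made up for illustration; the
wrapper helpers would need to forward dma_sync_size, so the core
__page_pool_put_page() call is shown directly):

/* Illustration only: driver RX path under this scheme */
static void drv_rx_one(struct drv_rx_queue *rxq, struct page *page,
		       unsigned int pkt_len)
{
	/* DMA-sync-for-CPU stays in the driver, bounded by the packet length */
	dma_sync_single_for_cpu(rxq->dev, page->dma_addr + rxq->rx_offset,
				pkt_len, DMA_FROM_DEVICE);

	if (drv_run_xdp(rxq, page, pkt_len) == XDP_DROP)
		/* recycle: page_pool syncs-for-device only pkt_len bytes */
		__page_pool_put_page(rxq->page_pool, page, pkt_len, true);
}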


> These parameters are supposed to be set by device drivers


 
> Tested-by: Matteo Croce <mcroce@redhat.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/net/page_pool.h | 11 +++++++----
>  net/core/page_pool.c    | 39 +++++++++++++++++++++++++++++++++------
>  2 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 2cbcdbdec254..defbfd90ab46 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -65,6 +65,9 @@ struct page_pool_params {
>  	int		nid;  /* Numa node id to allocate from pages from */
>  	struct device	*dev; /* device, for DMA pre-mapping purposes */
>  	enum dma_data_direction dma_dir; /* DMA mapping direction */
> +	unsigned int	max_len; /* max DMA sync memory size */
> +	unsigned int	offset;  /* DMA addr offset */
> +	u8 sync;
>  };
>  
>  struct page_pool {
> @@ -150,8 +153,8 @@ static inline void page_pool_destroy(struct page_pool *pool)
>  }
>  
>  /* Never call this directly, use helpers below */
> -void __page_pool_put_page(struct page_pool *pool,
> -			  struct page *page, bool allow_direct);
> +void __page_pool_put_page(struct page_pool *pool, struct page *page,
> +			  unsigned int dma_sync_size, bool allow_direct);
>  
>  static inline void page_pool_put_page(struct page_pool *pool,
>  				      struct page *page, bool allow_direct)
> @@ -160,14 +163,14 @@ static inline void page_pool_put_page(struct page_pool *pool,
>  	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
>  	 */
>  #ifdef CONFIG_PAGE_POOL
> -	__page_pool_put_page(pool, page, allow_direct);
> +	__page_pool_put_page(pool, page, 0, allow_direct);
>  #endif
>  }
>  /* Very limited use-cases allow recycle direct */
>  static inline void page_pool_recycle_direct(struct page_pool *pool,
>  					    struct page *page)
>  {
> -	__page_pool_put_page(pool, page, true);
> +	__page_pool_put_page(pool, page, 0, true);
>  }
>  
>  /* API user MUST have disconnected alloc-side (not allowed to call
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 5bc65587f1c4..af9514c2d15b 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -112,6 +112,17 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
>  	return page;
>  }
>  
> +/* Used for non-coherent devices */
> +static void page_pool_dma_sync_for_device(struct page_pool *pool,
> +					  struct page *page,
> +					  unsigned int dma_sync_size)
> +{
> +	dma_sync_size = min(dma_sync_size, pool->p.max_len);
> +	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
> +					 pool->p.offset, dma_sync_size,
> +					 pool->p.dma_dir);
> +}
> +
>  /* slow path */
>  noinline
>  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> @@ -156,6 +167,10 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
>  	}
>  	page->dma_addr = dma;
>  
> +	/* non-coherent devices - flush memory */
> +	if (pool->p.sync)
> +		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
> +
>  skip_dma_map:
>  	/* Track how many pages are held 'in-flight' */
>  	pool->pages_state_hold_cnt++;
> @@ -255,7 +270,8 @@ static void __page_pool_return_page(struct page_pool *pool, struct page *page)
>  }
>  
>  static bool __page_pool_recycle_into_ring(struct page_pool *pool,
> -				   struct page *page)
> +					  struct page *page,
> +					  unsigned int dma_sync_size)
>  {
>  	int ret;
>  	/* BH protection not needed if current is serving softirq */
> @@ -264,6 +280,10 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
>  	else
>  		ret = ptr_ring_produce_bh(&pool->ring, page);
>  
> +	/* non-coherent devices - flush memory */
> +	if (ret == 0 && pool->p.sync)
> +		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
> +
>  	return (ret == 0) ? true : false;
>  }
>  
> @@ -273,18 +293,23 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
>   * Caller must provide appropriate safe context.
>   */
>  static bool __page_pool_recycle_direct(struct page *page,
> -				       struct page_pool *pool)
> +				       struct page_pool *pool,
> +				       unsigned int dma_sync_size)
>  {
>  	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
>  		return false;
>  
>  	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
>  	pool->alloc.cache[pool->alloc.count++] = page;
> +
> +	/* non-coherent devices - flush memory */
> +	if (pool->p.sync)
> +		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
>  	return true;
>  }
>  
> -void __page_pool_put_page(struct page_pool *pool,
> -			  struct page *page, bool allow_direct)
> +void __page_pool_put_page(struct page_pool *pool, struct page *page,
> +			  unsigned int dma_sync_size, bool allow_direct)
>  {
>  	/* This allocator is optimized for the XDP mode that uses
>  	 * one-frame-per-page, but have fallbacks that act like the
> @@ -296,10 +321,12 @@ void __page_pool_put_page(struct page_pool *pool,
>  		/* Read barrier done in page_ref_count / READ_ONCE */
>  
>  		if (allow_direct && in_serving_softirq())
> -			if (__page_pool_recycle_direct(page, pool))
> +			if (__page_pool_recycle_direct(page, pool,
> +						       dma_sync_size))
>  				return;
>  
> -		if (!__page_pool_recycle_into_ring(pool, page)) {
> +		if (!__page_pool_recycle_into_ring(pool, page,
> +						   dma_sync_size)) {
>  			/* Cache full, fallback to free pages */
>  			__page_pool_return_page(pool, page);
>  		}
Lorenzo Bianconi Nov. 11, 2019, 7:11 p.m. UTC | #2
> On Sun, 10 Nov 2019 14:09:09 +0200
> Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> 
> > Introduce the following parameters in order to add the possibility to sync
> > DMA memory area before putting allocated buffers in the page_pool caches:
> 
> > - sync: set to 1 if device is non cache-coherent and needs to flush DMA area
> 

Hi Jesper,

thx for the review

> I don't agree that this is only for non cache-coherent devices.
> 
> This change is generally for all device drivers.  Via setting 'sync'
> (which I prefer to rename 'dma_sync') driver request that page_pool
> takes over doing DMA-sync-for-device. (Very important, DMA-sync-for-CPU
> is still drivers responsibility).  Drivers can benefit from removing
> their calls to dma_sync_single_for_device().
> 
> We need to define meaning/semantics of this setting (my definition):
> - This means that all pages that driver gets from page_pool, will be
>   DMA-synced-for-device.

ack, will fix it in v2

> 
> > - offset: DMA address offset where the DMA engine starts copying rx data
> 
> > - max_len: maximum DMA memory size page_pool is allowed to flush. This
> >   is currently used in __page_pool_alloc_pages_slow routine when pages
> >   are allocated from page allocator
> 
> Implementation wise (you did as I suggested offlist), and does the
> DMA-sync-for-device at return-time page_pool_put_page() time, because
> we (often) know the length that was/can touched by CPU.  This is key to
> the optimization, that we know this length.

Right, when refilling the cache we know the exact length that was/can be touched by the CPU.

> 
> I also think you/we need to explain why this optimization is correct,
> my attempt: 
> 
> This optimization reduce the length of the DMA-sync-for-device.  The
> optimization is valid, because page is initially DMA-synced-for-device,
> as defined via max_len.  At driver RX time, the driver will do a
> DMA-sync-for-CPU on the memory for the packet length.  What is
> important is the memory occupied by packet payload, because this is the
> memory CPU is allowed to read and modify.  If CPU have not written into
> a cache-line, then we know that CPU will not be flushing this, thus it
> doesn't need a DMA-sync-for-device.  As we don't track cache-lines
> written into, simply use the full packet length as dma_sync_size, at
> page_pool recycle time.  This also take into account any tail-extend.

ack, will update it in v2

Regards,
Lorenzo

> 
> 
> > These parameters are supposed to be set by device drivers
> 
> 
>  
> > Tested-by: Matteo Croce <mcroce@redhat.com>
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >  include/net/page_pool.h | 11 +++++++----
> >  net/core/page_pool.c    | 39 +++++++++++++++++++++++++++++++++------
> >  2 files changed, 40 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> > index 2cbcdbdec254..defbfd90ab46 100644
> > --- a/include/net/page_pool.h
> > +++ b/include/net/page_pool.h
> > @@ -65,6 +65,9 @@ struct page_pool_params {
> >  	int		nid;  /* Numa node id to allocate from pages from */
> >  	struct device	*dev; /* device, for DMA pre-mapping purposes */
> >  	enum dma_data_direction dma_dir; /* DMA mapping direction */
> > +	unsigned int	max_len; /* max DMA sync memory size */
> > +	unsigned int	offset;  /* DMA addr offset */
> > +	u8 sync;
> >  };
> >  
> >  struct page_pool {
> > @@ -150,8 +153,8 @@ static inline void page_pool_destroy(struct page_pool *pool)
> >  }
> >  
> >  /* Never call this directly, use helpers below */
> > -void __page_pool_put_page(struct page_pool *pool,
> > -			  struct page *page, bool allow_direct);
> > +void __page_pool_put_page(struct page_pool *pool, struct page *page,
> > +			  unsigned int dma_sync_size, bool allow_direct);
> >  
> >  static inline void page_pool_put_page(struct page_pool *pool,
> >  				      struct page *page, bool allow_direct)
> > @@ -160,14 +163,14 @@ static inline void page_pool_put_page(struct page_pool *pool,
> >  	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
> >  	 */
> >  #ifdef CONFIG_PAGE_POOL
> > -	__page_pool_put_page(pool, page, allow_direct);
> > +	__page_pool_put_page(pool, page, 0, allow_direct);
> >  #endif
> >  }
> >  /* Very limited use-cases allow recycle direct */
> >  static inline void page_pool_recycle_direct(struct page_pool *pool,
> >  					    struct page *page)
> >  {
> > -	__page_pool_put_page(pool, page, true);
> > +	__page_pool_put_page(pool, page, 0, true);
> >  }
> >  
> >  /* API user MUST have disconnected alloc-side (not allowed to call
> > diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> > index 5bc65587f1c4..af9514c2d15b 100644
> > --- a/net/core/page_pool.c
> > +++ b/net/core/page_pool.c
> > @@ -112,6 +112,17 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
> >  	return page;
> >  }
> >  
> > +/* Used for non-coherent devices */
> > +static void page_pool_dma_sync_for_device(struct page_pool *pool,
> > +					  struct page *page,
> > +					  unsigned int dma_sync_size)
> > +{
> > +	dma_sync_size = min(dma_sync_size, pool->p.max_len);
> > +	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
> > +					 pool->p.offset, dma_sync_size,
> > +					 pool->p.dma_dir);
> > +}
> > +
> >  /* slow path */
> >  noinline
> >  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> > @@ -156,6 +167,10 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> >  	}
> >  	page->dma_addr = dma;
> >  
> > +	/* non-coherent devices - flush memory */
> > +	if (pool->p.sync)
> > +		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
> > +
> >  skip_dma_map:
> >  	/* Track how many pages are held 'in-flight' */
> >  	pool->pages_state_hold_cnt++;
> > @@ -255,7 +270,8 @@ static void __page_pool_return_page(struct page_pool *pool, struct page *page)
> >  }
> >  
> >  static bool __page_pool_recycle_into_ring(struct page_pool *pool,
> > -				   struct page *page)
> > +					  struct page *page,
> > +					  unsigned int dma_sync_size)
> >  {
> >  	int ret;
> >  	/* BH protection not needed if current is serving softirq */
> > @@ -264,6 +280,10 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
> >  	else
> >  		ret = ptr_ring_produce_bh(&pool->ring, page);
> >  
> > +	/* non-coherent devices - flush memory */
> > +	if (ret == 0 && pool->p.sync)
> > +		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
> > +
> >  	return (ret == 0) ? true : false;
> >  }
> >  
> > @@ -273,18 +293,23 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
> >   * Caller must provide appropriate safe context.
> >   */
> >  static bool __page_pool_recycle_direct(struct page *page,
> > -				       struct page_pool *pool)
> > +				       struct page_pool *pool,
> > +				       unsigned int dma_sync_size)
> >  {
> >  	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
> >  		return false;
> >  
> >  	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
> >  	pool->alloc.cache[pool->alloc.count++] = page;
> > +
> > +	/* non-coherent devices - flush memory */
> > +	if (pool->p.sync)
> > +		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
> >  	return true;
> >  }
> >  
> > -void __page_pool_put_page(struct page_pool *pool,
> > -			  struct page *page, bool allow_direct)
> > +void __page_pool_put_page(struct page_pool *pool, struct page *page,
> > +			  unsigned int dma_sync_size, bool allow_direct)
> >  {
> >  	/* This allocator is optimized for the XDP mode that uses
> >  	 * one-frame-per-page, but have fallbacks that act like the
> > @@ -296,10 +321,12 @@ void __page_pool_put_page(struct page_pool *pool,
> >  		/* Read barrier done in page_ref_count / READ_ONCE */
> >  
> >  		if (allow_direct && in_serving_softirq())
> > -			if (__page_pool_recycle_direct(page, pool))
> > +			if (__page_pool_recycle_direct(page, pool,
> > +						       dma_sync_size))
> >  				return;
> >  
> > -		if (!__page_pool_recycle_into_ring(pool, page)) {
> > +		if (!__page_pool_recycle_into_ring(pool, page,
> > +						   dma_sync_size)) {
> >  			/* Cache full, fallback to free pages */
> >  			__page_pool_return_page(pool, page);
> >  		}
> 
> 
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
>
Jesper Dangaard Brouer Nov. 13, 2019, 8:29 a.m. UTC | #3
On Sun, 10 Nov 2019 14:09:09 +0200
Lorenzo Bianconi <lorenzo@kernel.org> wrote:

> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 2cbcdbdec254..defbfd90ab46 100644
[...]
> @@ -150,8 +153,8 @@ static inline void page_pool_destroy(struct page_pool *pool)
>  }
>  
>  /* Never call this directly, use helpers below */
> -void __page_pool_put_page(struct page_pool *pool,
> -			  struct page *page, bool allow_direct);
> +void __page_pool_put_page(struct page_pool *pool, struct page *page,
> +			  unsigned int dma_sync_size, bool allow_direct);
>  
>  static inline void page_pool_put_page(struct page_pool *pool,
>  				      struct page *page, bool allow_direct)
> @@ -160,14 +163,14 @@ static inline void page_pool_put_page(struct page_pool *pool,
>  	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
>  	 */
>  #ifdef CONFIG_PAGE_POOL
> -	__page_pool_put_page(pool, page, allow_direct);
> +	__page_pool_put_page(pool, page, 0, allow_direct);
>  #endif
>  }
>  /* Very limited use-cases allow recycle direct */
>  static inline void page_pool_recycle_direct(struct page_pool *pool,
>  					    struct page *page)
>  {
> -	__page_pool_put_page(pool, page, true);
> +	__page_pool_put_page(pool, page, 0, true);
>  }

We need to use a different "default" value than zero for 'dma_sync_size'
in the above calls.  I suggest either 0xFFFFFFFF or -1 (which as unsigned
is 0xFFFFFFFF).

The point is that in case the caller doesn't know the length (that the CPU
has had access to), then page_pool will need to sync with pool->p.max_len.

If a larger default value is chosen here, your code below takes
care of it via min(dma_sync_size, pool->p.max_len).
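
Concretely, the inline helpers could pass the "unknown length" sentinel,
e.g. (sketch, not the posted code):

static inline void page_pool_put_page(struct page_pool *pool,
				      struct page *page, bool allow_direct)
{
#ifdef CONFIG_PAGE_POOL
	/* -1 == "length unknown"; the min() in page_pool_dma_sync_for_device()
	 * clamps it to pool->p.max_len
	 */
	__page_pool_put_page(pool, page, -1, allow_direct);
#endif
}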

 
>  /* API user MUST have disconnected alloc-side (not allowed to call
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 5bc65587f1c4..af9514c2d15b 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -112,6 +112,17 @@ static struct page
> *__page_pool_get_cached(struct page_pool *pool) return page;
>  }
>  
> +/* Used for non-coherent devices */
> +static void page_pool_dma_sync_for_device(struct page_pool *pool,
> +					  struct page *page,
> +					  unsigned int dma_sync_size)
> +{
> +	dma_sync_size = min(dma_sync_size, pool->p.max_len);
> +	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
> +					 pool->p.offset, dma_sync_size,
> +					 pool->p.dma_dir);
> +}
Jonathan Lemon Nov. 14, 2019, 6:48 p.m. UTC | #4
On 10 Nov 2019, at 4:09, Lorenzo Bianconi wrote:

> Introduce the following parameters in order to add the possibility to 
> sync
> DMA memory area before putting allocated buffers in the page_pool 
> caches:
> - sync: set to 1 if device is non cache-coherent and needs to flush 
> DMA
>   area
> - offset: DMA address offset where the DMA engine starts copying rx 
> data
> - max_len: maximum DMA memory size page_pool is allowed to flush. This
>   is currently used in __page_pool_alloc_pages_slow routine when pages
>   are allocated from page allocator
> These parameters are supposed to be set by device drivers
>
> Tested-by: Matteo Croce <mcroce@redhat.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/net/page_pool.h | 11 +++++++----
>  net/core/page_pool.c    | 39 +++++++++++++++++++++++++++++++++------
>  2 files changed, 40 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 2cbcdbdec254..defbfd90ab46 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -65,6 +65,9 @@ struct page_pool_params {
>  	int		nid;  /* Numa node id to allocate from pages from */
>  	struct device	*dev; /* device, for DMA pre-mapping purposes */
>  	enum dma_data_direction dma_dir; /* DMA mapping direction */
> +	unsigned int	max_len; /* max DMA sync memory size */
> +	unsigned int	offset;  /* DMA addr offset */
> +	u8 sync;
>  };

How about using PP_FLAG_DMA_SYNC instead of another flag word?
(then it can also be gated on having DMA_MAP enabled)
Ilias Apalodimas Nov. 14, 2019, 6:53 p.m. UTC | #5
[...]
> > index 2cbcdbdec254..defbfd90ab46 100644
> > --- a/include/net/page_pool.h
> > +++ b/include/net/page_pool.h
> > @@ -65,6 +65,9 @@ struct page_pool_params {
> >  	int		nid;  /* Numa node id to allocate from pages from */
> >  	struct device	*dev; /* device, for DMA pre-mapping purposes */
> >  	enum dma_data_direction dma_dir; /* DMA mapping direction */
> > +	unsigned int	max_len; /* max DMA sync memory size */
> > +	unsigned int	offset;  /* DMA addr offset */
> > +	u8 sync;
> >  };
> 
> How about using PP_FLAG_DMA_SYNC instead of another flag word?
> (then it can also be gated on having DMA_MAP enabled)

You mean instead of the u8?
As you pointed out in your V2 comment of the mail, some cards don't sync back to
the device.
As the API tries to be generic, a u8 was chosen instead of a flag to cover these
use cases. So in time we'll change the semantics of this to 'always sync', 'don't
sync if it's an skb-only queue', etc.
The first case Lorenzo covered is syncing the required length only instead of the
full buffer.


Thanks
/Ilias
Jonathan Lemon Nov. 14, 2019, 8:27 p.m. UTC | #6
On 14 Nov 2019, at 10:53, Ilias Apalodimas wrote:

> [...]
>>> index 2cbcdbdec254..defbfd90ab46 100644
>>> --- a/include/net/page_pool.h
>>> +++ b/include/net/page_pool.h
>>> @@ -65,6 +65,9 @@ struct page_pool_params {
>>>  	int		nid;  /* Numa node id to allocate from pages from */
>>>  	struct device	*dev; /* device, for DMA pre-mapping purposes */
>>>  	enum dma_data_direction dma_dir; /* DMA mapping direction */
>>> +	unsigned int	max_len; /* max DMA sync memory size */
>>> +	unsigned int	offset;  /* DMA addr offset */
>>> +	u8 sync;
>>>  };
>>
>> How about using PP_FLAG_DMA_SYNC instead of another flag word?
>> (then it can also be gated on having DMA_MAP enabled)
>
> You mean instead of the u8?
> As you pointed out on your V2 comment of the mail, some cards don't 
> sync back to
> device.
> As the API tries to be generic a u8 was choosen instead of a flag to 
> cover these
> use cases. So in time we'll change the semantics of this to 'always 
> sync', 'dont
> sync if it's an skb-only queue' etc.
> The first case Lorenzo covered is sync the required len only instead 
> of the full
> buffer

Yes, I meant instead of:
+		.sync = 1,

Something like:
         .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC

Since .sync alone doesn't make sense if the page pool isn't performing any
DMA mapping, right?  Then existing drivers, if they're converted, can just
add the SYNC flag.

I did see the initial case where only the RX_BUF_SIZE (1536) is sync'd
instead of the full page.

Could you expand on your 'skb-only queue' comment?  I'm currently running
a variant of your patch where iommu-mapped pages are attached to skb's and
sent up the stack, then reclaimed on release.  I imagine that with this
change, they would have the full RX_BUF_SIZE sync'd before returning to the
driver, since the upper layers could basically do anything with the buffer
area.
Ilias Apalodimas Nov. 14, 2019, 8:42 p.m. UTC | #7
Hi Jonathan,

On Thu, Nov 14, 2019 at 12:27:40PM -0800, Jonathan Lemon wrote:
> 
> 
> On 14 Nov 2019, at 10:53, Ilias Apalodimas wrote:
> 
> > [...]
> > > > index 2cbcdbdec254..defbfd90ab46 100644
> > > > --- a/include/net/page_pool.h
> > > > +++ b/include/net/page_pool.h
> > > > @@ -65,6 +65,9 @@ struct page_pool_params {
> > > >  	int		nid;  /* Numa node id to allocate from pages from */
> > > >  	struct device	*dev; /* device, for DMA pre-mapping purposes */
> > > >  	enum dma_data_direction dma_dir; /* DMA mapping direction */
> > > > +	unsigned int	max_len; /* max DMA sync memory size */
> > > > +	unsigned int	offset;  /* DMA addr offset */
> > > > +	u8 sync;
> > > >  };
> > > 
> > > How about using PP_FLAG_DMA_SYNC instead of another flag word?
> > > (then it can also be gated on having DMA_MAP enabled)
> > 
> > You mean instead of the u8?
> > As you pointed out on your V2 comment of the mail, some cards don't sync
> > back to
> > device.
> > As the API tries to be generic a u8 was choosen instead of a flag to
> > cover these
> > use cases. So in time we'll change the semantics of this to 'always
> > sync', 'dont
> > sync if it's an skb-only queue' etc.
> > The first case Lorenzo covered is sync the required len only instead of
> > the full
> > buffer
> 
> Yes, I meant instead of:
> +		.sync = 1,
> 
> Something like:
>         .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC
> 
> Since .sync alone doesn't make sense if the page pool isn't performing any
> DMA mapping, right?  

Correct. If the sync happens regardless of the page pool mapping capabilities,
this will affect performance negatively as well (on non-coherent architectures) 

> Then existing drivers, if they're converted, can just
> add the SYNC flag.
> 
> I did see the initial case where only the RX_BUF_SIZE (1536) is sync'd
> instead of the full page.
> 
> Could you expand on your 'skb-only queue' comment?  I'm currently running
> a variant of your patch where iommu mapped pages are attached to skb's and
> sent up the stack, then reclaimed on release.  I imagine that with this
> change, they would have the full RX_BUF_SIZE sync'd before returning to the
> driver, since the upper layers could basically do anything with the buffer
> area.

The idea was that page_pool lives per device queue. Usually some queues are
reserved for XDP only. Since eBPF progs can change the packet we have to sync
for the device, before we fill in the device descriptors. 

For the skb-reserved queues, this depends on the 'anything'. If the rest of the
layers touch (or rather write) into that area, then we'll again have to sync.
If we know that the data has not been altered though, we can hand them back to
the device, skipping that sync, right?


Thanks
/Ilias
> -- 
> Jonathan
Jonathan Lemon Nov. 14, 2019, 9:04 p.m. UTC | #8
On 14 Nov 2019, at 12:42, Ilias Apalodimas wrote:

> Hi Jonathan,
>
> On Thu, Nov 14, 2019 at 12:27:40PM -0800, Jonathan Lemon wrote:
>>
>>
>> On 14 Nov 2019, at 10:53, Ilias Apalodimas wrote:
>>
>>> [...]
>>>>> index 2cbcdbdec254..defbfd90ab46 100644
>>>>> --- a/include/net/page_pool.h
>>>>> +++ b/include/net/page_pool.h
>>>>> @@ -65,6 +65,9 @@ struct page_pool_params {
>>>>>  	int		nid;  /* Numa node id to allocate from pages from */
>>>>>  	struct device	*dev; /* device, for DMA pre-mapping purposes */
>>>>>  	enum dma_data_direction dma_dir; /* DMA mapping direction */
>>>>> +	unsigned int	max_len; /* max DMA sync memory size */
>>>>> +	unsigned int	offset;  /* DMA addr offset */
>>>>> +	u8 sync;
>>>>>  };
>>>>
>>>> How about using PP_FLAG_DMA_SYNC instead of another flag word?
>>>> (then it can also be gated on having DMA_MAP enabled)
>>>
>>> You mean instead of the u8?
>>> As you pointed out on your V2 comment of the mail, some cards don't 
>>> sync
>>> back to
>>> device.
>>> As the API tries to be generic a u8 was choosen instead of a flag to
>>> cover these
>>> use cases. So in time we'll change the semantics of this to 'always
>>> sync', 'dont
>>> sync if it's an skb-only queue' etc.
>>> The first case Lorenzo covered is sync the required len only instead 
>>> of
>>> the full
>>> buffer
>>
>> Yes, I meant instead of:
>> +		.sync = 1,
>>
>> Something like:
>>         .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC
>>
>> Since .sync alone doesn't make sense if the page pool isn't 
>> performing any
>> DMA mapping, right?
>
> Correct. If the sync happens regardless of the page pool mapping 
> capabilities,
> this will affect performance negatively as well (on non-coherent 
> architectures)
>
>> Then existing drivers, if they're converted, can just
>> add the SYNC flag.
>>
>> I did see the initial case where only the RX_BUF_SIZE (1536) is 
>> sync'd
>> instead of the full page.
>>
>> Could you expand on your 'skb-only queue' comment?  I'm currently 
>> running
>> a variant of your patch where iommu mapped pages are attached to 
>> skb's and
>> sent up the stack, then reclaimed on release.  I imagine that with 
>> this
>> change, they would have the full RX_BUF_SIZE sync'd before returning 
>> to the
>> driver, since the upper layers could basically do anything with the 
>> buffer
>> area.
>
> The idea was that page_pool lives per device queue. Usually some 
> queues are
> reserved for XDP only. Since eBPF progs can change the packet we have 
> to sync
> for the device, before we fill in the device descriptors.

And some devices (mlx4) run XDP on the normal RX queue, and if the verdict
is PASS, an skb is constructed and sent up the stack.


> For the skb reserved queues, this depends on the 'anything'. If the 
> rest of the
> layers touch (or rather write) into that area, then we'll again gave 
> to sync.
> If we know that the data has not been altered though, we can hand them 
> back to
> the device skipping that sync right?

Sure, but this is also true for eBPF programs.  How would the driver know
that the data has not been altered / compacted by the upper layers?
Jesper Dangaard Brouer Nov. 14, 2019, 9:43 p.m. UTC | #9
On Thu, 14 Nov 2019 13:04:26 -0800
"Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:

> On 14 Nov 2019, at 12:42, Ilias Apalodimas wrote:
> 
> > Hi Jonathan,
> >
> > On Thu, Nov 14, 2019 at 12:27:40PM -0800, Jonathan Lemon wrote:  
> >>
> >>
> >> On 14 Nov 2019, at 10:53, Ilias Apalodimas wrote:
> >>  
> >>> [...]  
> >>>>> index 2cbcdbdec254..defbfd90ab46 100644
> >>>>> --- a/include/net/page_pool.h
> >>>>> +++ b/include/net/page_pool.h
> >>>>> @@ -65,6 +65,9 @@ struct page_pool_params {
> >>>>>  	int		nid;  /* Numa node id to allocate from pages from */
> >>>>>  	struct device	*dev; /* device, for DMA pre-mapping purposes */
> >>>>>  	enum dma_data_direction dma_dir; /* DMA mapping direction */
> >>>>> +	unsigned int	max_len; /* max DMA sync memory size */
> >>>>> +	unsigned int	offset;  /* DMA addr offset */
> >>>>> +	u8 sync;
> >>>>>  };  
> >>>>
> >>>> How about using PP_FLAG_DMA_SYNC instead of another flag word?
> >>>> (then it can also be gated on having DMA_MAP enabled)  
> >>>
> >>> You mean instead of the u8?
> >>> As you pointed out on your V2 comment of the mail, some cards don't 
> >>> sync back to device.
> >>>
> >>> As the API tries to be generic a u8 was choosen instead of a flag
> >>> to cover these use cases. So in time we'll change the semantics of
> >>> this to 'always sync', 'dont sync if it's an skb-only queue' etc.
> >>>
> >>> The first case Lorenzo covered is sync the required len only instead 
> >>> of the full buffer  
> >>
> >> Yes, I meant instead of:
> >> +		.sync = 1,
> >>
> >> Something like:
> >>         .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC
> >>

I actually agree and think we could use a flag. I suggest
PP_FLAG_DMA_SYNC_DEV to indicate that this is DMA-sync-for-device.

Ilias, notice the change I requested from Lorenzo: the dma_sync_size
default value is 0xFFFFFFFF (-1).  That makes dma_sync_size==0 a valid
value, which you can use in the cases where you know that nobody has
written into the data area.  This allows us to selectively choose it for
these cases.
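
As a rough sketch of how that could look from a driver (assuming the put
helpers also gain a dma_sync_size argument, which is not in this posting):

struct page_pool_params pp_params = {
	/* order, pool_size, nid, dev, dma_dir set up as before */
	.flags	 = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
	.offset	 = XDP_PACKET_HEADROOM,
	.max_len = PAGE_SIZE - XDP_PACKET_HEADROOM,
};

Then dma_sync_size == 0 at recycle time means "nobody wrote here, skip the
sync-for-device", while the -1 default keeps the "sync up to
pool->p.max_len" behaviour.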
Ilias Apalodimas Nov. 15, 2019, 7:05 a.m. UTC | #10
On Thu, Nov 14, 2019 at 10:43:09PM +0100, Jesper Dangaard Brouer wrote:
> On Thu, 14 Nov 2019 13:04:26 -0800
> "Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:
> 
> > On 14 Nov 2019, at 12:42, Ilias Apalodimas wrote:
> > 
> > > Hi Jonathan,
> > >
> > > On Thu, Nov 14, 2019 at 12:27:40PM -0800, Jonathan Lemon wrote:  
> > >>
> > >>
> > >> On 14 Nov 2019, at 10:53, Ilias Apalodimas wrote:
> > >>  
> > >>> [...]  
> > >>>>> index 2cbcdbdec254..defbfd90ab46 100644
> > >>>>> --- a/include/net/page_pool.h
> > >>>>> +++ b/include/net/page_pool.h
> > >>>>> @@ -65,6 +65,9 @@ struct page_pool_params {
> > >>>>>  	int		nid;  /* Numa node id to allocate from pages from */
> > >>>>>  	struct device	*dev; /* device, for DMA pre-mapping purposes */
> > >>>>>  	enum dma_data_direction dma_dir; /* DMA mapping direction */
> > >>>>> +	unsigned int	max_len; /* max DMA sync memory size */
> > >>>>> +	unsigned int	offset;  /* DMA addr offset */
> > >>>>> +	u8 sync;
> > >>>>>  };  
> > >>>>
> > >>>> How about using PP_FLAG_DMA_SYNC instead of another flag word?
> > >>>> (then it can also be gated on having DMA_MAP enabled)  
> > >>>
> > >>> You mean instead of the u8?
> > >>> As you pointed out on your V2 comment of the mail, some cards don't 
> > >>> sync back to device.
> > >>>
> > >>> As the API tries to be generic a u8 was choosen instead of a flag
> > >>> to cover these use cases. So in time we'll change the semantics of
> > >>> this to 'always sync', 'dont sync if it's an skb-only queue' etc.
> > >>>
> > >>> The first case Lorenzo covered is sync the required len only instead 
> > >>> of the full buffer  
> > >>
> > >> Yes, I meant instead of:
> > >> +		.sync = 1,
> > >>
> > >> Something like:
> > >>         .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC
> > >>
> 
> I actually agree and think we could use a flag. I suggest
> PP_FLAG_DMA_SYNC_DEV to indicate that this DMA-sync-for-device.
> 
> Ilias notice that the change I requested to Lorenzo, that dma_sync_size
> default value is 0xFFFFFFFF (-1).  That makes dma_sync_size==0 a valid
> value, which you can use in the cases, where you know that nobody have
> written into the data-area.  This allow us to selectively choose it for
> these cases.

Okay, then I guess the flag is a better fit for this.
The only difference would be that the sync semantics will be decided on a 'per
packet' basis instead of a 'per pool' basis, but that should be fine for our cases.

Cheers
/Ilias
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
>
Ilias Apalodimas Nov. 15, 2019, 7:17 a.m. UTC | #11
Hi Jonathan, 

> 
[...]
> > For the skb reserved queues, this depends on the 'anything'. If the rest
> > of the
> > layers touch (or rather write) into that area, then we'll again gave to
> > sync.
> > If we know that the data has not been altered though, we can hand them
> > back to
> > the device skipping that sync right?
> 
> Sure, but this is also true for eBPF programs.  How would the driver know
> that
> the data has not been altered / compacted by the upper layers?

I haven't looked that up in detail. I was just explaining the reasoning behind
picking a u8 instead of a flag. As Jesper pointed out, we can get the same result
by using len = 0, so I am fine with the proposed change.

Thanks
/Ilias
> -- 
> Jonathan
Lorenzo Bianconi Nov. 15, 2019, 7:49 a.m. UTC | #12
> On Thu, Nov 14, 2019 at 10:43:09PM +0100, Jesper Dangaard Brouer wrote:
> > On Thu, 14 Nov 2019 13:04:26 -0800
> > "Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:
> > 
> > > On 14 Nov 2019, at 12:42, Ilias Apalodimas wrote:
> > > 
> > > > Hi Jonathan,
> > > >
> > > > On Thu, Nov 14, 2019 at 12:27:40PM -0800, Jonathan Lemon wrote:  
> > > >>
> > > >>
> > > >> On 14 Nov 2019, at 10:53, Ilias Apalodimas wrote:
> > > >>  
> > > >>> [...]  
> > > >>>>> index 2cbcdbdec254..defbfd90ab46 100644
> > > >>>>> --- a/include/net/page_pool.h
> > > >>>>> +++ b/include/net/page_pool.h
> > > >>>>> @@ -65,6 +65,9 @@ struct page_pool_params {
> > > >>>>>  	int		nid;  /* Numa node id to allocate from pages from */
> > > >>>>>  	struct device	*dev; /* device, for DMA pre-mapping purposes */
> > > >>>>>  	enum dma_data_direction dma_dir; /* DMA mapping direction */
> > > >>>>> +	unsigned int	max_len; /* max DMA sync memory size */
> > > >>>>> +	unsigned int	offset;  /* DMA addr offset */
> > > >>>>> +	u8 sync;
> > > >>>>>  };  
> > > >>>>
> > > >>>> How about using PP_FLAG_DMA_SYNC instead of another flag word?
> > > >>>> (then it can also be gated on having DMA_MAP enabled)  
> > > >>>
> > > >>> You mean instead of the u8?
> > > >>> As you pointed out on your V2 comment of the mail, some cards don't 
> > > >>> sync back to device.
> > > >>>
> > > >>> As the API tries to be generic a u8 was choosen instead of a flag
> > > >>> to cover these use cases. So in time we'll change the semantics of
> > > >>> this to 'always sync', 'dont sync if it's an skb-only queue' etc.
> > > >>>
> > > >>> The first case Lorenzo covered is sync the required len only instead 
> > > >>> of the full buffer  
> > > >>
> > > >> Yes, I meant instead of:
> > > >> +		.sync = 1,
> > > >>
> > > >> Something like:
> > > >>         .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC
> > > >>
> > 
> > I actually agree and think we could use a flag. I suggest
> > PP_FLAG_DMA_SYNC_DEV to indicate that this DMA-sync-for-device.
> > 
> > Ilias notice that the change I requested to Lorenzo, that dma_sync_size
> > default value is 0xFFFFFFFF (-1).  That makes dma_sync_size==0 a valid
> > value, which you can use in the cases, where you know that nobody have
> > written into the data-area.  This allow us to selectively choose it for
> > these cases.
> 
> Okay, then i guess the flag is a better fit for this.
> The only difference would be that the sync semantics will be done on 'per
> packet' basis,  instead of 'per pool', but that should be fine for our cases.

Ack, fine for me.
Do you think when checking for PP_FLAG_DMA_SYNC_DEV we should even verify
PP_FLAG_DMA_MAP? Something like:

if ((pool->p.flags & (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)) ==
    (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV))
	page_pool_dma_sync_for_device();

Regards,
Lorenzo

> 
> Cheers
> /Ilias
> > 
> > -- 
> > Best regards,
> >   Jesper Dangaard Brouer
> >   MSc.CS, Principal Kernel Engineer at Red Hat
> >   LinkedIn: http://www.linkedin.com/in/brouer
> > 
>
Ilias Apalodimas Nov. 15, 2019, 8:03 a.m. UTC | #13
Hi Lorenzo, 

> > > > >>>> How about using PP_FLAG_DMA_SYNC instead of another flag word?
> > > > >>>> (then it can also be gated on having DMA_MAP enabled)  
> > > > >>>
> > > > >>> You mean instead of the u8?
> > > > >>> As you pointed out on your V2 comment of the mail, some cards don't 
> > > > >>> sync back to device.
> > > > >>>
> > > > >>> As the API tries to be generic a u8 was choosen instead of a flag
> > > > >>> to cover these use cases. So in time we'll change the semantics of
> > > > >>> this to 'always sync', 'dont sync if it's an skb-only queue' etc.
> > > > >>>
> > > > >>> The first case Lorenzo covered is sync the required len only instead 
> > > > >>> of the full buffer  
> > > > >>
> > > > >> Yes, I meant instead of:
> > > > >> +		.sync = 1,
> > > > >>
> > > > >> Something like:
> > > > >>         .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC
> > > > >>
> > > 
> > > I actually agree and think we could use a flag. I suggest
> > > PP_FLAG_DMA_SYNC_DEV to indicate that this DMA-sync-for-device.
> > > 
> > > Ilias notice that the change I requested to Lorenzo, that dma_sync_size
> > > default value is 0xFFFFFFFF (-1).  That makes dma_sync_size==0 a valid
> > > value, which you can use in the cases, where you know that nobody have
> > > written into the data-area.  This allow us to selectively choose it for
> > > these cases.
> > 
> > Okay, then i guess the flag is a better fit for this.
> > The only difference would be that the sync semantics will be done on 'per
> > packet' basis,  instead of 'per pool', but that should be fine for our cases.
> 
> Ack, fine for me.
> Do you think when checking for PP_FLAG_DMA_SYNC_DEV we should even verify
> PP_FLAG_DMA_MAP? Something like:
> 
> if ((pool->p.flags & (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)) ==
>     (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV))
> 	page_pool_dma_sync_for_device();
> 
> Regards,
> Lorenzo

I think it's better to do the check once at pool registration and maybe
refuse to allocate the pool? Syncing without mapping doesn't really make sense.
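
Something along these lines in page_pool_init() (rough sketch, untested):

if (params->flags & PP_FLAG_DMA_SYNC_DEV) {
	/* syncing only makes sense if page_pool owns the mapping */
	if (!(params->flags & PP_FLAG_DMA_MAP))
		return -EINVAL;
	/* a zero max_len would make every sync a no-op */
	if (!params->max_len)
		return -EINVAL;
}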

Cheers
/Ilias
> 
> > 
> > Cheers
> > /Ilias
> > > 
> > > -- 
> > > Best regards,
> > >   Jesper Dangaard Brouer
> > >   MSc.CS, Principal Kernel Engineer at Red Hat
> > >   LinkedIn: http://www.linkedin.com/in/brouer
> > > 
> >
Jonathan Lemon Nov. 15, 2019, 4:47 p.m. UTC | #14
On 15 Nov 2019, at 0:03, Ilias Apalodimas wrote:

> Hi Lorenzo,
>
>>>>>>>>> How about using PP_FLAG_DMA_SYNC instead of another flag word?
>>>>>>>>> (then it can also be gated on having DMA_MAP enabled)
>>>>>>>>
>>>>>>>> You mean instead of the u8?
>>>>>>>> As you pointed out on your V2 comment of the mail, some cards 
>>>>>>>> don't
>>>>>>>> sync back to device.
>>>>>>>>
>>>>>>>> As the API tries to be generic a u8 was choosen instead of a 
>>>>>>>> flag
>>>>>>>> to cover these use cases. So in time we'll change the semantics 
>>>>>>>> of
>>>>>>>> this to 'always sync', 'dont sync if it's an skb-only queue' 
>>>>>>>> etc.
>>>>>>>>
>>>>>>>> The first case Lorenzo covered is sync the required len only 
>>>>>>>> instead
>>>>>>>> of the full buffer
>>>>>>>
>>>>>>> Yes, I meant instead of:
>>>>>>> +		.sync = 1,
>>>>>>>
>>>>>>> Something like:
>>>>>>>         .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC
>>>>>>>
>>>>
>>>> I actually agree and think we could use a flag. I suggest
>>>> PP_FLAG_DMA_SYNC_DEV to indicate that this DMA-sync-for-device.
>>>>
>>>> Ilias notice that the change I requested to Lorenzo, that 
>>>> dma_sync_size
>>>> default value is 0xFFFFFFFF (-1).  That makes dma_sync_size==0 a 
>>>> valid
>>>> value, which you can use in the cases, where you know that nobody 
>>>> have
>>>> written into the data-area.  This allow us to selectively choose it 
>>>> for
>>>> these cases.
>>>
>>> Okay, then i guess the flag is a better fit for this.
>>> The only difference would be that the sync semantics will be done on 
>>> 'per
>>> packet' basis,  instead of 'per pool', but that should be fine for 
>>> our cases.
>>
>> Ack, fine for me.
>> Do you think when checking for PP_FLAG_DMA_SYNC_DEV we should even 
>> verify
>> PP_FLAG_DMA_MAP? Something like:
>>
>> if ((pool->p.flags & (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)) ==
>>     (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV))
>> 	page_pool_dma_sync_for_device();
>>
>> Regards,
>> Lorenzo
>
> I think it's better to do the check once on the pool registration and 
> maybe
> refuse to allocate the pool? Syncing without mapping doesn't really 
> make sense

+1.
Lorenzo Bianconi Nov. 15, 2019, 4:53 p.m. UTC | #15
[...]

> > > > 
> > > > Okay, then i guess the flag is a better fit for this.
> > > > The only difference would be that the sync semantics will be
> > > > done on 'per
> > > > packet' basis,  instead of 'per pool', but that should be fine
> > > > for our cases.
> > > 
> > > Ack, fine for me.
> > > Do you think when checking for PP_FLAG_DMA_SYNC_DEV we should even
> > > verify
> > > PP_FLAG_DMA_MAP? Something like:
> > > 
> > > if ((pool->p.flags & (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)) ==
> > >     (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV))
> > > 	page_pool_dma_sync_for_device();
> > > 
> > > Regards,
> > > Lorenzo
> > 
> > I think it's better to do the check once on the pool registration and
> > maybe
> > refuse to allocate the pool? Syncing without mapping doesn't really make
> > sense
> 
> +1.

ack, will post v3 soon.

Regards,
Lorenzo

> -- 
> Jonathan

Patch

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 2cbcdbdec254..defbfd90ab46 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -65,6 +65,9 @@  struct page_pool_params {
 	int		nid;  /* Numa node id to allocate from pages from */
 	struct device	*dev; /* device, for DMA pre-mapping purposes */
 	enum dma_data_direction dma_dir; /* DMA mapping direction */
+	unsigned int	max_len; /* max DMA sync memory size */
+	unsigned int	offset;  /* DMA addr offset */
+	u8 sync;
 };
 
 struct page_pool {
@@ -150,8 +153,8 @@  static inline void page_pool_destroy(struct page_pool *pool)
 }
 
 /* Never call this directly, use helpers below */
-void __page_pool_put_page(struct page_pool *pool,
-			  struct page *page, bool allow_direct);
+void __page_pool_put_page(struct page_pool *pool, struct page *page,
+			  unsigned int dma_sync_size, bool allow_direct);
 
 static inline void page_pool_put_page(struct page_pool *pool,
 				      struct page *page, bool allow_direct)
@@ -160,14 +163,14 @@  static inline void page_pool_put_page(struct page_pool *pool,
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	__page_pool_put_page(pool, page, allow_direct);
+	__page_pool_put_page(pool, page, 0, allow_direct);
 #endif
 }
 /* Very limited use-cases allow recycle direct */
 static inline void page_pool_recycle_direct(struct page_pool *pool,
 					    struct page *page)
 {
-	__page_pool_put_page(pool, page, true);
+	__page_pool_put_page(pool, page, 0, true);
 }
 
 /* API user MUST have disconnected alloc-side (not allowed to call
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5bc65587f1c4..af9514c2d15b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -112,6 +112,17 @@  static struct page *__page_pool_get_cached(struct page_pool *pool)
 	return page;
 }
 
+/* Used for non-coherent devices */
+static void page_pool_dma_sync_for_device(struct page_pool *pool,
+					  struct page *page,
+					  unsigned int dma_sync_size)
+{
+	dma_sync_size = min(dma_sync_size, pool->p.max_len);
+	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+					 pool->p.offset, dma_sync_size,
+					 pool->p.dma_dir);
+}
+
 /* slow path */
 noinline
 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
@@ -156,6 +167,10 @@  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	}
 	page->dma_addr = dma;
 
+	/* non-coherent devices - flush memory */
+	if (pool->p.sync)
+		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+
 skip_dma_map:
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
@@ -255,7 +270,8 @@  static void __page_pool_return_page(struct page_pool *pool, struct page *page)
 }
 
 static bool __page_pool_recycle_into_ring(struct page_pool *pool,
-				   struct page *page)
+					  struct page *page,
+					  unsigned int dma_sync_size)
 {
 	int ret;
 	/* BH protection not needed if current is serving softirq */
@@ -264,6 +280,10 @@  static bool __page_pool_recycle_into_ring(struct page_pool *pool,
 	else
 		ret = ptr_ring_produce_bh(&pool->ring, page);
 
+	/* non-coherent devices - flush memory */
+	if (ret == 0 && pool->p.sync)
+		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
+
 	return (ret == 0) ? true : false;
 }
 
@@ -273,18 +293,23 @@  static bool __page_pool_recycle_into_ring(struct page_pool *pool,
  * Caller must provide appropriate safe context.
  */
 static bool __page_pool_recycle_direct(struct page *page,
-				       struct page_pool *pool)
+				       struct page_pool *pool,
+				       unsigned int dma_sync_size)
 {
 	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
 		return false;
 
 	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
 	pool->alloc.cache[pool->alloc.count++] = page;
+
+	/* non-coherent devices - flush memory */
+	if (pool->p.sync)
+		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
 	return true;
 }
 
-void __page_pool_put_page(struct page_pool *pool,
-			  struct page *page, bool allow_direct)
+void __page_pool_put_page(struct page_pool *pool, struct page *page,
+			  unsigned int dma_sync_size, bool allow_direct)
 {
 	/* This allocator is optimized for the XDP mode that uses
 	 * one-frame-per-page, but have fallbacks that act like the
@@ -296,10 +321,12 @@  void __page_pool_put_page(struct page_pool *pool,
 		/* Read barrier done in page_ref_count / READ_ONCE */
 
 		if (allow_direct && in_serving_softirq())
-			if (__page_pool_recycle_direct(page, pool))
+			if (__page_pool_recycle_direct(page, pool,
+						       dma_sync_size))
 				return;
 
-		if (!__page_pool_recycle_into_ring(pool, page)) {
+		if (!__page_pool_recycle_into_ring(pool, page,
+						   dma_sync_size)) {
 			/* Cache full, fallback to free pages */
 			__page_pool_return_page(pool, page);
 		}