
[net-next] net: use bigger pages in __netdev_alloc_frag

Message ID: 1348650402.5093.176.camel@edumazet-glaptop
State: Superseded, archived
Delegated to: David Miller

Commit Message

Eric Dumazet Sept. 26, 2012, 9:06 a.m. UTC
From: Eric Dumazet <edumazet@google.com>

We currently use per-cpu order-0 pages in __netdev_alloc_frag()
to deliver the fragments used by __netdev_alloc_skb().

Depending on the NIC driver and on whether the arch is 32- or 64-bit, this
allows a page to be split into several fragments (between 1 and 8),
assuming PAGE_SIZE=4096.

Switching to bigger pages (32768 bytes for the PAGE_SIZE=4096 case) allows:

- Better filling of space (the ending hole overhead is less of an issue);
  see the worked example after this list.

- Fewer calls to the page allocator and fewer accesses to page->_count

- Could allow future struct skb_shared_info changes without a major
  performance impact.
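
As a rough worked example (the 2176-byte figure is a made-up but plausible
fragsz; the real value depends on NET_SKB_PAD, SKB_DATA_ALIGN() and
sizeof(struct skb_shared_info)):

   4096 / 2176  ->  1 fragment,  ~1920 bytes (~47%)  lost to the ending hole
  32768 / 2176  -> 15 fragments,  ~128 bytes (~0.4%) lost to the ending hole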

This patch implements a transparent fallback to smaller
pages in case of memory pressure.

It also uses a standard "struct page_frag" instead of a custom one.
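
For reference, "struct page_frag" is the small descriptor declared in
include/linux/types.h; it is quoted here from memory, so check the header
for the authoritative layout:

struct page_frag {
	struct page *page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
	__u32 offset;
	__u32 size;
#else
	__u16 offset;
	__u16 size;
#endif
};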

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/core/skbuff.c |   46 ++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)




Comments

Benjamin LaHaise Sept. 26, 2012, 1:11 p.m. UTC | #1
Hello Eric,

On Wed, Sep 26, 2012 at 11:06:42AM +0200, Eric Dumazet wrote:
> +	int order;
...
> +		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
> +			gfp_t gfp = gfp_mask;
> +
> +			if (order)
> +				gfp |= __GFP_COMP | __GFP_NOWARN;
> +			nc->frag.page = alloc_pages(gfp, order);
> +			if (likely(nc->frag.page))
> +				break;
> +			if (--order <= 0)
> +				goto end;
> +		}

I think you probably intended the last if to be "if (--order < 0)", as 
otherwise the alloc will never be attempted with an order 0 page, which 
could harm systems suffering from extreme memory fragmentation.  Cheers,

		-ben
Eric Dumazet Sept. 26, 2012, 1:57 p.m. UTC | #2
On Wed, 2012-09-26 at 09:11 -0400, Benjamin LaHaise wrote:
> Hello Eric,
> 
> On Wed, Sep 26, 2012 at 11:06:42AM +0200, Eric Dumazet wrote:
> > +	int order;
> ...
> > +		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
> > +			gfp_t gfp = gfp_mask;
> > +
> > +			if (order)
> > +				gfp |= __GFP_COMP | __GFP_NOWARN;
> > +			nc->frag.page = alloc_pages(gfp, order);
> > +			if (likely(nc->frag.page))
> > +				break;
> > +			if (--order <= 0)
> > +				goto end;
> > +		}
> 
> I think you probably intended the last if to be "if (--order < 0)", as 
> otherwise the alloc will never be attempted with an order 0 page, which 
> could harm systems suffering from extreme memory fragmentation.  Cheers,
> 

Indeed, thanks Ben !
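
For reference, the fix amounts to letting the loop fall all the way back to
an order-0 page before giving up; a respun version of the refill loop would
presumably read (sketch based on this discussion, not the posted patch):

	for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
		gfp_t gfp = gfp_mask;

		if (order)
			gfp |= __GFP_COMP | __GFP_NOWARN;
		nc->frag.page = alloc_pages(gfp, order);
		if (likely(nc->frag.page))
			break;
		if (--order < 0)	/* now also tries an order-0 page */
			goto end;
	}
	nc->frag.size = PAGE_SIZE << order;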


Duyck, Alexander H Sept. 26, 2012, 4 p.m. UTC | #3
On 09/26/2012 02:06 AM, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> We currently use percpu order-0 pages in __netdev_alloc_frag
> to deliver fragments used by __netdev_alloc_skb()
>
> Depending on NIC driver and arch being 32 or 64 bit, it allows a page to
> be split in several fragments (between 1 and 8), assuming PAGE_SIZE=4096
>
> Switching to bigger pages (32768 bytes for PAGE_SIZE=4096 case) allows :
>
> - Better filling of space (the ending hole overhead is less an issue)
>
> - Less calls to page allocator or accesses to page->_count
>
> - Could allow struct skb_shared_info futures changes without major
>   performance impact.
>
> This patch implements a transparent fallback to smaller
> pages in case of memory pressure.
>
> It also uses a standard "struct page_frag" instead of a custom one.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
>  net/core/skbuff.c |   46 ++++++++++++++++++++++++++++----------------
>  1 file changed, 30 insertions(+), 16 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 2ede3cf..4ab83ce 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -340,43 +340,57 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
>  EXPORT_SYMBOL(build_skb);
>  
>  struct netdev_alloc_cache {
> -	struct page *page;
> -	unsigned int offset;
> -	unsigned int pagecnt_bias;
> +	struct page_frag	frag;
> +	/* we maintain a pagecount bias, so that we dont dirty cache line
> +	 * containing page->_count every time we allocate a fragment.
> +	 */
> +	unsigned int		pagecnt_bias;
>  };
>  static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
>  
> -#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
> +#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768)
> +#define NETDEV_FRAG_PAGE_MAX_SIZE  (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER)
> +#define NETDEV_PAGECNT_MAX_BIAS	   NETDEV_FRAG_PAGE_MAX_SIZE
>  
>  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
>  {
>  	struct netdev_alloc_cache *nc;
>  	void *data = NULL;
> +	int order;
>  	unsigned long flags;
>  
>  	local_irq_save(flags);
>  	nc = &__get_cpu_var(netdev_alloc_cache);
> -	if (unlikely(!nc->page)) {
> +	if (unlikely(!nc->frag.page)) {
>  refill:
> -		nc->page = alloc_page(gfp_mask);
> -		if (unlikely(!nc->page))
> -			goto end;
> +		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
> +			gfp_t gfp = gfp_mask;
> +
> +			if (order)
> +				gfp |= __GFP_COMP | __GFP_NOWARN;
> +			nc->frag.page = alloc_pages(gfp, order);
> +			if (likely(nc->frag.page))
> +				break;
> +			if (--order <= 0)
> +				goto end;
> +		}
> +		nc->frag.size = PAGE_SIZE << order;
>  recycle:
> -		atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS);
> -		nc->pagecnt_bias = NETDEV_PAGECNT_BIAS;
> -		nc->offset = 0;
> +		atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
> +		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
> +		nc->frag.offset = 0;
>  	}
>  
> -	if (nc->offset + fragsz > PAGE_SIZE) {
> +	if (nc->frag.offset + fragsz > nc->frag.size) {
>  		/* avoid unnecessary locked operations if possible */
> -		if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) ||
> -		    atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count))
> +		if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
> +		    atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
>  			goto recycle;
>  		goto refill;
>  	}
>  
> -	data = page_address(nc->page) + nc->offset;
> -	nc->offset += fragsz;
> +	data = page_address(nc->frag.page) + nc->frag.offset;
> +	nc->frag.offset += fragsz;
>  	nc->pagecnt_bias--;
>  end:
>  	local_irq_restore(flags);
>
>

One minor thought.  Instead of tracking offset and size why not just
work from the top down instead of the bottom up?

So instead of starting with the frag offset at 0, why not start it at
PAGE_SIZE << order, and then work your way down to 0.  That way you
don't need to track both size and offset.

Thanks,

Alex
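
A rough sketch of that top-down fast path (an editorial illustration of the
idea above, not code from the patch; it assumes the offset is handled as a
signed int and that a refill resets nc->frag.offset to the page size):

	int offset = (int)nc->frag.offset - (int)fragsz;

	if (offset < 0) {
		/* page exhausted: recycle or refill as in the patch,
		 * then retry with offset = page size - fragsz
		 */
	} else {
		data = page_address(nc->frag.page) + offset;
		nc->frag.offset = offset;
		nc->pagecnt_bias--;
	}

The catch, raised in the reply below, is that the refill/recycle path still
needs to know the page size in order to reset the offset.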

Eric Dumazet Sept. 26, 2012, 4:14 p.m. UTC | #4
On Wed, 2012-09-26 at 09:00 -0700, Alexander Duyck wrote:

> One minor thought.  Instead of tracking offset and size why not just
> work from the top down instead of the bottom up?
> 
> So instead of starting with the frag offset at 0, why not start it at
> PAGE_SIZE << order, and then work your way down to 0.  That way you
> don't need to track both size and offset.
> 

How do you refill then ? 

(ie setting xxx->offset back to PAGE_SIZE << order)

I am not sure we have direct access to a page order given a struct page
pointer.

I also like struct page_frag abstraction, because it might allow us
better code factorization with other frag allocators.

( skb_append_datato_frags(), sk_page_frag_refill(), ...)
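
To make the factorization idea concrete, a shared refill helper could look
roughly like this (hypothetical name and shape, sketched for illustration,
not something posted in this thread):

	static bool page_frag_refill(struct page_frag *pf, unsigned int max_order,
				     gfp_t gfp_mask)
	{
		int order;

		/* try the largest order first, fall back under memory pressure */
		for (order = max_order; order >= 0; order--) {
			gfp_t gfp = gfp_mask;

			if (order)
				gfp |= __GFP_COMP | __GFP_NOWARN;
			pf->page = alloc_pages(gfp, order);
			if (pf->page) {
				pf->size = PAGE_SIZE << order;
				pf->offset = 0;
				return true;
			}
		}
		return false;
	}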




Duyck, Alexander H Sept. 26, 2012, 4:36 p.m. UTC | #5
On 09/26/2012 09:14 AM, Eric Dumazet wrote:
> On Wed, 2012-09-26 at 09:00 -0700, Alexander Duyck wrote:
>
>> One minor thought.  Instead of tracking offset and size why not just
>> work from the top down instead of the bottom up?
>>
>> So instead of starting with the frag offset at 0, why not start it at
>> PAGE_SIZE << order, and then work your way down to 0.  That way you
>> don't need to track both size and offset.
>>
> How do you refill then ? 
>
> (ie setting xxx->offset back to PAGE_SIZE << order)
>
> I am not sure we have direct access to a page order given a struct page
> pointer.
>
> I also like struct page_frag abstraction, because it might allow us
> better code factorization with other frag allocators.
>
> ( skb_append_datato_frags(), sk_page_frag_refill(), ...)

I forgot about the page recycling portion of it.  I was still thinking
of the original implementation that had a fixed page order.  You are
correct, with the page order being dynamic you will have to store size
somewhere.

Thanks,

Alex





Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2ede3cf..4ab83ce 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -340,43 +340,57 @@  struct sk_buff *build_skb(void *data, unsigned int frag_size)
 EXPORT_SYMBOL(build_skb);
 
 struct netdev_alloc_cache {
-	struct page *page;
-	unsigned int offset;
-	unsigned int pagecnt_bias;
+	struct page_frag	frag;
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_count every time we allocate a fragment.
+	 */
+	unsigned int		pagecnt_bias;
 };
 static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
 
-#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
+#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768)
+#define NETDEV_FRAG_PAGE_MAX_SIZE  (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER)
+#define NETDEV_PAGECNT_MAX_BIAS	   NETDEV_FRAG_PAGE_MAX_SIZE
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
 	struct netdev_alloc_cache *nc;
 	void *data = NULL;
+	int order;
 	unsigned long flags;
 
 	local_irq_save(flags);
 	nc = &__get_cpu_var(netdev_alloc_cache);
-	if (unlikely(!nc->page)) {
+	if (unlikely(!nc->frag.page)) {
 refill:
-		nc->page = alloc_page(gfp_mask);
-		if (unlikely(!nc->page))
-			goto end;
+		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
+			gfp_t gfp = gfp_mask;
+
+			if (order)
+				gfp |= __GFP_COMP | __GFP_NOWARN;
+			nc->frag.page = alloc_pages(gfp, order);
+			if (likely(nc->frag.page))
+				break;
+			if (--order <= 0)
+				goto end;
+		}
+		nc->frag.size = PAGE_SIZE << order;
 recycle:
-		atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS);
-		nc->pagecnt_bias = NETDEV_PAGECNT_BIAS;
-		nc->offset = 0;
+		atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
+		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
+		nc->frag.offset = 0;
 	}
 
-	if (nc->offset + fragsz > PAGE_SIZE) {
+	if (nc->frag.offset + fragsz > nc->frag.size) {
 		/* avoid unnecessary locked operations if possible */
-		if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) ||
-		    atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count))
+		if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
+		    atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
 			goto recycle;
 		goto refill;
 	}
 
-	data = page_address(nc->page) + nc->offset;
-	nc->offset += fragsz;
+	data = page_address(nc->frag.page) + nc->frag.offset;
+	nc->frag.offset += fragsz;
 	nc->pagecnt_bias--;
 end:
 	local_irq_restore(flags);