
[RFC,1/3] net: introduce kfree_skb_bulk() user of kmem_cache_free_bulk()

Message ID 20150904170046.4312.38018.stgit@devil
State RFC, archived
Delegated to: David Miller

Commit Message

Jesper Dangaard Brouer Sept. 4, 2015, 5 p.m. UTC
Introduce the first user of the SLAB bulk free API, kmem_cache_free_bulk(),
in the network stack, in the form of the function kfree_skb_bulk(), which
bulk frees SKBs (not skb clones or skb->head, yet).

As this is now the third user of SKB reference decrementing, split the
refcnt decrement out into a helper function and use it at all call points.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/skbuff.h |    1 +
 net/core/skbuff.c      |   87 +++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 71 insertions(+), 17 deletions(-)
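
For illustration, a minimal sketch of how a driver TX-completion path might
batch SKBs for the proposed kfree_skb_bulk() API. The ring structure, the two
helpers and the batch size are hypothetical, not code from this series:

#define EXAMPLE_BULK_MAX 32	/* arbitrary batch size, for illustration */

/* Hypothetical TX cleanup; example_tx_ring, example_tx_desc_done() and
 * example_tx_take_skb() only stand in for real driver code.
 */
static void example_clean_tx_ring(struct example_tx_ring *ring)
{
	struct sk_buff *bulk[EXAMPLE_BULK_MAX];
	unsigned int cnt = 0;

	while (example_tx_desc_done(ring)) {
		/* Take ownership of the completed SKB from the ring */
		bulk[cnt++] = example_tx_take_skb(ring);

		if (cnt == EXAMPLE_BULK_MAX) {
			kfree_skb_bulk(bulk, cnt);
			cnt = 0;
		}
	}
	if (cnt)
		kfree_skb_bulk(bulk, cnt);
}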


Comments

Tom Herbert Sept. 4, 2015, 6:47 p.m. UTC | #1
On Fri, Sep 4, 2015 at 10:00 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> Introduce the first user of SLAB bulk free API kmem_cache_free_bulk(),
> in the network stack in form of function kfree_skb_bulk() which bulk
> free SKBs (not skb clones or skb->head, yet).
>
> As this is the third user of SKB reference decrementing, split out
> refcnt decrement into helper function and use this in all call points.
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  include/linux/skbuff.h |    1 +
>  net/core/skbuff.c      |   87 +++++++++++++++++++++++++++++++++++++++---------
>  2 files changed, 71 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index b97597970ce7..e5f1e007723b 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -762,6 +762,7 @@ static inline struct rtable *skb_rtable(const struct sk_buff *skb)
>  }
>
>  void kfree_skb(struct sk_buff *skb);
> +void kfree_skb_bulk(struct sk_buff **skbs, unsigned int size);
>  void kfree_skb_list(struct sk_buff *segs);
>  void skb_tx_error(struct sk_buff *skb);
>  void consume_skb(struct sk_buff *skb);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 429b407b4fe6..034545934158 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -661,26 +661,83 @@ void __kfree_skb(struct sk_buff *skb)
>  }
>  EXPORT_SYMBOL(__kfree_skb);
>
> +/*
> + *     skb_dec_and_test - Helper to drop ref to SKB and see is ready to free
> + *     @skb: buffer to decrement reference
> + *
> + *     Drop a reference to the buffer, and return true if it is ready
> + *     to free. Which is if the usage count has hit zero or is equal to 1.
> + *
> + *     This is performance critical code that should be inlined.
> + */
> +static inline bool skb_dec_and_test(struct sk_buff *skb)
> +{
> +       if (unlikely(!skb))
> +               return false;
> +       if (likely(atomic_read(&skb->users) == 1))
> +               smp_rmb();
> +       else if (likely(!atomic_dec_and_test(&skb->users)))
> +               return false;
> +       /* If reaching here SKB is ready to free */
> +       return true;
> +}
> +
>  /**
>   *     kfree_skb - free an sk_buff
>   *     @skb: buffer to free
>   *
>   *     Drop a reference to the buffer and free it if the usage count has
> - *     hit zero.
> + *     hit zero or is equal to 1.
>   */
>  void kfree_skb(struct sk_buff *skb)
>  {
> -       if (unlikely(!skb))
> -               return;
> -       if (likely(atomic_read(&skb->users) == 1))
> -               smp_rmb();
> -       else if (likely(!atomic_dec_and_test(&skb->users)))
> -               return;
> -       trace_kfree_skb(skb, __builtin_return_address(0));
> -       __kfree_skb(skb);
> +       if (skb_dec_and_test(skb)) {
> +               trace_kfree_skb(skb, __builtin_return_address(0));
> +               __kfree_skb(skb);
> +       }
>  }
>  EXPORT_SYMBOL(kfree_skb);
>
> +/**
> + *     kfree_skb_bulk - bulk free SKBs when refcnt allows to
> + *     @skbs: array of SKBs to free
> + *     @size: number of SKBs in array
> + *
> + *     If SKB refcnt allows for free, then release any auxiliary data
> + *     and then bulk free SKBs to the SLAB allocator.
> + *
> + *     Note that interrupts must be enabled when calling this function.
> + */
> +void kfree_skb_bulk(struct sk_buff **skbs, unsigned int size)
> +{
Why not pass a list of skbs (e.g. using skb->next)?

> +       int i;
> +       size_t cnt = 0;
> +
> +       for (i = 0; i < size; i++) {
> +               struct sk_buff *skb = skbs[i];
> +
> +               if (!skb_dec_and_test(skb))
> +                       continue; /* skip skb, not ready to free */
> +
> +               /* Construct an array of SKBs, ready to be free'ed and
> +                * cleanup all auxiliary, before bulk free to SLAB.
> +                * For now, only handle non-cloned SKBs, related to
> +                * SLAB skbuff_head_cache
> +                */
> +               if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
> +                       skb_release_all(skb);
> +                       skbs[cnt++] = skb;
> +               } else {
> +                       /* SKB was a clone, don't handle this case */
> +                       __kfree_skb(skb);
> +               }
> +       }
> +       if (likely(cnt)) {
> +               kmem_cache_free_bulk(skbuff_head_cache, cnt, (void **) skbs);
> +       }
> +}
> +EXPORT_SYMBOL(kfree_skb_bulk);
> +
>  void kfree_skb_list(struct sk_buff *segs)
>  {
>         while (segs) {
> @@ -722,14 +779,10 @@ EXPORT_SYMBOL(skb_tx_error);
>   */
>  void consume_skb(struct sk_buff *skb)
>  {
> -       if (unlikely(!skb))
> -               return;
> -       if (likely(atomic_read(&skb->users) == 1))
> -               smp_rmb();
> -       else if (likely(!atomic_dec_and_test(&skb->users)))
> -               return;
> -       trace_consume_skb(skb);
> -       __kfree_skb(skb);
> +       if (skb_dec_and_test(skb)) {
> +               trace_consume_skb(skb);
> +               __kfree_skb(skb);
> +       }
>  }
>  EXPORT_SYMBOL(consume_skb);
>
>
Jesper Dangaard Brouer Sept. 7, 2015, 8:41 a.m. UTC | #2
On Fri, 4 Sep 2015 11:47:17 -0700 Tom Herbert <tom@herbertland.com> wrote:

> On Fri, Sep 4, 2015 at 10:00 AM, Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> > Introduce the first user of SLAB bulk free API kmem_cache_free_bulk(),
> > in the network stack in form of function kfree_skb_bulk() which bulk
> > free SKBs (not skb clones or skb->head, yet).
> >
[...]
> > +/**
> > + *     kfree_skb_bulk - bulk free SKBs when refcnt allows to
> > + *     @skbs: array of SKBs to free
> > + *     @size: number of SKBs in array
> > + *
> > + *     If SKB refcnt allows for free, then release any auxiliary data
> > + *     and then bulk free SKBs to the SLAB allocator.
> > + *
> > + *     Note that interrupts must be enabled when calling this function.
> > + */
> > +void kfree_skb_bulk(struct sk_buff **skbs, unsigned int size)
> > +{
>
> What not pass a list of skbs (e.g. using skb->next)?

Because the next layer, the slab API, needs an array:
  kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)

Look at the patch:
 [PATCH V2 3/3] slub: build detached freelist with look-ahead
 http://thread.gmane.org/gmane.linux.kernel.mm/137469/focus=137472

Where I use this array to progressively scan for objects belonging to
the same page.  (A subtle detail is that I manage to zero out the array,
which is good from a security/error-handling point of view, as pointers
to the objects are not left dangling on the stack).


I cannot argue that writing skb->next comes at an additional cost,
because the SLUB free also writes into this cacheline.  Perhaps the
SLAB allocator does not?
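
For illustration, a minimal sketch (not part of the patch) of what a
list-based entry point would end up doing anyway: walking the skb->next
chain into the on-stack array that kmem_cache_free_bulk() expects. Refcount
checks and clone handling are omitted, and the function name is made up:

/* Sketch only: assumes every skb on the list is non-cloned and already
 * determined ready to free, unlike the real kfree_skb_bulk().
 */
static void example_kfree_skb_list_bulk(struct sk_buff *list)
{
	void *objs[32];		/* arbitrary batch size */
	size_t cnt = 0;

	while (list) {
		struct sk_buff *skb = list;

		list = skb->next;
		skb->next = NULL;
		skb_release_all(skb);	/* release auxiliary data, as in the patch */
		objs[cnt++] = skb;

		if (cnt == ARRAY_SIZE(objs)) {
			kmem_cache_free_bulk(skbuff_head_cache, cnt, objs);
			cnt = 0;
		}
	}
	if (cnt)
		kmem_cache_free_bulk(skbuff_head_cache, cnt, objs);
}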

[...]
> > +       if (likely(cnt)) {
> > +               kmem_cache_free_bulk(skbuff_head_cache, cnt, (void **) skbs);
> > +       }
> > +}
> > +EXPORT_SYMBOL(kfree_skb_bulk);
Tom Herbert Sept. 7, 2015, 4:25 p.m. UTC | #3
>> What not pass a list of skbs (e.g. using skb->next)?
>
> Because the next layer, the slab API needs an array:
>   kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
>

I suppose we could ask the same question of that function. IMO
encouraging drivers to define arrays of pointers on the stack like
you're doing in the ixgbe patch is a bad direction.

In any case I believe it would be simpler on the networking side just
to maintain a list of skbs to free. Then the dev_free_waitlist
structure might not be needed, since we could just use an
sk_buff_head for that.
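
A minimal sketch of that alternative (illustrative function name, not
code from this thread), using the existing sk_buff_head queue helpers:

/* Producer side, e.g. in the TX-completion handler:
 *	__skb_queue_tail(waitlist, skb);
 */
static void example_flush_tx_waitlist(struct sk_buff_head *waitlist)
{
	struct sk_buff *skb;

	/* Drain the queue; a list-aware bulk free could slot in here
	 * instead of the per-skb kfree_skb() calls.
	 */
	while ((skb = __skb_dequeue(waitlist)) != NULL)
		kfree_skb(skb);
}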


Tom

> Look at the patch:
>  [PATCH V2 3/3] slub: build detached freelist with look-ahead
>  http://thread.gmane.org/gmane.linux.kernel.mm/137469/focus=137472
>
> Where I use this array to progressively scan for objects belonging to
> the same page.  (A subtle detail is I manage to zero out the array,
> which is good from a security/error-handling point of view, as pointers
> to the objects are not left dangling on the stack).
>
>
> I cannot argue that, writing skb->next comes as an additional cost,
> because the slUb free also writes into this cacheline.  Perhaps the
> slAb allocator does not?
>
> [...]
>> > +       if (likely(cnt)) {
>> > +               kmem_cache_free_bulk(skbuff_head_cache, cnt, (void **) skbs);
>> > +       }
>> > +}
>> > +EXPORT_SYMBOL(kfree_skb_bulk);
>
> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Sr. Network Kernel Developer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer
Jesper Dangaard Brouer Sept. 7, 2015, 8:14 p.m. UTC | #4
On Mon, 7 Sep 2015 09:25:49 -0700 Tom Herbert <tom@herbertland.com> wrote:

> >> What not pass a list of skbs (e.g. using skb->next)?
> >
> > Because the next layer, the slab API needs an array:
> >   kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
> >
> 
> I suppose we could ask the same question of that function. IMO
> encouraging drivers to define arrays of pointers on the stack like
> you're doing in the ixgbe patch is a bad direction.
> 
> In any case I believe this would be simpler in the networking side
> just to maintain a list of skb's to free. Then the dev_free_waitlist
> structure might not be needed then since we could just use a
> skb_buf_head for that.

I guess it is more natural for the network side to work with skb lists.
But I'm keeping the array interface for slab/slub, as that layer cannot
assume/enforce objects of a specific data type.

I worried about how large a bulk free we should allow, due to the
interaction with skb->destructor, which for sockets affects their memory
accounting. E.g. we have seen issues with hypervisor network drivers
(Xen and Hyper-V) that are so slow to clean up their TX completion queue
that their TCP bandwidth gets limited by tcp_limit_output_bytes.
I capped the bulk free at 32, and the NAPI budget will cap it at 64.


By the following argument, a bulk free of 64 objects/SKBs is not a problem.
The delay I'm introducing before the first real kfree_skb() is called,
which invokes the destructor that frees up socket memory accounting,
is very small.

Assume a measured packet rate of: 2105011 pps
Time between packets (1/2105011*10^9): 475 ns

Perf shows ixgbe_clean_tx_irq() takes: 1.23%
Extrapolating, the per-packet cost of that function: 5.84 ns (475*(1.23/100))

Processing 64 packets in ixgbe_clean_tx_irq() then takes: 373 ns (64*5.84).
At 10Gbit/s only 466 bytes can arrive in that period
((373/10^9)*(10000*10^6)/8).
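
The same back-of-envelope arithmetic as a small standalone program
(illustration only):

#include <stdio.h>

int main(void)
{
	double pps      = 2105011.0;
	double pkt_ns   = 1e9 / pps;			/* ~475 ns between packets */
	double call_ns  = pkt_ns * (1.23 / 100.0);	/* ~5.84 ns per packet */
	double batch_ns = 64 * call_ns;			/* ~373 ns for 64 packets */
	double bytes    = batch_ns / 1e9 * (10000e6 / 8.0); /* ~466 bytes at 10Gbit/s */

	printf("%.0f ns  %.2f ns  %.0f ns  %.0f bytes\n",
	       pkt_ns, call_ns, batch_ns, bytes);
	return 0;
}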
David Miller Sept. 8, 2015, 9:01 p.m. UTC | #5
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Fri, 04 Sep 2015 19:00:53 +0200

> +/**
> + *	kfree_skb_bulk - bulk free SKBs when refcnt allows to
> + *	@skbs: array of SKBs to free
> + *	@size: number of SKBs in array
> + *
> + *	If SKB refcnt allows for free, then release any auxiliary data
> + *	and then bulk free SKBs to the SLAB allocator.
> + *
> + *	Note that interrupts must be enabled when calling this function.
> + */
> +void kfree_skb_bulk(struct sk_buff **skbs, unsigned int size)
> +{
> +	int i;
> +	size_t cnt = 0;
> +
> +	for (i = 0; i < size; i++) {
> +		struct sk_buff *skb = skbs[i];
> +
> +		if (!skb_dec_and_test(skb))
> +			continue; /* skip skb, not ready to free */
> +
> +		/* Construct an array of SKBs, ready to be free'ed and
> +		 * cleanup all auxiliary, before bulk free to SLAB.
> +		 * For now, only handle non-cloned SKBs, related to
> +		 * SLAB skbuff_head_cache
> +		 */
> +		if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
> +			skb_release_all(skb);
> +			skbs[cnt++] = skb;
> +		} else {
> +			/* SKB was a clone, don't handle this case */
> +			__kfree_skb(skb);
> +		}
> +	}
> +	if (likely(cnt)) {
> +		kmem_cache_free_bulk(skbuff_head_cache, cnt, (void **) skbs);
> +	}
> +}

You're going to have to do a trace_kfree_skb() or trace_consume_skb() for
these things.
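
A minimal sketch of one way to address this (not a posted revision):
emit a tracepoint per SKB inside the kfree_skb_bulk() loop, before the
SKB is queued for bulk freeing, so the loop body would become roughly:

		if (!skb_dec_and_test(skb))
			continue; /* skip skb, not ready to free */

		if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
			/* Assuming consume semantics here; the drop path
			 * would use trace_kfree_skb() with a call-site
			 * address instead.
			 */
			trace_consume_skb(skb);
			skb_release_all(skb);
			skbs[cnt++] = skb;
		} else {
			trace_consume_skb(skb);
			__kfree_skb(skb);	/* clone fallback */
		}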

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b97597970ce7..e5f1e007723b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -762,6 +762,7 @@  static inline struct rtable *skb_rtable(const struct sk_buff *skb)
 }
 
 void kfree_skb(struct sk_buff *skb);
+void kfree_skb_bulk(struct sk_buff **skbs, unsigned int size);
 void kfree_skb_list(struct sk_buff *segs);
 void skb_tx_error(struct sk_buff *skb);
 void consume_skb(struct sk_buff *skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 429b407b4fe6..034545934158 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -661,26 +661,83 @@  void __kfree_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(__kfree_skb);
 
+/*
+ *	skb_dec_and_test - Helper to drop ref to SKB and see if it is ready to free
+ *	@skb: buffer to decrement reference
+ *
+ *	Drop a reference to the buffer, and return true if it is ready
+ *	to free, i.e. if the usage count has hit zero or is equal to 1.
+ *
+ *	This is performance critical code that should be inlined.
+ */
+static inline bool skb_dec_and_test(struct sk_buff *skb)
+{
+	if (unlikely(!skb))
+		return false;
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return false;
+	/* If reaching here SKB is ready to free */
+	return true;
+}
+
 /**
  *	kfree_skb - free an sk_buff
  *	@skb: buffer to free
  *
  *	Drop a reference to the buffer and free it if the usage count has
- *	hit zero.
+ *	hit zero or is equal to 1.
  */
 void kfree_skb(struct sk_buff *skb)
 {
-	if (unlikely(!skb))
-		return;
-	if (likely(atomic_read(&skb->users) == 1))
-		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
-		return;
-	trace_kfree_skb(skb, __builtin_return_address(0));
-	__kfree_skb(skb);
+	if (skb_dec_and_test(skb)) {
+		trace_kfree_skb(skb, __builtin_return_address(0));
+		__kfree_skb(skb);
+	}
 }
 EXPORT_SYMBOL(kfree_skb);
 
+/**
+ *	kfree_skb_bulk - bulk free SKBs when refcnt allows to
+ *	@skbs: array of SKBs to free
+ *	@size: number of SKBs in array
+ *
+ *	If SKB refcnt allows for free, then release any auxiliary data
+ *	and then bulk free SKBs to the SLAB allocator.
+ *
+ *	Note that interrupts must be enabled when calling this function.
+ */
+void kfree_skb_bulk(struct sk_buff **skbs, unsigned int size)
+{
+	int i;
+	size_t cnt = 0;
+
+	for (i = 0; i < size; i++) {
+		struct sk_buff *skb = skbs[i];
+
+		if (!skb_dec_and_test(skb))
+			continue; /* skip skb, not ready to free */
+
+		/* Construct an array of SKBs that are ready to be freed, and
+		 * clean up all auxiliary data, before bulk freeing to SLAB.
+		 * For now, only handle non-cloned SKBs belonging to the
+		 * SLAB skbuff_head_cache.
+		 */
+		if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
+			skb_release_all(skb);
+			skbs[cnt++] = skb;
+		} else {
+			/* SKB was a clone, don't handle this case */
+			__kfree_skb(skb);
+		}
+	}
+	if (likely(cnt)) {
+		kmem_cache_free_bulk(skbuff_head_cache, cnt, (void **) skbs);
+	}
+}
+EXPORT_SYMBOL(kfree_skb_bulk);
+
 void kfree_skb_list(struct sk_buff *segs)
 {
 	while (segs) {
@@ -722,14 +779,10 @@  EXPORT_SYMBOL(skb_tx_error);
  */
 void consume_skb(struct sk_buff *skb)
 {
-	if (unlikely(!skb))
-		return;
-	if (likely(atomic_read(&skb->users) == 1))
-		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
-		return;
-	trace_consume_skb(skb);
-	__kfree_skb(skb);
+	if (skb_dec_and_test(skb)) {
+		trace_consume_skb(skb);
+		__kfree_skb(skb);
+	}
 }
 EXPORT_SYMBOL(consume_skb);