
[RFC,v2,08/10] net: sched: pfifo_fast use alf_queue

Message ID 20160714062312.8270.65942.stgit@john-Precision-Tower-5810
State RFC, archived
Delegated to: David Miller

Commit Message

John Fastabend July 14, 2016, 6:23 a.m. UTC
This converts the pfifo_fast qdisc to use the alf_queue enqueue and
dequeue routines then sets the NOLOCK bit.

This also removes the logic used to pick the next band to dequeue from
and instead just checks each alf_queue for packets from top priority
to lowest. This might need to be a bit more clever but seems to work
for now.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 net/sched/sch_generic.c |  131 +++++++++++++++++++++++++++--------------------
 1 file changed, 75 insertions(+), 56 deletions(-)

Comments

Jesper Dangaard Brouer July 14, 2016, 3:11 p.m. UTC | #1
On Wed, 13 Jul 2016 23:23:12 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> This converts the pfifo_fast qdisc to use the alf_queue enqueue and
> dequeue routines then sets the NOLOCK bit.

Should have said skb_array, not alf_queue ;-)

> This also removes the logic used to pick the next band to dequeue from
> and instead just checks each alf_queue for packets from top priority
> to lowest. This might need to be a bit more clever but seems to work
> for now.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  net/sched/sch_generic.c |  131 +++++++++++++++++++++++++++--------------------
>  1 file changed, 75 insertions(+), 56 deletions(-)
> 
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 7dcd066..2ac3eb9 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -26,6 +26,7 @@
>  #include <linux/list.h>
>  #include <linux/slab.h>
>  #include <linux/if_vlan.h>
> +#include <linux/skb_array.h>
>  #include <net/sch_generic.h>
>  #include <net/pkt_sched.h>
>  #include <net/dst.h>
> @@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
>  
>  /*
>   * Private data for a pfifo_fast scheduler containing:
> - * 	- queues for the three band
> - * 	- bitmap indicating which of the bands contain skbs
> + *	- rings for priority bands
>   */
>  struct pfifo_fast_priv {
> -	u32 bitmap;
> -	struct sk_buff_head q[PFIFO_FAST_BANDS];
> +	struct skb_array q[PFIFO_FAST_BANDS];
>  };
>  
> -/*
> - * Convert a bitmap to the first band number where an skb is queued, where:
> - * 	bitmap=0 means there are no skbs on any band.
> - * 	bitmap=1 means there is an skb on band 0.
> - *	bitmap=7 means there are skbs on all 3 bands, etc.
> - */
> -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
> -
> -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
> -					     int band)
> +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
> +					  int band)
>  {
> -	return priv->q + band;
> +	return &priv->q[band];
>  }
>  
>  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
>  			      struct sk_buff **to_free)
>  {
> -	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
> -		int band = prio2band[skb->priority & TC_PRIO_MAX];
> -		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
> -		struct sk_buff_head *list = band2list(priv, band);
> -
> -		priv->bitmap |= (1 << band);
> -		qdisc->q.qlen++;
> -		return __qdisc_enqueue_tail(skb, qdisc, list);
> -	}
> +	int band = prio2band[skb->priority & TC_PRIO_MAX];
> +	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
> +	struct skb_array *q = band2list(priv, band);
> +	int err;
>  
> -	return qdisc_drop(skb, qdisc, to_free);
> +	err = skb_array_produce_bh(q, skb);

Do you need the _bh variant here?  (Doesn't the qdisc run with BH disabled?)


> +
> +	if (unlikely(err))
> +		return qdisc_drop_cpu(skb, qdisc, to_free);
> +
> +	qdisc_qstats_cpu_qlen_inc(qdisc);
> +	qdisc_qstats_cpu_backlog_inc(qdisc, skb);
> +	return NET_XMIT_SUCCESS;
>  }
>  
>  static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
>  {
>  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
> -	int band = bitmap2band[priv->bitmap];
> +	struct sk_buff *skb = NULL;
> +	int band;
>  
> -	if (likely(band >= 0)) {
> -		struct sk_buff_head *list = band2list(priv, band);
> -		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
> +	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
> +		struct skb_array *q = band2list(priv, band);
>  
> -		qdisc->q.qlen--;
> -		if (skb_queue_empty(list))
> -			priv->bitmap &= ~(1 << band);
> +		if (__skb_array_empty(q))
> +			continue;
>  
> -		return skb;
> +		skb = skb_array_consume_bh(q);

Also _bh variant.
Alexei Starovoitov July 14, 2016, 11:42 p.m. UTC | #2
On Wed, Jul 13, 2016 at 11:23:12PM -0700, John Fastabend wrote:
> This converts the pfifo_fast qdisc to use the alf_queue enqueue and
> dequeue routines then sets the NOLOCK bit.
> 
> This also removes the logic used to pick the next band to dequeue from
> and instead just checks each alf_queue for packets from top priority
> to lowest. This might need to be a bit more clever but seems to work
> for now.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  net/sched/sch_generic.c |  131 +++++++++++++++++++++++++++--------------------

>  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
>  			      struct sk_buff **to_free)
>  {
> -	return qdisc_drop(skb, qdisc, to_free);
> +	err = skb_array_produce_bh(q, skb);
..
>  static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
>  {
> +		skb = skb_array_consume_bh(q);

For this particular qdisc the performance gain should come from
granularity of spin_lock, right?
Before we were taking the lock much earlier. Here we keep the lock,
but for the very short time.
        original pps        lockless        diff
        1       1418168             1269450         -148718
        2       1587390             1553408         -33982
        4       1084961             1683639         +598678
        8       989636              1522723         +533087
        12      1014018             1348172         +334154
so perf for 1 cpu case is mainly due to array vs list,
since number of locks is still the same and there is no collision ?
but then why shorter lock give higher overhead in multi cpu cases?
That would have been the main target for performance improvement?

Looks like mq gets the most benefit, because it's lockless internally
which makes sense.
In general I think this is the right direction where tc infra should move to.
I'm only not sure whether it's worth converting pfifo to skb_array.
Probably alf queue would have been a better alternative.
John Fastabend July 15, 2016, 12:07 a.m. UTC | #3
On 16-07-14 04:42 PM, Alexei Starovoitov wrote:
> On Wed, Jul 13, 2016 at 11:23:12PM -0700, John Fastabend wrote:
>> This converts the pfifo_fast qdisc to use the alf_queue enqueue and
>> dequeue routines then sets the NOLOCK bit.
>>
>> This also removes the logic used to pick the next band to dequeue from
>> and instead just checks each alf_queue for packets from top priority
>> to lowest. This might need to be a bit more clever but seems to work
>> for now.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>  net/sched/sch_generic.c |  131 +++++++++++++++++++++++++++--------------------
> 
>>  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
>>  			      struct sk_buff **to_free)
>>  {
>> -	return qdisc_drop(skb, qdisc, to_free);
>> +	err = skb_array_produce_bh(q, skb);
> ..
>>  static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
>>  {
>> +		skb = skb_array_consume_bh(q);
> 
> For this particular qdisc the performance gain should come from
> granularity of spin_lock, right?

And the fact that the consumer and producer are using different
locks now.

> Before we were taking the lock much earlier. Here we keep the lock,
> but for the very short time.
>         original pps        lockless        diff
>         1       1418168             1269450         -148718
>         2       1587390             1553408         -33982
>         4       1084961             1683639         +598678
>         8       989636              1522723         +533087
>         12      1014018             1348172         +334154
> so perf for 1 cpu case is mainly due to array vs list,
> since number of locks is still the same and there is no collision ?
> but then why shorter lock give higher overhead in multi cpu cases?

So in this case running pfifo_fast as the root qdisc with 12 threads
means we have 12 producers hitting a single enqueue() path whereas with
mq and only looking at pktgen numbers we have one thread for each
skb_array.

> That would have been the main target for performance improvement?
> 

Maybe I should fire up a TCP test with 1000's of threads to see what
the perf numbers look like.

> Looks like mq gets the most benefit, because it's lockless internally
> which makes sense.
> In general I think this is the right direction where tc infra should move to.
> I'm only not sure whether it's worth converting pfifo to skb_array.
> Probably alf queue would have been a better alternative.
> 

Tomorrow's task is to resurrect the alf_queue and look at its numbers
compared to this. Today was spent trying to remove the HARD_TX_LOCK
that protects the driver, in the mq case it seems this is not really
needed either.

.John
John Fastabend July 15, 2016, 12:09 a.m. UTC | #4
On 16-07-14 08:11 AM, Jesper Dangaard Brouer wrote:
> On Wed, 13 Jul 2016 23:23:12 -0700
> John Fastabend <john.fastabend@gmail.com> wrote:
> 
>> This converts the pfifo_fast qdisc to use the alf_queue enqueue and
>> dequeue routines then sets the NOLOCK bit.
> 
> Should have said skb_array, not alf_queue ;-)

I think I'll get numbers for the alf_queue too so we can compare them
as well. On the todo list for tomorrow.

> 
>> This also removes the logic used to pick the next band to dequeue from
>> and instead just checks each alf_queue for packets from top priority
>> to lowest. This might need to be a bit more clever but seems to work
>> for now.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>  net/sched/sch_generic.c |  131 +++++++++++++++++++++++++++--------------------
>>  1 file changed, 75 insertions(+), 56 deletions(-)
>>
>> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
>> index 7dcd066..2ac3eb9 100644
>> --- a/net/sched/sch_generic.c
>> +++ b/net/sched/sch_generic.c
>> @@ -26,6 +26,7 @@
>>  #include <linux/list.h>
>>  #include <linux/slab.h>
>>  #include <linux/if_vlan.h>
>> +#include <linux/skb_array.h>
>>  #include <net/sch_generic.h>
>>  #include <net/pkt_sched.h>
>>  #include <net/dst.h>
>> @@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
>>  
>>  /*
>>   * Private data for a pfifo_fast scheduler containing:
>> - * 	- queues for the three band
>> - * 	- bitmap indicating which of the bands contain skbs
>> + *	- rings for priority bands
>>   */
>>  struct pfifo_fast_priv {
>> -	u32 bitmap;
>> -	struct sk_buff_head q[PFIFO_FAST_BANDS];
>> +	struct skb_array q[PFIFO_FAST_BANDS];
>>  };
>>  
>> -/*
>> - * Convert a bitmap to the first band number where an skb is queued, where:
>> - * 	bitmap=0 means there are no skbs on any band.
>> - * 	bitmap=1 means there is an skb on band 0.
>> - *	bitmap=7 means there are skbs on all 3 bands, etc.
>> - */
>> -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
>> -
>> -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
>> -					     int band)
>> +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
>> +					  int band)
>>  {
>> -	return priv->q + band;
>> +	return &priv->q[band];
>>  }
>>  
>>  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
>>  			      struct sk_buff **to_free)
>>  {
>> -	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
>> -		int band = prio2band[skb->priority & TC_PRIO_MAX];
>> -		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
>> -		struct sk_buff_head *list = band2list(priv, band);
>> -
>> -		priv->bitmap |= (1 << band);
>> -		qdisc->q.qlen++;
>> -		return __qdisc_enqueue_tail(skb, qdisc, list);
>> -	}
>> +	int band = prio2band[skb->priority & TC_PRIO_MAX];
>> +	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
>> +	struct skb_array *q = band2list(priv, band);
>> +	int err;
>>  
>> -	return qdisc_drop(skb, qdisc, to_free);
>> +	err = skb_array_produce_bh(q, skb);
> 
> Do you need the _bh variant here?  (Doesn't the qdisc run with BH disabled?)
> 
> 

Yep its inside rcu_read_lock_bh().
Jesper Dangaard Brouer July 15, 2016, 10:09 a.m. UTC | #5
On Thu, 14 Jul 2016 17:09:43 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> >>  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
> >>  			      struct sk_buff **to_free)
> >>  {
> >> -	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
> >> -		int band = prio2band[skb->priority & TC_PRIO_MAX];
> >> -		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
> >> -		struct sk_buff_head *list = band2list(priv, band);
> >> -
> >> -		priv->bitmap |= (1 << band);
> >> -		qdisc->q.qlen++;
> >> -		return __qdisc_enqueue_tail(skb, qdisc, list);
> >> -	}
> >> +	int band = prio2band[skb->priority & TC_PRIO_MAX];
> >> +	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
> >> +	struct skb_array *q = band2list(priv, band);
> >> +	int err;
> >>  
> >> -	return qdisc_drop(skb, qdisc, to_free);
> >> +	err = skb_array_produce_bh(q, skb);  
> > 
> > Do you need the _bh variant here?  (Doesn't the qdisc run with BH disabled?)
> > 
> >   
> 
> Yep its inside rcu_read_lock_bh().

The call rcu_read_lock_bh() already disabled BH (local_bh_disable()).
Thus, you can use the normal variants of skb_array_produce(), it is
(approx 20 cycles) faster than the _bh variant...
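
For illustration, the enqueue path with the plain variant would look
roughly like this (a sketch derived from the patch above, not a tested
respin):

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	int band = prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	int err;

	/* BH is already disabled by rcu_read_lock_bh() in the caller,
	 * so the plain produce variant is sufficient here.
	 */
	err = skb_array_produce(q, skb);
	if (unlikely(err))
		return qdisc_drop_cpu(skb, qdisc, to_free);

	qdisc_qstats_cpu_qlen_inc(qdisc);
	qdisc_qstats_cpu_backlog_inc(qdisc, skb);
	return NET_XMIT_SUCCESS;
}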
Jesper Dangaard Brouer July 15, 2016, 11:23 a.m. UTC | #6
On Thu, 14 Jul 2016 17:07:33 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> On 16-07-14 04:42 PM, Alexei Starovoitov wrote:
> > On Wed, Jul 13, 2016 at 11:23:12PM -0700, John Fastabend wrote:  
> >> This converts the pfifo_fast qdisc to use the alf_queue enqueue and
> >> dequeue routines then sets the NOLOCK bit.
> >>
> >> This also removes the logic used to pick the next band to dequeue from
> >> and instead just checks each alf_queue for packets from top priority
> >> to lowest. This might need to be a bit more clever but seems to work
> >> for now.
> >>
> >> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> >> ---
> >>  net/sched/sch_generic.c |  131 +++++++++++++++++++++++++++--------------------  
> >   
> >>  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
> >>  			      struct sk_buff **to_free)
> >>  {
> >> -	return qdisc_drop(skb, qdisc, to_free);
> >> +	err = skb_array_produce_bh(q, skb);  
> > ..  
> >>  static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
> >>  {
> >> +		skb = skb_array_consume_bh(q);  
> > 
> > For this particular qdisc the performance gain should come from
> > granularity of spin_lock, right?
> 
> And the fact that the consumer and producer are using different
> locks now.

Yes. Splitting up enqueue'ers (producers) from the dequeuer (consumer)
is an important step, because today the qdisc layer has the problem
that enqueue'ers can starve the single dequeuer.  The current
mitigation tricks are the enq busy_lock and bulk dequeue.

As John says, using skb_array causes producers and the consumer to use
different locks.
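
To make the lock split concrete: skb_array is a thin wrapper around
ptr_ring, whose state is laid out approximately like this (paraphrased
sketch; see include/linux/ptr_ring.h for the real thing):

/* Enqueue only takes producer_lock and dequeue only takes
 * consumer_lock, each on its own cache line, so producers never
 * contend with the dequeuer on a single qdisc lock.
 */
struct ptr_ring_sketch {
	int		producer ____cacheline_aligned_in_smp;
	spinlock_t	producer_lock;
	int		consumer ____cacheline_aligned_in_smp;
	spinlock_t	consumer_lock;
	/* Shared, read-mostly by both sides */
	int		size ____cacheline_aligned_in_smp;
	void		**queue;
};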

> > Before we were taking the lock much earlier. Here we keep the lock,
> > but for the very short time.
> >         original pps        lockless        diff
> >         1       1418168             1269450         -148718
> >         2       1587390             1553408         -33982
> >         4       1084961             1683639         +598678
> >         8       989636              1522723         +533087
> >         12      1014018             1348172         +334154
> >

It looks problematic that lockless is slower in the base single CPU
case, orig (1418168 pps) to lockless (1269450).  By approx
(1/1418168-1/1269450)*10^9 = -82.60 ns.  That base "regression" looks
too high to start with.

How I view the results:
Negative scaling starts early for "original", which is the main problem
that we are looking to solve.  The lockless does not show as much
scaling effect as expected, and starts to show a trend towards negative
scaling.

> > so perf for 1 cpu case is mainly due to array vs list,
> > since number of locks is still the same and there is no collision ?
> > but then why shorter lock give higher overhead in multi cpu cases?  
> 
> So in this case running pfifo_fast as the root qdisc with 12 threads
> means we have 12 producers hitting a single enqueue() path whereas with
> mq and only looking at pktgen numbers we have one thread for each
> skb_array.

The skb_array allows multi-producer multi-consumer (MPMC) use, but is
optimized towards the case of a single producer and a single consumer
(SPSC).  An SPSC queue is usually implemented with much less
synchronization, but then you need some other guarantee that only a
single producer/consumer can be running.

> > That would have been the main target for performance improvement?
> >   
> 
> Maybe I should fire up a TCP test with 1000's of threads to see what
> the perf numbers look like.
> 
> > Looks like mq gets the most benefit, because it's lockless internally
> > which makes sense.
> > In general I think this is the right direction where tc infra should move to.
> > I'm only not sure whether it's worth converting pfifo to skb_array.
> > Probably alf queue would have been a better alternative.
> >   
> 
> Tomorrow's task is to resurrect the alf_queue and look at its numbers
> compared to this. Today was spent trying to remove the HARD_TX_LOCK
> that protects the driver, in the mq case it seems this is not really
> needed either.

I would be very interested in the alf_queue numbers, as alf_queue
should perform better with multiple producers.  If you kept the single
TC dequeue'er rule, then you could use the MPSC variant of alf_queue.

My benchmarks from 2014 are avail in this presentation:
 http://people.netfilter.org/hawk/presentations/nfws2014/dp-accel-qdisc-lockless.pdf
Slide 12 also shows the mitigation effect of the bulk-dequeue patches
(but I would like to see some new benchmarks for alf_queue, slide 14).


But do notice, alf_queue has a design problem that skb_array solved.
Alf_queue has the problem of read-bouncing the remote/opposite CPU's
cache line, because it needs to (1) at enqueue (producer) check whether
there is free space (reading consumer.tail), and (2) at dequeue
(consumer) check whether any object is available (reading producer.tail).

The alf_queue mitigates this design problem by allowing bulk enqueue
and bulk dequeue. But I guess you will not use that "mode"; I think I
did in my NFWS2014 benchmarks.

The skb_array solved this design problem by using the content of the
queue slots as a marker for the free/full condition.  The alf_queue
cannot do so easily, because its bulk design needs to reserve part of
the queue.  Thus, we likely need some new queue design, a hybrid
between alf_queue and skb_array, for this use-case.
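
As a toy illustration of the two checks (hypothetical ring, not the
kernel code): the index-based check must read the other side's index,
while the content-based check only touches the slot the caller is
about to use.

#include <stddef.h>

#define TOY_RING_SIZE 256

/* Toy ring, only to illustrate the full/empty checks discussed above. */
struct toy_ring {
	void *slot[TOY_RING_SIZE];
	unsigned int producer;	/* written only by the producer */
	unsigned int consumer;	/* written only by the consumer */
};

/* alf_queue style: the producer reads the consumer's index, which
 * lives in a cache line the consumer keeps dirtying (and vice versa
 * for the consumer reading the producer's index).
 */
static int index_based_full(const struct toy_ring *r)
{
	return r->producer - r->consumer >= TOY_RING_SIZE;
}

/* skb_array/ptr_ring style: a NULL slot means free, a non-NULL slot
 * means occupied, so each side only reads/writes the slot it uses next.
 */
static int content_based_full(const struct toy_ring *r)
{
	return r->slot[r->producer % TOY_RING_SIZE] != NULL;
}

static int content_based_empty(const struct toy_ring *r)
{
	return r->slot[r->consumer % TOY_RING_SIZE] == NULL;
}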



I actually wrote some queue micro-benchmarks that show the strength and
weaknesses of both skb_array[1] and alf_queue[2].

[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/skb_array_parallel01.c
[2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/alf_queue_parallel01.c
John Fastabend July 15, 2016, 5:29 p.m. UTC | #7
On 16-07-15 03:09 AM, Jesper Dangaard Brouer wrote:
> On Thu, 14 Jul 2016 17:09:43 -0700
> John Fastabend <john.fastabend@gmail.com> wrote:
> 
>>>>  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
>>>>  			      struct sk_buff **to_free)
>>>>  {
>>>> -	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
>>>> -		int band = prio2band[skb->priority & TC_PRIO_MAX];
>>>> -		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
>>>> -		struct sk_buff_head *list = band2list(priv, band);
>>>> -
>>>> -		priv->bitmap |= (1 << band);
>>>> -		qdisc->q.qlen++;
>>>> -		return __qdisc_enqueue_tail(skb, qdisc, list);
>>>> -	}
>>>> +	int band = prio2band[skb->priority & TC_PRIO_MAX];
>>>> +	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
>>>> +	struct skb_array *q = band2list(priv, band);
>>>> +	int err;
>>>>  
>>>> -	return qdisc_drop(skb, qdisc, to_free);
>>>> +	err = skb_array_produce_bh(q, skb);  
>>>
>>> Do you need the _bh variant here?  (Doesn't the qdisc run with BH disabled?)
>>>
>>>   
>>
>> Yep its inside rcu_read_lock_bh().
> 
> The call rcu_read_lock_bh() already disabled BH (local_bh_disable()).
> Thus, you can use the normal variants of skb_array_produce(), it is
> (approx 20 cycles) faster than the _bh variant...
> 

hah I was agreeing with you as in yep no need for the _bh variant :)
I must have been low on coffee or something when I wrote that response
because when I read it now it sounds like I really think the _bh is
needed.

At any rate _bh removed thanks!
John Fastabend July 15, 2016, 10:18 p.m. UTC | #8
On 16-07-15 04:23 AM, Jesper Dangaard Brouer wrote:
> On Thu, 14 Jul 2016 17:07:33 -0700
> John Fastabend <john.fastabend@gmail.com> wrote:
> 
>> On 16-07-14 04:42 PM, Alexei Starovoitov wrote:
>>> On Wed, Jul 13, 2016 at 11:23:12PM -0700, John Fastabend wrote:  
>>>> This converts the pfifo_fast qdisc to use the alf_queue enqueue and
>>>> dequeue routines then sets the NOLOCK bit.
>>>>
>>>> This also removes the logic used to pick the next band to dequeue from
>>>> and instead just checks each alf_queue for packets from top priority
>>>> to lowest. This might need to be a bit more clever but seems to work
>>>> for now.
>>>>
>>>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>>>> ---
>>>>  net/sched/sch_generic.c |  131 +++++++++++++++++++++++++++--------------------  
>>>   
>>>>  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
>>>>  			      struct sk_buff **to_free)
>>>>  {
>>>> -	return qdisc_drop(skb, qdisc, to_free);
>>>> +	err = skb_array_produce_bh(q, skb);  
>>> ..  
>>>>  static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
>>>>  {
>>>> +		skb = skb_array_consume_bh(q);  
>>>
>>> For this particular qdisc the performance gain should come from
>>> granularity of spin_lock, right?
>>
>> And the fact that the consumer and producer are using different
>> locks now.
> 
> Yes. Splitting up enqueue'ers (producers) from the dequeuer (consumer)
> is an important step, because today the qdisc layer has the problem
> that enqueue'ers can starve the single dequeuer.  The current
> mitigation tricks are the enq busy_lock and bulk dequeue.
> 
> As John says, using skb_array causes producers and the consumer to use
> different locks.
> 
>>> Before we were taking the lock much earlier. Here we keep the lock,
>>> but for the very short time.
>>>         original pps        lockless        diff
>>>         1       1418168             1269450         -148718
>>>         2       1587390             1553408         -33982
>>>         4       1084961             1683639         +598678
>>>         8       989636              1522723         +533087
>>>         12      1014018             1348172         +334154
>>>
> 

I was able to recover the performance loss here and actually improve it
by fixing a few things in the patchset. Namely, qdisc_run was being
called unnecessarily in a few places, creating a fairly large
per-packet cost overhead, and using the _bh locks was costing quite a
bit and is not needed, as Jesper pointed out.

So new pps data here in somewhat raw format. I ran five iterations of
each thread count (1,2,4,8,12)

nolock (pfifo_fast)
1:  1440293 1421602 1409553 1393469 1424543
2:  1754890 1819292 1727948 1797711 1743427
4:  3282665 3344095 3315220 3332777 3348972
8:  2940079 1644450 2950777 2922085 2946310
12: 2042084 2610060 2857581 3493162 3104611

lock (pfifo_fast)
1:  1471479 1469142 1458825 1456788 1453952
2:  1746231 1749490 1753176 1753780 1755959
4:  1119626 1120515 1121478 1119220 1121115
8:  1001471  999308 1000318 1000776 1000384
12:  989269  992122  991590  986581  990430

nolock (mq)
1:   1435952  1459523  1448860  1385451   1435031
2:   2850662  2855702  2859105  2855443   2843382
4:   5288135  5271192  5252242  5270192   5311642
8:  10042731 10018063  9891813  9968382   9956727
12: 13265277 13384199 13438955 13363771  13436198

lock (mq)
1:   1448374  1444208  1437459  1437088  1452453
2:   2687963  2679221  2651059  2691630  2667479
4:   5153884  4684153  5091728  4635261  4902381
8:   9292395  9625869  9681835  9711651  9660498
12: 13553918 13682410 14084055 13946138 13724726

So then if we just use the first test example because I'm being a
bit lazy and don't want to calculate the avg/mean/whatever we get
a pfifo_fast chart like,

      locked             nolock           diff
---------------------------------------------------
1     1471479            1440293          −  31186
2     1746231            1754890          +   8659
4     1119626            3282665          +2163039
8     1001471            2940079          +1938608
12     989269            2857581*         +1868312

[*] I pulled the 3rd iteration here as the 1st one seems off

And the mq chart looks reasonable again with these changes,


       locked            nolock           diff
---------------------------------------------------
1       1448374          1435952          -  12422
2       2687963          2850662          + 162699
4       5153884          5288135          + 134251
8       9292395         10042731          + 750336
12     13553918         13265277          - 288641

So the mq case is a bit of a wash from my point of view, which I sort
of expected, seeing as in this test case there is no contention on the
enqueue()/producer or dequeue()/consumer side when running pktgen
at 1 thread per qdisc/queue. A better test would be to fire up a few
thousand udp sessions and bang on the qdiscs to get contention on the
enqueue side. I'll try this next. On another note, the variance is a
touch concerning in the data above for the no-lock case, so I might
look into that a bit more to see why we can get a 1mpps swing in one of
those cases. I sort of wonder if something kicked off on my test
machine to cause that.

Also I'm going to take a look at Jesper's microbenchmark numbers, but I
think if I can convince myself that using skb_array helps or at least
does no harm I might push to have this included with skb_array and then
work on optimizing the ring type/kind/etc. as a follow-up patch.
Additionally it does seem to provide goodness on the pfifo_fast single
queue case.

A final point is that there are more optimizations we can do once the
enqueue and dequeue paths are separated. For example, two fairly easy
things are removing HARD_TX_LOCK on NICs with a ring per core and
adding bulk dequeue() to the skb_array or alf_queue or whatever object
we end up on. And I expect this will provide an additional perf boost.
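
For the bulk dequeue() idea, a naive sketch on top of the existing
single-skb consume (hypothetical helper name; a real bulk API would
want to take the consumer lock once per batch instead of once per skb):

static int pfifo_fast_dequeue_bulk(struct skb_array *q,
				   struct sk_buff **skbs, int budget)
{
	int n = 0;

	/* Pull up to 'budget' skbs, stopping early if the ring drains. */
	while (n < budget) {
		struct sk_buff *skb = skb_array_consume(q);

		if (!skb)
			break;
		skbs[n++] = skb;
	}
	return n;
}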

Thanks,
John
Alexei Starovoitov July 15, 2016, 10:48 p.m. UTC | #9
On Fri, Jul 15, 2016 at 03:18:12PM -0700, John Fastabend wrote:
> 
> nolock (pfifo_fast)
> 1:  1440293 1421602 1409553 1393469 1424543
> 2:  1754890 1819292 1727948 1797711 1743427
> 4:  3282665 3344095 3315220 3332777 3348972
> 8:  2940079 1644450 2950777 2922085 2946310
> 12: 2042084 2610060 2857581 3493162 3104611
> 
> lock (pfifo_fast)
> 1:  1471479 1469142 1458825 1456788 1453952
> 2:  1746231 1749490 1753176 1753780 1755959
> 4:  1119626 1120515 1121478 1119220 1121115
> 8:  1001471  999308 1000318 1000776 1000384
> 12:  989269  992122  991590  986581  990430
> 
> So then if we just use the first test example because I'm being a
> bit lazy and don't want to calculate the avg/mean/whatever we get
> a pfifo_fast chart like,
> 
>       locked             nolock           diff
> ---------------------------------------------------
> 1     1471479            1440293          −  31186
> 2     1746231            1754890          +   8659
> 4     1119626            3282665          +2163039
> 8     1001471            2940079          +1938608
> 12     989269            2857581*         +1868312
...
> Also I'm going to take a look at Jesper's microbenchmark numbers, but I
> think if I can convince myself that using skb_array helps or at least
> does no harm I might push to have this included with skb_array and then
> work on optimizing the ring type/kind/etc. as a follow-up patch.
> Additionally it does seem to provide goodness on the pfifo_fast single
> queue case.

Agree. I think the pfifo_fast gains make it worth applying this patch
set as-is and working on further improvements in follow-ups.

Patch

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7dcd066..2ac3eb9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -26,6 +26,7 @@ 
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/if_vlan.h>
+#include <linux/skb_array.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
@@ -555,88 +556,79 @@  static const u8 prio2band[TC_PRIO_MAX + 1] = {
 
 /*
  * Private data for a pfifo_fast scheduler containing:
- * 	- queues for the three band
- * 	- bitmap indicating which of the bands contain skbs
+ *	- rings for priority bands
  */
 struct pfifo_fast_priv {
-	u32 bitmap;
-	struct sk_buff_head q[PFIFO_FAST_BANDS];
+	struct skb_array q[PFIFO_FAST_BANDS];
 };
 
-/*
- * Convert a bitmap to the first band number where an skb is queued, where:
- * 	bitmap=0 means there are no skbs on any band.
- * 	bitmap=1 means there is an skb on band 0.
- *	bitmap=7 means there are skbs on all 3 bands, etc.
- */
-static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-
-static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
-					     int band)
+static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
+					  int band)
 {
-	return priv->q + band;
+	return &priv->q[band];
 }
 
 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
 			      struct sk_buff **to_free)
 {
-	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
-		int band = prio2band[skb->priority & TC_PRIO_MAX];
-		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-		struct sk_buff_head *list = band2list(priv, band);
-
-		priv->bitmap |= (1 << band);
-		qdisc->q.qlen++;
-		return __qdisc_enqueue_tail(skb, qdisc, list);
-	}
+	int band = prio2band[skb->priority & TC_PRIO_MAX];
+	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	struct skb_array *q = band2list(priv, band);
+	int err;
 
-	return qdisc_drop(skb, qdisc, to_free);
+	err = skb_array_produce_bh(q, skb);
+
+	if (unlikely(err))
+		return qdisc_drop_cpu(skb, qdisc, to_free);
+
+	qdisc_qstats_cpu_qlen_inc(qdisc);
+	qdisc_qstats_cpu_backlog_inc(qdisc, skb);
+	return NET_XMIT_SUCCESS;
 }
 
 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 {
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-	int band = bitmap2band[priv->bitmap];
+	struct sk_buff *skb = NULL;
+	int band;
 
-	if (likely(band >= 0)) {
-		struct sk_buff_head *list = band2list(priv, band);
-		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
+	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+		struct skb_array *q = band2list(priv, band);
 
-		qdisc->q.qlen--;
-		if (skb_queue_empty(list))
-			priv->bitmap &= ~(1 << band);
+		if (__skb_array_empty(q))
+			continue;
 
-		return skb;
+		skb = skb_array_consume_bh(q);
 	}
 
-	return NULL;
-}
-
-static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
-{
-	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-	int band = bitmap2band[priv->bitmap];
-
-	if (band >= 0) {
-		struct sk_buff_head *list = band2list(priv, band);
-
-		return skb_peek(list);
+	if (likely(skb)) {
+		qdisc_qstats_cpu_backlog_dec(qdisc, skb);
+		qdisc_bstats_cpu_update(qdisc, skb);
+		qdisc_qstats_cpu_qlen_dec(qdisc);
 	}
 
-	return NULL;
+	return skb;
 }
 
 static void pfifo_fast_reset(struct Qdisc *qdisc)
 {
-	int prio;
+	int i, band;
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 
-	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-		__qdisc_reset_queue(band2list(priv, prio));
+	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
+		struct skb_array *q = band2list(priv, band);
+		struct sk_buff *skb;
 
-	priv->bitmap = 0;
-	qdisc->qstats.backlog = 0;
-	qdisc->q.qlen = 0;
+		while ((skb = skb_array_consume_bh(q)) != NULL)
+			kfree_skb(skb);
+	}
+
+	for_each_possible_cpu(i) {
+		struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+
+		q->backlog = 0;
+		q->qlen = 0;
+	}
 }
 
 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
@@ -654,24 +646,51 @@  nla_put_failure:
 
 static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
 {
-	int prio;
+	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	int prio;
+
+	/* guard against zero length rings */
+	if (!qlen)
+		return -EINVAL;
+
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+		struct skb_array *q = band2list(priv, prio);
+		int err;
 
-	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-		__skb_queue_head_init(band2list(priv, prio));
+		err = skb_array_init(q, qlen, GFP_KERNEL);
+		if (err)
+			return -ENOMEM;
+	}
 
 	/* Can by-pass the queue discipline */
 	qdisc->flags |= TCQ_F_CAN_BYPASS;
+	qdisc->flags |= TCQ_F_NOLOCK;
+	qdisc->flags |= TCQ_F_CPUSTATS;
+
 	return 0;
 }
 
+static void pfifo_fast_destroy(struct Qdisc *sch)
+{
+	struct pfifo_fast_priv *priv = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+		struct skb_array *q = band2list(priv, prio);
+
+		skb_array_cleanup(q);
+	}
+}
+
 struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.id		=	"pfifo_fast",
 	.priv_size	=	sizeof(struct pfifo_fast_priv),
 	.enqueue	=	pfifo_fast_enqueue,
 	.dequeue	=	pfifo_fast_dequeue,
-	.peek		=	pfifo_fast_peek,
+	.peek		=	qdisc_peek_dequeued,
 	.init		=	pfifo_fast_init,
+	.destroy	=	pfifo_fast_destroy,
 	.reset		=	pfifo_fast_reset,
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,