Message ID | 20160714062312.8270.65942.stgit@john-Precision-Tower-5810 |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
On Wed, 13 Jul 2016 23:23:12 -0700 John Fastabend <john.fastabend@gmail.com> wrote: > This converts the pfifo_fast qdisc to use the alf_queue enqueue and > dequeue routines then sets the NOLOCK bit. Should have said skb_array, not alf_queue ;-) > This also removes the logic used to pick the next band to dequeue from > and instead just checks each alf_queue for packets from top priority > to lowest. This might need to be a bit more clever but seems to work > for now. > > Signed-off-by: John Fastabend <john.r.fastabend@intel.com> > --- > net/sched/sch_generic.c | 131 +++++++++++++++++++++++++++-------------------- > 1 file changed, 75 insertions(+), 56 deletions(-) > > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > index 7dcd066..2ac3eb9 100644 > --- a/net/sched/sch_generic.c > +++ b/net/sched/sch_generic.c > @@ -26,6 +26,7 @@ > #include <linux/list.h> > #include <linux/slab.h> > #include <linux/if_vlan.h> > +#include <linux/skb_array.h> > #include <net/sch_generic.h> > #include <net/pkt_sched.h> > #include <net/dst.h> > @@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { > > /* > * Private data for a pfifo_fast scheduler containing: > - * - queues for the three band > - * - bitmap indicating which of the bands contain skbs > + * - rings for priority bands > */ > struct pfifo_fast_priv { > - u32 bitmap; > - struct sk_buff_head q[PFIFO_FAST_BANDS]; > + struct skb_array q[PFIFO_FAST_BANDS]; > }; > > -/* > - * Convert a bitmap to the first band number where an skb is queued, where: > - * bitmap=0 means there are no skbs on any band. > - * bitmap=1 means there is an skb on band 0. > - * bitmap=7 means there are skbs on all 3 bands, etc. 
> - */ > -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; > - > -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, > - int band) > +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, > + int band) > { > - return priv->q + band; > + return &priv->q[band]; > } > > static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, > struct sk_buff **to_free) > { > - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { > - int band = prio2band[skb->priority & TC_PRIO_MAX]; > - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); > - struct sk_buff_head *list = band2list(priv, band); > - > - priv->bitmap |= (1 << band); > - qdisc->q.qlen++; > - return __qdisc_enqueue_tail(skb, qdisc, list); > - } > + int band = prio2band[skb->priority & TC_PRIO_MAX]; > + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); > + struct skb_array *q = band2list(priv, band); > + int err; > > - return qdisc_drop(skb, qdisc, to_free); > + err = skb_array_produce_bh(q, skb); Do you need the _bh variant here? (Doesn't the qdisc run with BH disabled?) > + > + if (unlikely(err)) > + return qdisc_drop_cpu(skb, qdisc, to_free); > + > + qdisc_qstats_cpu_qlen_inc(qdisc); > + qdisc_qstats_cpu_backlog_inc(qdisc, skb); > + return NET_XMIT_SUCCESS; > } > > static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) > { > struct pfifo_fast_priv *priv = qdisc_priv(qdisc); > - int band = bitmap2band[priv->bitmap]; > + struct sk_buff *skb = NULL; > + int band; > > - if (likely(band >= 0)) { > - struct sk_buff_head *list = band2list(priv, band); > - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); > + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { > + struct skb_array *q = band2list(priv, band); > > - qdisc->q.qlen--; > - if (skb_queue_empty(list)) > - priv->bitmap &= ~(1 << band); > + if (__skb_array_empty(q)) > + continue; > > - return skb; > + skb = skb_array_consume_bh(q); Also _bh variant.
On Wed, Jul 13, 2016 at 11:23:12PM -0700, John Fastabend wrote: > This converts the pfifo_fast qdisc to use the alf_queue enqueue and > dequeue routines then sets the NOLOCK bit. > > This also removes the logic used to pick the next band to dequeue from > and instead just checks each alf_queue for packets from top priority > to lowest. This might need to be a bit more clever but seems to work > for now. > > Signed-off-by: John Fastabend <john.r.fastabend@intel.com> > --- > net/sched/sch_generic.c | 131 +++++++++++++++++++++++++++-------------------- > static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, > struct sk_buff **to_free) > { > - return qdisc_drop(skb, qdisc, to_free); > + err = skb_array_produce_bh(q, skb); .. > static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) > { > + skb = skb_array_consume_bh(q); For this particular qdisc the performance gain should come from granularity of spin_lock, right? Before we were taking the lock much earlier. Here we keep the lock, but for the very short time. original pps lockless diff 1 1418168 1269450 -148718 2 1587390 1553408 -33982 4 1084961 1683639 +598678 8 989636 1522723 +533087 12 1014018 1348172 +334154 so perf for 1 cpu case is mainly due to array vs list, since number of locks is still the same and there is no collision ? but then why shorter lock give higher overhead in multi cpu cases? That would have been the main target for performance improvement? Looks like mq gets the most benefit, because it's lockless internally which makes sense. In general I think this is the right direction where tc infra should move to. I'm only not sure whether it's worth converting pfifo to skb_array. Probably alf queue would have been a better alternative.
On 16-07-14 04:42 PM, Alexei Starovoitov wrote: > On Wed, Jul 13, 2016 at 11:23:12PM -0700, John Fastabend wrote: >> This converts the pfifo_fast qdisc to use the alf_queue enqueue and >> dequeue routines then sets the NOLOCK bit. >> >> This also removes the logic used to pick the next band to dequeue from >> and instead just checks each alf_queue for packets from top priority >> to lowest. This might need to be a bit more clever but seems to work >> for now. >> >> Signed-off-by: John Fastabend <john.r.fastabend@intel.com> >> --- >> net/sched/sch_generic.c | 131 +++++++++++++++++++++++++++-------------------- > >> static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, >> struct sk_buff **to_free) >> { >> - return qdisc_drop(skb, qdisc, to_free); >> + err = skb_array_produce_bh(q, skb); > .. >> static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) >> { >> + skb = skb_array_consume_bh(q); > > For this particular qdisc the performance gain should come from > granularityof spin_lock, right? And the fact that the consumer and producer are using different locks now. > Before we were taking the lock much earlier. Here we keep the lock, > but for the very short time. > original pps lockless diff > 1 1418168 1269450 -148718 > 2 1587390 1553408 -33982 > 4 1084961 1683639 +598678 > 8 989636 1522723 +533087 > 12 1014018 1348172 +334154 > so perf for 1 cpu case is mainly due to array vs list, > since number of locks is still the same and there is no collision ? > but then why shorter lock give higher overhead in multi cpu cases? So in this case running pfifo_fast as the root qdisc with 12 threads means we have 12 producers hitting a single enqueue() path where as with mq and only looking at pktgen numbers we have one thread for each skb_array. > That would have been the main target for performance improvement? > Maybe I should fire up a TCP test with 1000's of threads to see what the perf numbers look like. 
> Looks like mq gets the most benefit, because it's lockless internally > which makes sense. > In general I think this is the right direction where tc infra should move to. > I'm only not sure whether it's worth converting pfifo to skb_array. > Probably alf queue would have been a better alternative. > Tomorrow's task is to resurrect the alf_queue and look at its numbers compared to this. Today was spent trying to remove the HARD_TX_LOCK that protects the driver, in the mq case it seems this is not really needed either. .John
On 16-07-14 08:11 AM, Jesper Dangaard Brouer wrote: > On Wed, 13 Jul 2016 23:23:12 -0700 > John Fastabend <john.fastabend@gmail.com> wrote: > >> This converts the pfifo_fast qdisc to use the alf_queue enqueue and >> dequeue routines then sets the NOLOCK bit. > > Should have said skb_array, not alf_queue ;-) I think I'll get numbers for the alf_queue to so we can compare them as well. On the todo list for tomorrow. > > > This also removes the logic used to pick the next band to dequeue from >> and instead just checks each alf_queue for packets from top priority >> to lowest. This might need to be a bit more clever but seems to work >> for now. >> >> Signed-off-by: John Fastabend <john.r.fastabend@intel.com> >> --- >> net/sched/sch_generic.c | 131 +++++++++++++++++++++++++++-------------------- >> 1 file changed, 75 insertions(+), 56 deletions(-) >> >> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c >> index 7dcd066..2ac3eb9 100644 >> --- a/net/sched/sch_generic.c >> +++ b/net/sched/sch_generic.c >> @@ -26,6 +26,7 @@ >> #include <linux/list.h> >> #include <linux/slab.h> >> #include <linux/if_vlan.h> >> +#include <linux/skb_array.h> >> #include <net/sch_generic.h> >> #include <net/pkt_sched.h> >> #include <net/dst.h> >> @@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { >> >> /* >> * Private data for a pfifo_fast scheduler containing: >> - * - queues for the three band >> - * - bitmap indicating which of the bands contain skbs >> + * - rings for priority bands >> */ >> struct pfifo_fast_priv { >> - u32 bitmap; >> - struct sk_buff_head q[PFIFO_FAST_BANDS]; >> + struct skb_array q[PFIFO_FAST_BANDS]; >> }; >> >> -/* >> - * Convert a bitmap to the first band number where an skb is queued, where: >> - * bitmap=0 means there are no skbs on any band. >> - * bitmap=1 means there is an skb on band 0. >> - * bitmap=7 means there are skbs on all 3 bands, etc. 
>> - */ >> -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; >> - >> -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, >> - int band) >> +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, >> + int band) >> { >> - return priv->q + band; >> + return &priv->q[band]; >> } >> >> static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, >> struct sk_buff **to_free) >> { >> - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { >> - int band = prio2band[skb->priority & TC_PRIO_MAX]; >> - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); >> - struct sk_buff_head *list = band2list(priv, band); >> - >> - priv->bitmap |= (1 << band); >> - qdisc->q.qlen++; >> - return __qdisc_enqueue_tail(skb, qdisc, list); >> - } >> + int band = prio2band[skb->priority & TC_PRIO_MAX]; >> + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); >> + struct skb_array *q = band2list(priv, band); >> + int err; >> >> - return qdisc_drop(skb, qdisc, to_free); >> + err = skb_array_produce_bh(q, skb); > > Do you need the _bh variant here? (Doesn't the qdisc run with BH disabled?) > > Yep its inside rcu_read_lock_bh().
On Thu, 14 Jul 2016 17:09:43 -0700 John Fastabend <john.fastabend@gmail.com> wrote: > >> static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, > >> struct sk_buff **to_free) > >> { > >> - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { > >> - int band = prio2band[skb->priority & TC_PRIO_MAX]; > >> - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); > >> - struct sk_buff_head *list = band2list(priv, band); > >> - > >> - priv->bitmap |= (1 << band); > >> - qdisc->q.qlen++; > >> - return __qdisc_enqueue_tail(skb, qdisc, list); > >> - } > >> + int band = prio2band[skb->priority & TC_PRIO_MAX]; > >> + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); > >> + struct skb_array *q = band2list(priv, band); > >> + int err; > >> > >> - return qdisc_drop(skb, qdisc, to_free); > >> + err = skb_array_produce_bh(q, skb); > > > > Do you need the _bh variant here? (Doesn't the qdisc run with BH disabled?) > > > > > > Yep its inside rcu_read_lock_bh(). The call rcu_read_lock_bh() already disabled BH (local_bh_disable()). Thus, you can use the normal variants of skb_array_produce(), it is (approx 20 cycles) faster than the _bh variant...
On Thu, 14 Jul 2016 17:07:33 -0700 John Fastabend <john.fastabend@gmail.com> wrote: > On 16-07-14 04:42 PM, Alexei Starovoitov wrote: > > On Wed, Jul 13, 2016 at 11:23:12PM -0700, John Fastabend wrote: > >> This converts the pfifo_fast qdisc to use the alf_queue enqueue and > >> dequeue routines then sets the NOLOCK bit. > >> > >> This also removes the logic used to pick the next band to dequeue from > >> and instead just checks each alf_queue for packets from top priority > >> to lowest. This might need to be a bit more clever but seems to work > >> for now. > >> > >> Signed-off-by: John Fastabend <john.r.fastabend@intel.com> > >> --- > >> net/sched/sch_generic.c | 131 +++++++++++++++++++++++++++-------------------- > > > >> static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, > >> struct sk_buff **to_free) > >> { > >> - return qdisc_drop(skb, qdisc, to_free); > >> + err = skb_array_produce_bh(q, skb); > > .. > >> static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) > >> { > >> + skb = skb_array_consume_bh(q); > > > > For this particular qdisc the performance gain should come from > > granularityof spin_lock, right? > > And the fact that the consumer and producer are using different > locks now. Yes. Splitting up enqueue'ers (producer's) from the dequeuer (consumer) is an important step, because today the qdisc layer have this problem that enqueue'ers can starve the single dequeuer. The current mitigation tricks are the enq busy_lock and bulk dequeue. As John says, using skb_array cause producers and consumer to use different locks. > > Before we were taking the lock much earlier. Here we keep the lock, > > but for the very short time. > > original pps lockless diff > > 1 1418168 1269450 -148718 > > 2 1587390 1553408 -33982 > > 4 1084961 1683639 +598678 > > 8 989636 1522723 +533087 > > 12 1014018 1348172 +334154 > > It looks problematic that lockless is slower in the base single CPU case, orig (1418168 pps) to lockless (1269450). 
By approx (1/1418168-1/1269450)*10^9 = -82.60 ns. That base "regression" look too high to start with. How I view the results: Negative scaling starts early for "original", which is the main problem that we are looking to solve. The lockless does not show as much scaling effect as expected, and start to show a trend towards negative scaling. > > so perf for 1 cpu case is mainly due to array vs list, > > since number of locks is still the same and there is no collision ? > > but then why shorter lock give higher overhead in multi cpu cases? > > So in this case running pfifo_fast as the root qdisc with 12 threads > means we have 12 producers hitting a single enqueue() path where as with > mq and only looking at pktgen numbers we have one thread for each > skb_array. The skb_array allow multi producer multi consumer (MPMC), but is optimized towards the case of a single producer and a single consumer (SPSC). The SPSC queue is usually implemented with much less synchronization, but then you need some other guarantee that only a single producer/consumer can be running. > > That would have been the main target for performance improvement? > > > > Maybe I should fire up a TCP test with 1000's of threads to see what > the perf numbers look like. > > > Looks like mq gets the most benefit, because it's lockless internally > > which makes sense. > > In general I think this is the right direction where tc infra should move to. > > I'm only not sure whether it's worth converting pfifo to skb_array. > > Probably alf queue would have been a better alternative. > > > > Tomorrows task is to resurrect the alf_queue and look at its numbers > compared to this. Today was spent trying to remove the HARD_TX_LOCK > that protects the driver, in the mq case it seems this is not really > needed either. I would be very interested in the alf_queue numbers. As alf_queue should perform better with multiple producers. 
If you kept the single TC dequeue'er rule, then you could use the MPSC variant of alf_queue. My benchmarks from 2014 are avail in this presentation: http://people.netfilter.org/hawk/presentations/nfws2014/dp-accel-qdisc-lockless.pdf Also showing the mitigation effect of bulk-dequeue patches slide 12 (But I would like to see some new benchmarks for alf_queue slide 14) But do notice, alf_queue have a design problem that skb_array solved. Alf_queue have the problem of read-bouncing the remote/opposite CPU's cache line, because it need to (1) at enqueue (producer) look if there is free space (reading consumer.tail), and (2) at dequeue (consumer) if any object are avail (reading producer.tail). The alf_queue mitigates this design problem by allowing bulk enqueue and bulk dequeue. But I guess, you will not use that "mode", I think I did in my NFWS2014 benchmarks. The skb_array solved this design problem, by using the content of the queue objects as a marker for free/full condition. The alf_queue cannot do so easily, because of its bulk design, that need to reserve part of the queue. Thus, we likely need some new queue design, which is a hybrid between alf_queue and skb_array, for this use-case. I actually wrote some queue micro-benchmarks that show the strength and weaknesses of both skb_array[1] and alf_queue[2]. [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/skb_array_parallel01.c [2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/alf_queue_parallel01.c
On 16-07-15 03:09 AM, Jesper Dangaard Brouer wrote: > On Thu, 14 Jul 2016 17:09:43 -0700 > John Fastabend <john.fastabend@gmail.com> wrote: > >>>> static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, >>>> struct sk_buff **to_free) >>>> { >>>> - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { >>>> - int band = prio2band[skb->priority & TC_PRIO_MAX]; >>>> - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); >>>> - struct sk_buff_head *list = band2list(priv, band); >>>> - >>>> - priv->bitmap |= (1 << band); >>>> - qdisc->q.qlen++; >>>> - return __qdisc_enqueue_tail(skb, qdisc, list); >>>> - } >>>> + int band = prio2band[skb->priority & TC_PRIO_MAX]; >>>> + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); >>>> + struct skb_array *q = band2list(priv, band); >>>> + int err; >>>> >>>> - return qdisc_drop(skb, qdisc, to_free); >>>> + err = skb_array_produce_bh(q, skb); >>> >>> Do you need the _bh variant here? (Doesn't the qdisc run with BH disabled?) >>> >>> >> >> Yep its inside rcu_read_lock_bh(). > > The call rcu_read_lock_bh() already disabled BH (local_bh_disable()). > Thus, you can use the normal variants of skb_array_produce(), it is > (approx 20 cycles) faster than the _bh variant... > hah I was agreeing with you as in yep no need for the _bh variant :) I must have been low on coffee or something when I wrote that response because when I read it now it sounds like I really think the _bh is needed. At any rate _bh removed thanks!
On 16-07-15 04:23 AM, Jesper Dangaard Brouer wrote: > On Thu, 14 Jul 2016 17:07:33 -0700 > John Fastabend <john.fastabend@gmail.com> wrote: > >> On 16-07-14 04:42 PM, Alexei Starovoitov wrote: >>> On Wed, Jul 13, 2016 at 11:23:12PM -0700, John Fastabend wrote: >>>> This converts the pfifo_fast qdisc to use the alf_queue enqueue and >>>> dequeue routines then sets the NOLOCK bit. >>>> >>>> This also removes the logic used to pick the next band to dequeue from >>>> and instead just checks each alf_queue for packets from top priority >>>> to lowest. This might need to be a bit more clever but seems to work >>>> for now. >>>> >>>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com> >>>> --- >>>> net/sched/sch_generic.c | 131 +++++++++++++++++++++++++++-------------------- >>> >>>> static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, >>>> struct sk_buff **to_free) >>>> { >>>> - return qdisc_drop(skb, qdisc, to_free); >>>> + err = skb_array_produce_bh(q, skb); >>> .. >>>> static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) >>>> { >>>> + skb = skb_array_consume_bh(q); >>> >>> For this particular qdisc the performance gain should come from >>> granularityof spin_lock, right? >> >> And the fact that the consumer and producer are using different >> locks now. > > Yes. Splitting up enqueue'ers (producer's) from the dequeuer (consumer) > is an important step, because today the qdisc layer have this problem > that enqueue'ers can starve the single dequeuer. The current > mitigation tricks are the enq busy_lock and bulk dequeue. > > As John says, using skb_array cause producers and consumer to use > different locks. > >>> Before we were taking the lock much earlier. Here we keep the lock, >>> but for the very short time. 
>>> original pps lockless diff >>> 1 1418168 1269450 -148718 >>> 2 1587390 1553408 -33982 >>> 4 1084961 1683639 +598678 >>> 8 989636 1522723 +533087 >>> 12 1014018 1348172 +334154 >>> > I was able to recover the performance loss here and actually improve it by fixing a few things in the patchset. Namely qdisc_run was being called in a few places unnecessarily creating a fairly large per packet cost overhead and then using the _bh locks was costing quite a bit and is not needed as Jesper pointed out. So new pps data here in somewhat raw format. I ran five iterations of each thread count (1,2,4,8,12) nolock (pfifo_fast) 1: 1440293 1421602 1409553 1393469 1424543 2: 1754890 1819292 1727948 1797711 1743427 4: 3282665 3344095 3315220 3332777 3348972 8: 2940079 1644450 2950777 2922085 2946310 12: 2042084 2610060 2857581 3493162 3104611 lock (pfifo_fast) 1: 1471479 1469142 1458825 1456788 1453952 2: 1746231 1749490 1753176 1753780 1755959 4: 1119626 1120515 1121478 1119220 1121115 8: 1001471 999308 1000318 1000776 1000384 12: 989269 992122 991590 986581 990430 nolock (mq) 1: 1435952 1459523 1448860 1385451 1435031 2: 2850662 2855702 2859105 2855443 2843382 4: 5288135 5271192 5252242 5270192 5311642 8: 10042731 10018063 9891813 9968382 9956727 12: 13265277 13384199 13438955 13363771 13436198 lock (mq) 1: 1448374 1444208 1437459 1437088 1452453 2: 2687963 2679221 2651059 2691630 2667479 4: 5153884 4684153 5091728 4635261 4902381 8: 9292395 9625869 9681835 9711651 9660498 12: 13553918 13682410 14084055 13946138 13724726 So then if we just use the first test example because I'm being a bit lazy and don't want to calculate the avg/mean/whatever we get a pfifo_fast chart like, locked nolock diff --------------------------------------------------- 1 1471479 1440293 − 31186 2 1746231 1754890 + 8659 4 1119626 3282665 +2163039 8 1001471 2940079 +1938608 12 989269 2857581* +1868312 [*] I pulled the 3rd iteration here as the 1st one seems off And the mq chart looks reasonable again 
with these changes, locked nolock diff --------------------------------------------------- 1 1448374 1435952 - 12422 2 2687963 2850662 + 162699 4 5153884 5288135 + 134251 8 9292395 10042731 + 750336 12 13553918 13265277 - 288641 So the mq case is a bit of a wash from my point of view which I sort of expected seeing in this test case there is no contention on the enqueue()/producer or dequeue()/consumer case when running pktgen at 1 thread per qdisc/queue. A better test would be to fire up a few thousand udp sessions and bang on the qdiscs to get contention on the enqueue side. I'll try this next. On another note the variance is a touch concerning in the data above for the no lock case so might look into that a bit more to see why we can get 1mpps swing in one of those cases I sort of wonder if something kicked off on my test machine to cause that. Also I'm going to take a look at Jesper's microbenchmark numbers but I think if I can convince myself that using skb_array helps or at least does no harm I might push to have this included with skb_array and then work on optimizing the ring type/kind/etc. as a follow up patch. Additionally it does seem to provide goodness on the pfifo_fast single queue case. Final point is there are more optimizations we can do once the enqueue and dequeue is separated. For example two fairly easy things include removing HARD_TX_LOCK on NICs with a ring per core and adding bulk dequeue() to the skb_array or alf queue or whatever object we end up on. And I expect this will provide additional perf boost. Thanks, John
On Fri, Jul 15, 2016 at 03:18:12PM -0700, John Fastabend wrote: > > nolock (pfifo_fast) > 1: 1440293 1421602 1409553 1393469 1424543 > 2: 1754890 1819292 1727948 1797711 1743427 > 4: 3282665 3344095 3315220 3332777 3348972 > 8: 2940079 1644450 2950777 2922085 2946310 > 12: 2042084 2610060 2857581 3493162 3104611 > > lock (pfifo_fast) > 1: 1471479 1469142 1458825 1456788 1453952 > 2: 1746231 1749490 1753176 1753780 1755959 > 4: 1119626 1120515 1121478 1119220 1121115 > 8: 1001471 999308 1000318 1000776 1000384 > 12: 989269 992122 991590 986581 990430 > > So then if we just use the first test example because I'm being a > bit lazy and don't want to calculate the avg/mean/whatever we get > a pfifo_fast chart like, > > locked nolock diff > --------------------------------------------------- > 1 1471479 1440293 − 31186 > 2 1746231 1754890 + 8659 > 4 1119626 3282665 +2163039 > 8 1119626 2940079 +1820453 > 12 989269 2857581* +1868312 ... > Also I'm going to take a look at Jesper's microbenchmark numbers but I > think if I can convince myself that using skb_array helps or at least > does no harm I might push to have this include with skb_array and then > work on optimizing the ring type/kind/etc. as a follow up patch. > Additionally it does seem to provide goodness on the pfifo_fast single > queue case. Agree. I think the pfifo_fast gains worth applying this patch set as-is and work on further improvements in follow up.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7dcd066..2ac3eb9 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> +#include <linux/skb_array.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { /* * Private data for a pfifo_fast scheduler containing: - * - queues for the three band - * - bitmap indicating which of the bands contain skbs + * - rings for priority bands */ struct pfifo_fast_priv { - u32 bitmap; - struct sk_buff_head q[PFIFO_FAST_BANDS]; + struct skb_array q[PFIFO_FAST_BANDS]; }; -/* - * Convert a bitmap to the first band number where an skb is queued, where: - * bitmap=0 means there are no skbs on any band. - * bitmap=1 means there is an skb on band 0. - * bitmap=7 means there are skbs on all 3 bands, etc. - */ -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; - -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, - int band) +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, + int band) { - return priv->q + band; + return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { - int band = prio2band[skb->priority & TC_PRIO_MAX]; - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct sk_buff_head *list = band2list(priv, band); - - priv->bitmap |= (1 << band); - qdisc->q.qlen++; - return __qdisc_enqueue_tail(skb, qdisc, list); - } + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct skb_array *q = band2list(priv, band); + int err; - return qdisc_drop(skb, qdisc, to_free); + err = skb_array_produce_bh(q, skb); + + if (unlikely(err)) + return qdisc_drop_cpu(skb, qdisc, to_free); + + 
qdisc_qstats_cpu_qlen_inc(qdisc); + qdisc_qstats_cpu_backlog_inc(qdisc, skb); + return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; + struct sk_buff *skb = NULL; + int band; - if (likely(band >= 0)) { - struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - qdisc->q.qlen--; - if (skb_queue_empty(list)) - priv->bitmap &= ~(1 << band); + if (__skb_array_empty(q)) + continue; - return skb; + skb = skb_array_consume_bh(q); } - return NULL; -} - -static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) -{ - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; - - if (band >= 0) { - struct sk_buff_head *list = band2list(priv, band); - - return skb_peek(list); + if (likely(skb)) { + qdisc_qstats_cpu_backlog_dec(qdisc, skb); + qdisc_bstats_cpu_update(qdisc, skb); + qdisc_qstats_cpu_qlen_dec(qdisc); } - return NULL; + return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { - int prio; + int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(band2list(priv, prio)); + for (band = 0; band < PFIFO_FAST_BANDS; band++) { + struct skb_array *q = band2list(priv, band); + struct sk_buff *skb; - priv->bitmap = 0; - qdisc->qstats.backlog = 0; - qdisc->q.qlen = 0; + while ((skb = skb_array_consume_bh(q)) != NULL) + kfree_skb(skb); + } + + for_each_possible_cpu(i) { + struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); + + q->backlog = 0; + q->qlen = 0; + } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) @@ -654,24 +646,51 @@ nla_put_failure: static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) { - int prio; + unsigned 
int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int prio; + + /* guard against zero length rings */ + if (!qlen) + return -EINVAL; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + int err; - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __skb_queue_head_init(band2list(priv, prio)); + err = skb_array_init(q, qlen, GFP_KERNEL); + if (err) + return -ENOMEM; + } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; + qdisc->flags |= TCQ_F_NOLOCK; + qdisc->flags |= TCQ_F_CPUSTATS; + return 0; } +static void pfifo_fast_destroy(struct Qdisc *sch) +{ + struct pfifo_fast_priv *priv = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + + skb_array_cleanup(q); + } +} + struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), .enqueue = pfifo_fast_enqueue, .dequeue = pfifo_fast_dequeue, - .peek = pfifo_fast_peek, + .peek = qdisc_peek_dequeued, .init = pfifo_fast_init, + .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .owner = THIS_MODULE,
This converts the pfifo_fast qdisc to use the alf_queue enqueue and dequeue routines then sets the NOLOCK bit. This also removes the logic used to pick the next band to dequeue from and instead just checks each alf_queue for packets from top priority to lowest. This might need to be a bit more clever but seems to work for now. Signed-off-by: John Fastabend <john.r.fastabend@intel.com> --- net/sched/sch_generic.c | 131 +++++++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 56 deletions(-)