diff mbox

[net-next,v5,14/16] net: sched: make bstats per cpu and estimator RCU safe

Message ID 20140912163428.19588.52611.stgit@nitbit.x32
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

John Fastabend Sept. 12, 2014, 4:34 p.m. UTC
In order to run qdisc's without locking statistics and estimators
need to be handled correctly.

To resolve bstats make the statistics per cpu. And because this is
only needed for qdiscs that are running without locks which is not
the case for most qdiscs in the near future only create percpu
stats when qdiscs set the TCQ_F_LLQDISC flag.

Next because estimators use the bstats to calculate packets per
second and bytes per second the estimator code paths are updated
to use the per cpu statistics.

After this if qdiscs use the _percpu routines to update stats
and set the TCQ_F_LLQDISC they can be made safe to run without
locking. Any qdisc that sets the flag needs to ensure that the
data structures it owns are safe to run without locks and
additionally currently all the skb list routines run without
locking so those would need to be updated in a future patch.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/net/gen_stats.h    |   16 ++++++++++++
 include/net/sch_generic.h  |   40 +++++++++++++++++++++++++++--
 net/core/gen_estimator.c   |   60 ++++++++++++++++++++++++++++++++++----------
 net/core/gen_stats.c       |   47 ++++++++++++++++++++++++++++++++++
 net/netfilter/xt_RATEEST.c |    4 +--
 net/sched/act_api.c        |    9 ++++---
 net/sched/act_police.c     |    4 +--
 net/sched/sch_api.c        |   51 ++++++++++++++++++++++++++++++++-----
 net/sched/sch_cbq.c        |    9 ++++---
 net/sched/sch_drr.c        |    9 ++++---
 net/sched/sch_generic.c    |   11 +++++++-
 net/sched/sch_hfsc.c       |   15 +++++++----
 net/sched/sch_htb.c        |   14 +++++++---
 net/sched/sch_ingress.c    |    2 +
 net/sched/sch_mq.c         |   10 ++++---
 net/sched/sch_mqprio.c     |   13 +++++-----
 net/sched/sch_multiq.c     |    2 +
 net/sched/sch_prio.c       |    2 +
 net/sched/sch_qfq.c        |   12 +++++----
 19 files changed, 258 insertions(+), 72 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric Dumazet Sept. 12, 2014, 11:49 p.m. UTC | #1
On Fri, 2014-09-12 at 09:34 -0700, John Fastabend wrote:
> In order to run qdisc's without locking statistics and estimators
> need to be handled correctly.
> 
> To resolve bstats make the statistics per cpu. And because this is
> only needed for qdiscs that are running without locks which is not
> the case for most qdiscs in the near future only create percpu
> stats when qdiscs set the TCQ_F_LLQDISC flag.
> 
> Next because estimators use the bstats to calculate packets per
> second and bytes per second the estimator code paths are updated
> to use the per cpu statistics.
> 
> After this if qdiscs use the _percpu routines to update stats
> and set the TCQ_F_LLQDISC they can be made safe to run without
> locking. Any qdisc that sets the flag needs to ensure that the
> data structures it owns are safe to run without locks and
> additionally currently all the skb list routines run without
> locking so those would need to be updated in a future patch.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---

Hmm... this is a bit too complex for my taste.

>  include/net/gen_stats.h    |   16 ++++++++++++
>  include/net/sch_generic.h  |   40 +++++++++++++++++++++++++++--
>  net/core/gen_estimator.c   |   60 ++++++++++++++++++++++++++++++++++----------
>  net/core/gen_stats.c       |   47 ++++++++++++++++++++++++++++++++++
>  net/netfilter/xt_RATEEST.c |    4 +--
>  net/sched/act_api.c        |    9 ++++---
>  net/sched/act_police.c     |    4 +--
>  net/sched/sch_api.c        |   51 ++++++++++++++++++++++++++++++++-----
>  net/sched/sch_cbq.c        |    9 ++++---
>  net/sched/sch_drr.c        |    9 ++++---
>  net/sched/sch_generic.c    |   11 +++++++-
>  net/sched/sch_hfsc.c       |   15 +++++++----
>  net/sched/sch_htb.c        |   14 +++++++---
>  net/sched/sch_ingress.c    |    2 +
>  net/sched/sch_mq.c         |   10 ++++---
>  net/sched/sch_mqprio.c     |   13 +++++-----
>  net/sched/sch_multiq.c     |    2 +
>  net/sched/sch_prio.c       |    2 +
>  net/sched/sch_qfq.c        |   12 +++++----
>  19 files changed, 258 insertions(+), 72 deletions(-)
> 
> diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
> index ea4271d..4b7ca2b 100644
> --- a/include/net/gen_stats.h
> +++ b/include/net/gen_stats.h
> @@ -6,6 +6,11 @@
>  #include <linux/rtnetlink.h>
>  #include <linux/pkt_sched.h>
>  
> +struct gnet_stats_basic_cpu {
> +	struct gnet_stats_basic_packed bstats;
> +	struct u64_stats_sync syncp;
> +};
> +
>  struct gnet_dump {
>  	spinlock_t *      lock;
>  	struct sk_buff *  skb;
> @@ -26,10 +31,17 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
>  				 int tc_stats_type, int xstats_type,
>  				 spinlock_t *lock, struct gnet_dump *d);
>  
> +int gnet_stats_copy_basic_cpu(struct gnet_dump *d,
> +			      struct gnet_stats_basic_cpu __percpu *b);
> +void __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
> +				 struct gnet_stats_basic_cpu __percpu *b);
>  int gnet_stats_copy_basic(struct gnet_dump *d,
>  			  struct gnet_stats_basic_packed *b);
> +void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
> +			     struct gnet_stats_basic_packed *b);
>  int gnet_stats_copy_rate_est(struct gnet_dump *d,
>  			     const struct gnet_stats_basic_packed *b,
> +			     const struct gnet_stats_basic_cpu __percpu *cpu_b,
>  			     struct gnet_stats_rate_est64 *r);
>  int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q);
>  int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
> @@ -37,13 +49,17 @@ int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
>  int gnet_stats_finish_copy(struct gnet_dump *d);
>  
>  int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
> +		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
>  		      struct gnet_stats_rate_est64 *rate_est,
>  		      spinlock_t *stats_lock, struct nlattr *opt);
>  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
> +			struct gnet_stats_basic_cpu __percpu *cpu_bstats,
>  			struct gnet_stats_rate_est64 *rate_est);
>  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
> +			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
>  			  struct gnet_stats_rate_est64 *rate_est,
>  			  spinlock_t *stats_lock, struct nlattr *opt);
>  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
> +			  const struct gnet_stats_basic_cpu __percpu *cpu_bstat,
>  			  const struct gnet_stats_rate_est64 *rate_est);
>  #endif
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 1e89b9a..54e318f 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -6,6 +6,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/pkt_sched.h>
>  #include <linux/pkt_cls.h>
> +#include <linux/percpu.h>
>  #include <net/gen_stats.h>
>  #include <net/rtnetlink.h>
>  
> @@ -57,6 +58,9 @@ struct Qdisc {
>  				      * Its true for MQ/MQPRIO slaves, or non
>  				      * multiqueue device.
>  				      */
> +#define TCQ_F_LLQDISC		0x20 /* lockless qdiscs can run without using
> +				      * the qdisc lock to serialize skbs.
> +				      */
>  #define TCQ_F_WARN_NONWC	(1 << 16)
>  	u32			limit;
>  	const struct Qdisc_ops	*ops;
> @@ -83,7 +87,10 @@ struct Qdisc {
>  	 */
>  	unsigned long		state;
>  	struct sk_buff_head	q;
> -	struct gnet_stats_basic_packed bstats;
> +	union {
> +		struct gnet_stats_basic_packed bstats;
> +		struct gnet_stats_basic_cpu __percpu *cpu_bstats;
> +	} bstats_qdisc;

I believe this union will loose the _packed attribute, and __state will
no longer fill the 4 bytes hole we had after struct
gnet_stats_basic_packed bstats

So I think you should add a __packed attribute :

+       union {
+               struct gnet_stats_basic_packed bstats;
+               struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+       } __packed bstats_qdisc;

btw, have you tried to use an anonymous union ?
	

>  	unsigned int		__state;
>  	struct gnet_stats_queue	qstats;
>  	struct rcu_head		rcu_head;
> @@ -486,7 +493,6 @@ static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
>  	return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
>  }
>  
> -
>  static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
>  				 const struct sk_buff *skb)
>  {
> @@ -494,10 +500,38 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
>  	bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
>  }
>  
> +static inline void qdisc_bstats_update_cpu(struct Qdisc *sch,
> +					   const struct sk_buff *skb)
> +{
> +	struct gnet_stats_basic_cpu *bstats =
> +				this_cpu_ptr(sch->bstats_qdisc.cpu_bstats);
> +
> +	u64_stats_update_begin(&bstats->syncp);
> +	bstats_update(&bstats->bstats, skb);
> +	u64_stats_update_end(&bstats->syncp);
> +}
> +
>  static inline void qdisc_bstats_update(struct Qdisc *sch,
>  				       const struct sk_buff *skb)
>  {
> -	bstats_update(&sch->bstats, skb);
> +	bstats_update(&sch->bstats_qdisc.bstats, skb);
> +}
> +
> +static inline bool qdisc_is_lockless(struct Qdisc *q)

const ?

btw, not sure if the name is correct. You could still use percpu stats
even for a qdisc using a lock at some point...

> +{
> +	return q->flags & TCQ_F_LLQDISC;
> +}
> +
> +static inline int
> +qdisc_bstats_copy_qdisc(struct gnet_dump *d, struct Qdisc *q)

const struct Qdisc *q ?

> +{
> +	int err;
> +
> +	if (qdisc_is_lockless(q))
> +		err = gnet_stats_copy_basic_cpu(d, q->bstats_qdisc.cpu_bstats);
> +	else
> +		err = gnet_stats_copy_basic(d, &q->bstats_qdisc.bstats);
> +	return err;
>  }

I believe you should reuse a single function, gnet_stats_copy_basic()
and add an additional parameter (per_cpu pointer), that can be NULL.

>  
>  static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
> diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
> index 9d33dff..0d13319 100644
> --- a/net/core/gen_estimator.c
> +++ b/net/core/gen_estimator.c
> @@ -91,6 +91,8 @@ struct gen_estimator
>  	u32			avpps;
>  	struct rcu_head		e_rcu;
>  	struct rb_node		node;
> +	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
> +	struct rcu_head		head;
>  };
>  
>  struct gen_estimator_head
> @@ -122,11 +124,21 @@ static void est_timer(unsigned long arg)
>  
>  		spin_lock(e->stats_lock);
>  		read_lock(&est_lock);
> -		if (e->bstats == NULL)
> +
> +		if (e->bstats == NULL && e->cpu_bstats == NULL)
>  			goto skip;

You should only test e->bstats here.

>  
> -		nbytes = e->bstats->bytes;
> -		npackets = e->bstats->packets;
> +		if (e->cpu_bstats) {
> +			struct gnet_stats_basic_packed b = {0};
> +
> +			__gnet_stats_copy_basic_cpu(&b, e->cpu_bstats);
> +			nbytes = b.bytes;
> +			npackets = b.packets;
> +		} else {
> +			nbytes = e->bstats->bytes;
> +			npackets = e->bstats->packets;
> +		}
> +
>  		brate = (nbytes - e->last_bytes)<<(7 - idx);
>  		e->last_bytes = nbytes;
>  		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
> @@ -146,6 +158,11 @@ skip:
>  	rcu_read_unlock();
>  }
>  
> +static void *gen_get_bstats(struct gen_estimator *est)
> +{
> +	return est->cpu_bstats ? (void *)est->cpu_bstats : (void *)est->bstats;
> +}

Note there is no guarantee that percpu addresses and kmalloc() addresses
do not collide on some arches.

Thats a bit ugly, you could change your convention to always sort on
est->bstats

(No need for this helper)

> +
>  static void gen_add_node(struct gen_estimator *est)
>  {
>  	struct rb_node **p = &est_root.rb_node, *parent = NULL;
> @@ -156,7 +173,7 @@ static void gen_add_node(struct gen_estimator *est)
>  		parent = *p;
>  		e = rb_entry(parent, struct gen_estimator, node);
>  
> -		if (est->bstats > e->bstats)
> +		if (gen_get_bstats(est)  > gen_get_bstats(e))

Change not needed.

>  			p = &parent->rb_right;
>  		else
>  			p = &parent->rb_left;
> @@ -167,18 +184,20 @@ static void gen_add_node(struct gen_estimator *est)
>  
>  static
>  struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
> +				    const struct gnet_stats_basic_cpu __percpu *cpu_bstats,
>  				    const struct gnet_stats_rate_est64 *rate_est)
>  {
>  	struct rb_node *p = est_root.rb_node;
> +	void *b = bstats ? (void *)bstats : (void *)cpu_bstats;
>  

Same complexity not needed. You should not have to change this function
at all.

>  	while (p) {
>  		struct gen_estimator *e;
>  
>  		e = rb_entry(p, struct gen_estimator, node);
>  
> -		if (bstats > e->bstats)
> +		if (b > gen_get_bstats(e))
>  			p = p->rb_right;
> -		else if (bstats < e->bstats || rate_est != e->rate_est)
> +		else if (b < gen_get_bstats(e) || rate_est != e->rate_est)
>  			p = p->rb_left;
>  		else
>  			return e;
> @@ -203,6 +222,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
>   *
>   */
>  int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
> +		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
>  		      struct gnet_stats_rate_est64 *rate_est,
>  		      spinlock_t *stats_lock,
>  		      struct nlattr *opt)
> @@ -221,14 +241,24 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
>  	if (est == NULL)
>  		return -ENOBUFS;
>  
> +	if (cpu_bstats) {
> +		struct gnet_stats_basic_packed b = {0};
> +
> +		est->cpu_bstats = cpu_bstats;
> +		__gnet_stats_copy_basic_cpu(&b, cpu_bstats);
> +		est->last_bytes = b.bytes;
> +		est->last_packets = b.packets;
> +	} else {
> +		est->bstats = bstats;

	est->bstats should be set in both cases, this is the key for searches.


> +		est->last_bytes = bstats->bytes;
> +		est->last_packets = bstats->packets;
> +	}
> +
>  	idx = parm->interval + 2;
> -	est->bstats = bstats;
>  	est->rate_est = rate_est;
>  	est->stats_lock = stats_lock;
>  	est->ewma_log = parm->ewma_log;
> -	est->last_bytes = bstats->bytes;
>  	est->avbps = rate_est->bps<<5;
> -	est->last_packets = bstats->packets;
>  	est->avpps = rate_est->pps<<10;
>  
>  	spin_lock_bh(&est_tree_lock);
> @@ -258,14 +288,14 @@ EXPORT_SYMBOL(gen_new_estimator);
>   * Note : Caller should respect an RCU grace period before freeing stats_lock
>   */
>  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
> +			struct gnet_stats_basic_cpu __percpu *cpu_bstats,
>  			struct gnet_stats_rate_est64 *rate_est)
>  {
>  	struct gen_estimator *e;
>  
>  	spin_lock_bh(&est_tree_lock);
> -	while ((e = gen_find_node(bstats, rate_est))) {
> +	while ((e = gen_find_node(bstats, cpu_bstats, rate_est))) {
>  		rb_erase(&e->node, &est_root);
> -
>  		write_lock(&est_lock);
>  		e->bstats = NULL;
>  		write_unlock(&est_lock);
> @@ -290,11 +320,12 @@ EXPORT_SYMBOL(gen_kill_estimator);
>   * Returns 0 on success or a negative error code.
>   */
>  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
> +			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
>  			  struct gnet_stats_rate_est64 *rate_est,
>  			  spinlock_t *stats_lock, struct nlattr *opt)
>  {
> -	gen_kill_estimator(bstats, rate_est);
> -	return gen_new_estimator(bstats, rate_est, stats_lock, opt);
> +	gen_kill_estimator(bstats, cpu_bstats, rate_est);
> +	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
>  }
>  EXPORT_SYMBOL(gen_replace_estimator);
>  
> @@ -306,6 +337,7 @@ EXPORT_SYMBOL(gen_replace_estimator);
>   * Returns true if estimator is active, and false if not.
>   */
>  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
> +			  const struct gnet_stats_basic_cpu __percpu *cpu_bstats,
>  			  const struct gnet_stats_rate_est64 *rate_est)
>  {
>  	bool res;
> @@ -313,7 +345,7 @@ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
>  	ASSERT_RTNL();
>  
>  	spin_lock_bh(&est_tree_lock);
> -	res = gen_find_node(bstats, rate_est) != NULL;
> +	res = gen_find_node(bstats, cpu_bstats, rate_est) != NULL;
>  	spin_unlock_bh(&est_tree_lock);
>  
>  	return res;
> diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
> index 2ddbce4..e43b55f 100644
> --- a/net/core/gen_stats.c
> +++ b/net/core/gen_stats.c
> @@ -128,6 +128,50 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
>  }
>  EXPORT_SYMBOL(gnet_stats_copy_basic);
>  
> +void
> +__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
> +			struct gnet_stats_basic_packed *b)
> +{
> +	bstats->bytes = b->bytes;
> +	bstats->packets = b->packets;
> +}
> +EXPORT_SYMBOL(__gnet_stats_copy_basic);
> +
> +void
> +__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
> +			    struct gnet_stats_basic_cpu __percpu *b)
> +{
> +	int i;
> +
> +	for_each_possible_cpu(i) {
> +		struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(b, i);
> +		unsigned int start;
> +		__u64 bytes;
> +		__u32 packets;
> +
> +		do {
> +			start = u64_stats_fetch_begin_irq(&bcpu->syncp);
> +			bytes = bcpu->bstats.bytes;
> +			packets = bcpu->bstats.packets;
> +		} while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
> +
> +		bstats->bytes += bcpu->bstats.bytes;
> +		bstats->packets += bcpu->bstats.packets;
> +	}
> +}
> +EXPORT_SYMBOL(__gnet_stats_copy_basic_cpu);
> +
> +int
> +gnet_stats_copy_basic_cpu(struct gnet_dump *d,
> +			  struct gnet_stats_basic_cpu __percpu *b)
> +{
> +	struct gnet_stats_basic_packed bstats = {0};
> +
> +	__gnet_stats_copy_basic_cpu(&bstats, b);
> +	return gnet_stats_copy_basic(d, &bstats);
> +}
> +EXPORT_SYMBOL(gnet_stats_copy_basic_cpu);
> +
>  /**
>   * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
>   * @d: dumping handle
> @@ -143,12 +187,13 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
>  int
>  gnet_stats_copy_rate_est(struct gnet_dump *d,
>  			 const struct gnet_stats_basic_packed *b,
> +			 const struct gnet_stats_basic_cpu __percpu *cpu_b,
>  			 struct gnet_stats_rate_est64 *r)
>  {
>  	struct gnet_stats_rate_est est;
>  	int res;
>  
> -	if (b && !gen_estimator_active(b, r))
> +	if ((b || cpu_b) && !gen_estimator_active(b, cpu_b, r))
>  		return 0;
>  
>  	est.bps = min_t(u64, UINT_MAX, r->bps);
> diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
> index 370adf6..02e2532 100644
> --- a/net/netfilter/xt_RATEEST.c
> +++ b/net/netfilter/xt_RATEEST.c
> @@ -64,7 +64,7 @@ void xt_rateest_put(struct xt_rateest *est)
>  	mutex_lock(&xt_rateest_mutex);
>  	if (--est->refcnt == 0) {
>  		hlist_del(&est->list);
> -		gen_kill_estimator(&est->bstats, &est->rstats);
> +		gen_kill_estimator(&est->bstats, NULL, &est->rstats);
>  		/*
>  		 * gen_estimator est_timer() might access est->lock or bstats,
>  		 * wait a RCU grace period before freeing 'est'
> @@ -136,7 +136,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
>  	cfg.est.interval	= info->interval;
>  	cfg.est.ewma_log	= info->ewma_log;
>  
> -	ret = gen_new_estimator(&est->bstats, &est->rstats,
> +	ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
>  				&est->lock, &cfg.opt);
>  	if (ret < 0)
>  		goto err2;
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index ae32b5b..89504ea 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -35,7 +35,7 @@ void tcf_hash_destroy(struct tc_action *a)
>  	spin_lock_bh(&hinfo->lock);
>  	hlist_del(&p->tcfc_head);
>  	spin_unlock_bh(&hinfo->lock);
> -	gen_kill_estimator(&p->tcfc_bstats,
> +	gen_kill_estimator(&p->tcfc_bstats, NULL,
>  			   &p->tcfc_rate_est);
>  	/*
>  	 * gen_estimator est_timer() might access p->tcfc_lock
> @@ -228,7 +228,7 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
>  {
>  	struct tcf_common *pc = a->priv;
>  	if (est)
> -		gen_kill_estimator(&pc->tcfc_bstats,
> +		gen_kill_estimator(&pc->tcfc_bstats, NULL,
>  				   &pc->tcfc_rate_est);
>  	kfree_rcu(pc, tcfc_rcu);
>  }
> @@ -252,7 +252,8 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
>  	p->tcfc_tm.install = jiffies;
>  	p->tcfc_tm.lastuse = jiffies;
>  	if (est) {
> -		int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est,
> +		int err = gen_new_estimator(&p->tcfc_bstats, NULL,
> +					    &p->tcfc_rate_est,
>  					    &p->tcfc_lock, est);
>  		if (err) {
>  			kfree(p);
> @@ -620,7 +621,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
>  		goto errout;
>  
>  	if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 ||
> -	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
> +	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, NULL,
>  				     &p->tcfc_rate_est) < 0 ||
>  	    gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0)
>  		goto errout;
> diff --git a/net/sched/act_police.c b/net/sched/act_police.c
> index f32bcb0..26f4bb3 100644
> --- a/net/sched/act_police.c
> +++ b/net/sched/act_police.c
> @@ -178,14 +178,14 @@ override:
>  
>  	spin_lock_bh(&police->tcf_lock);
>  	if (est) {
> -		err = gen_replace_estimator(&police->tcf_bstats,
> +		err = gen_replace_estimator(&police->tcf_bstats, NULL,
>  					    &police->tcf_rate_est,
>  					    &police->tcf_lock, est);
>  		if (err)
>  			goto failure_unlock;
>  	} else if (tb[TCA_POLICE_AVRATE] &&
>  		   (ret == ACT_P_CREATED ||
> -		    !gen_estimator_active(&police->tcf_bstats,
> +		    !gen_estimator_active(&police->tcf_bstats, NULL,
>  					  &police->tcf_rate_est))) {
>  		err = -EINVAL;
>  		goto failure_unlock;
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index ca62483..beb2064 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -942,6 +942,13 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
>  	sch->handle = handle;
>  
>  	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
> +		if (qdisc_is_lockless(sch)) {
> +			sch->bstats_qdisc.cpu_bstats =
> +				alloc_percpu(struct gnet_stats_basic_cpu);
> +			if (!sch->bstats_qdisc.cpu_bstats)
> +				goto err_out4;
> +		}
> +
>  		if (tca[TCA_STAB]) {
>  			stab = qdisc_get_stab(tca[TCA_STAB]);
>  			if (IS_ERR(stab)) {
> @@ -964,8 +971,18 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
>  			else
>  				root_lock = qdisc_lock(sch);
>  
> -			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
> -						root_lock, tca[TCA_RATE]);
> +			if (qdisc_is_lockless(sch))
> +				err = gen_new_estimator(NULL,
> +							sch->bstats_qdisc.cpu_bstats,
> +							&sch->rate_est,
> +							root_lock,
> +							tca[TCA_RATE]);
> +			else
> +				err = gen_new_estimator(&sch->bstats_qdisc.bstats,
> +							NULL,
> +							&sch->rate_est,
> +							root_lock,
> +							tca[TCA_RATE]);


Note that you could have a simpler convention : and use	

err = gen_new_estimator(&sch->bstats_qdisc.bstats,
			sch->bstats_qdisc.cpu_bstats,
			root_lock,
			tca[TCA_RATE]);

This way, this patch would be way smaller.
		  


>  			if (err)
>  				goto err_out4;
>  		}
> @@ -1022,9 +1039,11 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
>  		   because change can't be undone. */
>  		if (sch->flags & TCQ_F_MQROOT)
>  			goto out;
> -		gen_replace_estimator(&sch->bstats, &sch->rate_est,
> -					    qdisc_root_sleeping_lock(sch),
> -					    tca[TCA_RATE]);
> +		gen_replace_estimator(&sch->bstats_qdisc.bstats,
> +				      sch->bstats_qdisc.cpu_bstats,
> +				      &sch->rate_est,
> +				      qdisc_root_sleeping_lock(sch),
> +				      tca[TCA_RATE]);
>  	}
>  out:
>  	return 0;
> @@ -1304,6 +1323,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
>  	unsigned char *b = skb_tail_pointer(skb);
>  	struct gnet_dump d;
>  	struct qdisc_size_table *stab;
> +	int err;
>  
>  	cond_resched();
>  	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
> @@ -1334,10 +1354,25 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
>  	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
>  		goto nla_put_failure;
>  
> -	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
> -	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
> +	if (qdisc_is_lockless(q)) {
> +		err = gnet_stats_copy_basic_cpu(&d, q->bstats_qdisc.cpu_bstats);
> +		if (err < 0)
> +			goto nla_put_failure;
> +		err = gnet_stats_copy_rate_est(&d, NULL,
> +					       q->bstats_qdisc.cpu_bstats,
> +					       &q->rate_est);
> +	} else {
> +		err = gnet_stats_copy_basic(&d, &q->bstats_qdisc.bstats);
> +		if (err < 0)
> +			goto nla_put_failure;
> +		err = gnet_stats_copy_rate_est(&d,
> +					       &q->bstats_qdisc.bstats, NULL,
> +					       &q->rate_est);
> +	}

Same thing here....

You could simply extend existing functions with one cpu_bstats argument.

If it is null, then you copy the traditional bstats.


> +
> +	if (err < 0 ||
>  	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
> -		goto nla_put_failure;
> +			goto nla_put_failure;
>  
>  	if (gnet_stats_finish_copy(&d) < 0)
>  		goto nla_put_failure;
> diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
> index a3244a8..208074d 100644
> --- a/net/sched/sch_cbq.c
> +++ b/net/sched/sch_cbq.c
> @@ -1602,7 +1602,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
>  		cl->xstats.undertime = cl->undertime - q->now;
>  
>  	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
> -	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
> +	    gnet_stats_copy_rate_est(d, &cl->bstats, NULL, &cl->rate_est) < 0 ||
>  	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
>  		return -1;
>  
> @@ -1671,7 +1671,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
>  	tcf_destroy_chain(&cl->filter_list);
>  	qdisc_destroy(cl->q);
>  	qdisc_put_rtab(cl->R_tab);
> -	gen_kill_estimator(&cl->bstats, &cl->rate_est);
> +	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
>  	if (cl != &q->link)
>  		kfree(cl);
>  }
> @@ -1759,7 +1759,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
>  		}
>  
>  		if (tca[TCA_RATE]) {
> -			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
> +			err = gen_replace_estimator(&cl->bstats, NULL,
> +						    &cl->rate_est,
>  						    qdisc_root_sleeping_lock(sch),
>  						    tca[TCA_RATE]);
>  			if (err) {
> @@ -1852,7 +1853,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
>  		goto failure;
>  
>  	if (tca[TCA_RATE]) {
> -		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
> +		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
>  					qdisc_root_sleeping_lock(sch),
>  					tca[TCA_RATE]);
>  		if (err) {
> diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
> index d8b5ccf..88a65f6 100644
> --- a/net/sched/sch_drr.c
> +++ b/net/sched/sch_drr.c
> @@ -88,7 +88,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
>  
>  	if (cl != NULL) {
>  		if (tca[TCA_RATE]) {
> -			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
> +			err = gen_replace_estimator(&cl->bstats, NULL,
> +						    &cl->rate_est,
>  						    qdisc_root_sleeping_lock(sch),
>  						    tca[TCA_RATE]);
>  			if (err)
> @@ -116,7 +117,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
>  		cl->qdisc = &noop_qdisc;
>  
>  	if (tca[TCA_RATE]) {
> -		err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
> +		err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
>  					    qdisc_root_sleeping_lock(sch),
>  					    tca[TCA_RATE]);
>  		if (err) {
> @@ -138,7 +139,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
>  
>  static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
>  {
> -	gen_kill_estimator(&cl->bstats, &cl->rate_est);
> +	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
>  	qdisc_destroy(cl->qdisc);
>  	kfree(cl);
>  }
> @@ -283,7 +284,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
>  	}
>  
>  	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
> -	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
> +	    gnet_stats_copy_rate_est(d, &cl->bstats, NULL, &cl->rate_est) < 0 ||
>  	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
>  		return -1;
>  
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 346ef85..e3d203e 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -632,6 +632,9 @@ static void qdisc_rcu_free(struct rcu_head *head)
>  {
>  	struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
>  
> +	if (qdisc_is_lockless(qdisc))
> +		free_percpu(qdisc->bstats_qdisc.cpu_bstats);
> +
>  	kfree((char *) qdisc - qdisc->padded);
>  }
>  
> @@ -648,7 +651,13 @@ void qdisc_destroy(struct Qdisc *qdisc)
>  
>  	qdisc_put_stab(rtnl_dereference(qdisc->stab));
>  #endif
> -	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
> +	if (qdisc_is_lockless(qdisc))
> +		gen_kill_estimator(NULL, qdisc->bstats_qdisc.cpu_bstats,
> +				   &qdisc->rate_est);
> +	else
> +		gen_kill_estimator(&qdisc->bstats_qdisc.bstats, NULL,
> +				   &qdisc->rate_est);
> +
>  	if (ops->reset)
>  		ops->reset(qdisc);
>  	if (ops->destroy)
> diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
> index 04b0de4..3b112d2 100644
> --- a/net/sched/sch_hfsc.c
> +++ b/net/sched/sch_hfsc.c
> @@ -1014,9 +1014,12 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
>  		cur_time = psched_get_time();
>  
>  		if (tca[TCA_RATE]) {
> -			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
> -					      qdisc_root_sleeping_lock(sch),
> -					      tca[TCA_RATE]);
> +			spinlock_t *lock = qdisc_root_sleeping_lock(sch);
> +
> +			err = gen_replace_estimator(&cl->bstats, NULL,
> +						    &cl->rate_est,
> +						    lock,
> +						    tca[TCA_RATE]);
>  			if (err)
>  				return err;
>  		}
> @@ -1063,7 +1066,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
>  		return -ENOBUFS;
>  
>  	if (tca[TCA_RATE]) {
> -		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
> +		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
>  					qdisc_root_sleeping_lock(sch),
>  					tca[TCA_RATE]);
>  		if (err) {
> @@ -1113,7 +1116,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
>  
>  	tcf_destroy_chain(&cl->filter_list);
>  	qdisc_destroy(cl->qdisc);
> -	gen_kill_estimator(&cl->bstats, &cl->rate_est);
> +	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
>  	if (cl != &q->root)
>  		kfree(cl);
>  }
> @@ -1375,7 +1378,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
>  	xstats.rtwork  = cl->cl_cumul;
>  
>  	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
> -	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
> +	    gnet_stats_copy_rate_est(d, &cl->bstats, NULL, &cl->rate_est) < 0 ||
>  	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
>  		return -1;
>  
> diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
> index 6d16b9b..8067a82 100644
> --- a/net/sched/sch_htb.c
> +++ b/net/sched/sch_htb.c
> @@ -1145,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
>  	cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
>  
>  	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
> -	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
> +	    gnet_stats_copy_rate_est(d, NULL, NULL, &cl->rate_est) < 0 ||
>  	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
>  		return -1;
>  
> @@ -1235,7 +1235,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
>  		WARN_ON(!cl->un.leaf.q);
>  		qdisc_destroy(cl->un.leaf.q);
>  	}
> -	gen_kill_estimator(&cl->bstats, &cl->rate_est);
> +	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
>  	tcf_destroy_chain(&cl->filter_list);
>  	kfree(cl);
>  }
> @@ -1402,7 +1402,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
>  			goto failure;
>  
>  		if (htb_rate_est || tca[TCA_RATE]) {
> -			err = gen_new_estimator(&cl->bstats, &cl->rate_est,
> +			err = gen_new_estimator(&cl->bstats, NULL,
> +						&cl->rate_est,
>  						qdisc_root_sleeping_lock(sch),
>  						tca[TCA_RATE] ? : &est.nla);
>  			if (err) {
> @@ -1464,8 +1465,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
>  			parent->children++;
>  	} else {
>  		if (tca[TCA_RATE]) {
> -			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
> -						    qdisc_root_sleeping_lock(sch),
> +			spinlock_t *lock = qdisc_root_sleeping_lock(sch);
> +
> +			err = gen_replace_estimator(&cl->bstats, NULL,
> +						    &cl->rate_est,
> +						    lock,
>  						    tca[TCA_RATE]);
>  			if (err)
>  				return err;
> diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
> index b351125..25302be 100644
> --- a/net/sched/sch_ingress.c
> +++ b/net/sched/sch_ingress.c
> @@ -65,7 +65,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>  
>  	result = tc_classify(skb, fl, &res);
>  
> -	qdisc_bstats_update(sch, skb);
> +	qdisc_bstats_update_cpu(sch, skb);
>  	switch (result) {
>  	case TC_ACT_SHOT:
>  		result = TC_ACT_SHOT;
> diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
> index a8b2864..e96a41f 100644
> --- a/net/sched/sch_mq.c
> +++ b/net/sched/sch_mq.c
> @@ -98,20 +98,22 @@ static void mq_attach(struct Qdisc *sch)
>  
>  static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
>  {
> +	struct gnet_stats_basic_packed *bstats = &sch->bstats_qdisc.bstats;
>  	struct net_device *dev = qdisc_dev(sch);
>  	struct Qdisc *qdisc;
>  	unsigned int ntx;
>  
>  	sch->q.qlen = 0;
> -	memset(&sch->bstats, 0, sizeof(sch->bstats));
> +	memset(bstats, 0, sizeof(sch->bstats_qdisc));
>  	memset(&sch->qstats, 0, sizeof(sch->qstats));
>  
>  	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
>  		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
>  		spin_lock_bh(qdisc_lock(qdisc));
>  		sch->q.qlen		+= qdisc->q.qlen;
> -		sch->bstats.bytes	+= qdisc->bstats.bytes;
> -		sch->bstats.packets	+= qdisc->bstats.packets;
> +
> +		bstats->bytes		+= qdisc->bstats_qdisc.bstats.bytes;
> +		bstats->packets		+= qdisc->bstats_qdisc.bstats.packets;
>  		sch->qstats.qlen	+= qdisc->qstats.qlen;
>  		sch->qstats.backlog	+= qdisc->qstats.backlog;
>  		sch->qstats.drops	+= qdisc->qstats.drops;
> @@ -201,7 +203,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
>  
>  	sch = dev_queue->qdisc_sleeping;
>  	sch->qstats.qlen = sch->q.qlen;
> -	if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
> +	if (gnet_stats_copy_basic(d, &sch->bstats_qdisc.bstats) < 0 ||
>  	    gnet_stats_copy_queue(d, &sch->qstats) < 0)
>  		return -1;
>  	return 0;
> diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
> index 37e7d25..6e3e4e9 100644
> --- a/net/sched/sch_mqprio.c
> +++ b/net/sched/sch_mqprio.c
> @@ -219,6 +219,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
>  
>  static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
>  {
> +	struct gnet_stats_basic_packed *bstats = &sch->bstats_qdisc.bstats;
>  	struct net_device *dev = qdisc_dev(sch);
>  	struct mqprio_sched *priv = qdisc_priv(sch);
>  	unsigned char *b = skb_tail_pointer(skb);
> @@ -227,15 +228,15 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	unsigned int i;
>  
>  	sch->q.qlen = 0;
> -	memset(&sch->bstats, 0, sizeof(sch->bstats));
> +	memset(bstats, 0, sizeof(sch->bstats_qdisc.bstats));
>  	memset(&sch->qstats, 0, sizeof(sch->qstats));
>  
>  	for (i = 0; i < dev->num_tx_queues; i++) {
>  		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
>  		spin_lock_bh(qdisc_lock(qdisc));
>  		sch->q.qlen		+= qdisc->q.qlen;
> -		sch->bstats.bytes	+= qdisc->bstats.bytes;
> -		sch->bstats.packets	+= qdisc->bstats.packets;
> +		bstats->bytes		+= qdisc->bstats_qdisc.bstats.bytes;
> +		bstats->packets		+= qdisc->bstats_qdisc.bstats.packets;
>  		sch->qstats.qlen	+= qdisc->qstats.qlen;
>  		sch->qstats.backlog	+= qdisc->qstats.backlog;
>  		sch->qstats.drops	+= qdisc->qstats.drops;
> @@ -344,8 +345,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
>  
>  			qdisc = rtnl_dereference(q->qdisc);
>  			spin_lock_bh(qdisc_lock(qdisc));
> -			bstats.bytes      += qdisc->bstats.bytes;
> -			bstats.packets    += qdisc->bstats.packets;
> +			bstats.bytes      += qdisc->bstats_qdisc.bstats.bytes;
> +			bstats.packets    += qdisc->bstats_qdisc.bstats.packets;
>  			qstats.qlen       += qdisc->qstats.qlen;
>  			qstats.backlog    += qdisc->qstats.backlog;
>  			qstats.drops      += qdisc->qstats.drops;
> @@ -363,7 +364,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
>  
>  		sch = dev_queue->qdisc_sleeping;
>  		sch->qstats.qlen = sch->q.qlen;
> -		if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
> +		if (gnet_stats_copy_basic(d, &sch->bstats_qdisc.bstats) < 0 ||
>  		    gnet_stats_copy_queue(d, &sch->qstats) < 0)
>  			return -1;
>  	}
> diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
> index c0466c1..d6430102 100644
> --- a/net/sched/sch_multiq.c
> +++ b/net/sched/sch_multiq.c
> @@ -361,7 +361,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
>  
>  	cl_q = q->queues[cl - 1];
>  	cl_q->qstats.qlen = cl_q->q.qlen;
> -	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
> +	if (gnet_stats_copy_basic(d, &cl_q->bstats_qdisc.bstats) < 0 ||
>  	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
>  		return -1;
>  
> diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
> index 03ef99e..9069aba 100644
> --- a/net/sched/sch_prio.c
> +++ b/net/sched/sch_prio.c
> @@ -325,7 +325,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
>  
>  	cl_q = q->queues[cl - 1];
>  	cl_q->qstats.qlen = cl_q->q.qlen;
> -	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
> +	if (gnet_stats_copy_basic(d, &cl_q->bstats_qdisc.bstats) < 0 ||
>  	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
>  		return -1;
>  
> diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
> index 602ea01..52a602d 100644
> --- a/net/sched/sch_qfq.c
> +++ b/net/sched/sch_qfq.c
> @@ -459,7 +459,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
>  
>  	if (cl != NULL) { /* modify existing class */
>  		if (tca[TCA_RATE]) {
> -			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
> +			err = gen_replace_estimator(&cl->bstats, NULL,
> +						    &cl->rate_est,
>  						    qdisc_root_sleeping_lock(sch),
>  						    tca[TCA_RATE]);
>  			if (err)
> @@ -484,7 +485,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
>  		cl->qdisc = &noop_qdisc;
>  
>  	if (tca[TCA_RATE]) {
> -		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
> +		err = gen_new_estimator(&cl->bstats, NULL,
> +					&cl->rate_est,
>  					qdisc_root_sleeping_lock(sch),
>  					tca[TCA_RATE]);
>  		if (err)
> @@ -505,7 +507,7 @@ set_change_agg:
>  		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
>  		if (new_agg == NULL) {
>  			err = -ENOBUFS;
> -			gen_kill_estimator(&cl->bstats, &cl->rate_est);
> +			gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
>  			goto destroy_class;
>  		}
>  		sch_tree_lock(sch);
> @@ -530,7 +532,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
>  	struct qfq_sched *q = qdisc_priv(sch);
>  
>  	qfq_rm_from_agg(q, cl);
> -	gen_kill_estimator(&cl->bstats, &cl->rate_est);
> +	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
>  	qdisc_destroy(cl->qdisc);
>  	kfree(cl);
>  }
> @@ -668,7 +670,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
>  	xstats.lmax = cl->agg->lmax;
>  
>  	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
> -	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
> +	    gnet_stats_copy_rate_est(d, &cl->bstats, NULL, &cl->rate_est) < 0 ||
>  	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
>  		return -1;
>  
> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index ea4271d..4b7ca2b 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -6,6 +6,11 @@ 
 #include <linux/rtnetlink.h>
 #include <linux/pkt_sched.h>
 
+struct gnet_stats_basic_cpu {
+	struct gnet_stats_basic_packed bstats;
+	struct u64_stats_sync syncp;
+};
+
 struct gnet_dump {
 	spinlock_t *      lock;
 	struct sk_buff *  skb;
@@ -26,10 +31,17 @@  int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
 				 int tc_stats_type, int xstats_type,
 				 spinlock_t *lock, struct gnet_dump *d);
 
+int gnet_stats_copy_basic_cpu(struct gnet_dump *d,
+			      struct gnet_stats_basic_cpu __percpu *b);
+void __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
+				 struct gnet_stats_basic_cpu __percpu *b);
 int gnet_stats_copy_basic(struct gnet_dump *d,
 			  struct gnet_stats_basic_packed *b);
+void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
 			     const struct gnet_stats_basic_packed *b,
+			     const struct gnet_stats_basic_cpu __percpu *cpu_b,
 			     struct gnet_stats_rate_est64 *r);
 int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q);
 int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
@@ -37,13 +49,17 @@  int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
 int gnet_stats_finish_copy(struct gnet_dump *d);
 
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
 		      spinlock_t *stats_lock, struct nlattr *opt);
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
+			struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			struct gnet_stats_rate_est64 *rate_est);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
 			  spinlock_t *stats_lock, struct nlattr *opt);
 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
+			  const struct gnet_stats_basic_cpu __percpu *cpu_bstat,
 			  const struct gnet_stats_rate_est64 *rate_est);
 #endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 1e89b9a..54e318f 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -6,6 +6,7 @@ 
 #include <linux/rcupdate.h>
 #include <linux/pkt_sched.h>
 #include <linux/pkt_cls.h>
+#include <linux/percpu.h>
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 
@@ -57,6 +58,9 @@  struct Qdisc {
 				      * Its true for MQ/MQPRIO slaves, or non
 				      * multiqueue device.
 				      */
+#define TCQ_F_LLQDISC		0x20 /* lockless qdiscs can run without using
+				      * the qdisc lock to serialize skbs.
+				      */
 #define TCQ_F_WARN_NONWC	(1 << 16)
 	u32			limit;
 	const struct Qdisc_ops	*ops;
@@ -83,7 +87,10 @@  struct Qdisc {
 	 */
 	unsigned long		state;
 	struct sk_buff_head	q;
-	struct gnet_stats_basic_packed bstats;
+	union {
+		struct gnet_stats_basic_packed bstats;
+		struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	} bstats_qdisc;
 	unsigned int		__state;
 	struct gnet_stats_queue	qstats;
 	struct rcu_head		rcu_head;
@@ -486,7 +493,6 @@  static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
 	return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
 }
 
-
 static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
 				 const struct sk_buff *skb)
 {
@@ -494,10 +500,38 @@  static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
 	bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
 }
 
+static inline void qdisc_bstats_update_cpu(struct Qdisc *sch,
+					   const struct sk_buff *skb)
+{
+	struct gnet_stats_basic_cpu *bstats =
+				this_cpu_ptr(sch->bstats_qdisc.cpu_bstats);
+
+	u64_stats_update_begin(&bstats->syncp);
+	bstats_update(&bstats->bstats, skb);
+	u64_stats_update_end(&bstats->syncp);
+}
+
 static inline void qdisc_bstats_update(struct Qdisc *sch,
 				       const struct sk_buff *skb)
 {
-	bstats_update(&sch->bstats, skb);
+	bstats_update(&sch->bstats_qdisc.bstats, skb);
+}
+
+static inline bool qdisc_is_lockless(struct Qdisc *q)
+{
+	return q->flags & TCQ_F_LLQDISC;
+}
+
+static inline int
+qdisc_bstats_copy_qdisc(struct gnet_dump *d, struct Qdisc *q)
+{
+	int err;
+
+	if (qdisc_is_lockless(q))
+		err = gnet_stats_copy_basic_cpu(d, q->bstats_qdisc.cpu_bstats);
+	else
+		err = gnet_stats_copy_basic(d, &q->bstats_qdisc.bstats);
+	return err;
 }
 
 static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9d33dff..0d13319 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -91,6 +91,8 @@  struct gen_estimator
 	u32			avpps;
 	struct rcu_head		e_rcu;
 	struct rb_node		node;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	struct rcu_head		head;
 };
 
 struct gen_estimator_head
@@ -122,11 +124,21 @@  static void est_timer(unsigned long arg)
 
 		spin_lock(e->stats_lock);
 		read_lock(&est_lock);
-		if (e->bstats == NULL)
+
+		if (e->bstats == NULL && e->cpu_bstats == NULL)
 			goto skip;
 
-		nbytes = e->bstats->bytes;
-		npackets = e->bstats->packets;
+		if (e->cpu_bstats) {
+			struct gnet_stats_basic_packed b = {0};
+
+			__gnet_stats_copy_basic_cpu(&b, e->cpu_bstats);
+			nbytes = b.bytes;
+			npackets = b.packets;
+		} else {
+			nbytes = e->bstats->bytes;
+			npackets = e->bstats->packets;
+		}
+
 		brate = (nbytes - e->last_bytes)<<(7 - idx);
 		e->last_bytes = nbytes;
 		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
@@ -146,6 +158,11 @@  skip:
 	rcu_read_unlock();
 }
 
+static void *gen_get_bstats(struct gen_estimator *est)
+{
+	return est->cpu_bstats ? (void *)est->cpu_bstats : (void *)est->bstats;
+}
+
 static void gen_add_node(struct gen_estimator *est)
 {
 	struct rb_node **p = &est_root.rb_node, *parent = NULL;
@@ -156,7 +173,7 @@  static void gen_add_node(struct gen_estimator *est)
 		parent = *p;
 		e = rb_entry(parent, struct gen_estimator, node);
 
-		if (est->bstats > e->bstats)
+		if (gen_get_bstats(est)  > gen_get_bstats(e))
 			p = &parent->rb_right;
 		else
 			p = &parent->rb_left;
@@ -167,18 +184,20 @@  static void gen_add_node(struct gen_estimator *est)
 
 static
 struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
+				    const struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 				    const struct gnet_stats_rate_est64 *rate_est)
 {
 	struct rb_node *p = est_root.rb_node;
+	void *b = bstats ? (void *)bstats : (void *)cpu_bstats;
 
 	while (p) {
 		struct gen_estimator *e;
 
 		e = rb_entry(p, struct gen_estimator, node);
 
-		if (bstats > e->bstats)
+		if (b > gen_get_bstats(e))
 			p = p->rb_right;
-		else if (bstats < e->bstats || rate_est != e->rate_est)
+		else if (b < gen_get_bstats(e) || rate_est != e->rate_est)
 			p = p->rb_left;
 		else
 			return e;
@@ -203,6 +222,7 @@  struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  *
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
 		      spinlock_t *stats_lock,
 		      struct nlattr *opt)
@@ -221,14 +241,24 @@  int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
+	if (cpu_bstats) {
+		struct gnet_stats_basic_packed b = {0};
+
+		est->cpu_bstats = cpu_bstats;
+		__gnet_stats_copy_basic_cpu(&b, cpu_bstats);
+		est->last_bytes = b.bytes;
+		est->last_packets = b.packets;
+	} else {
+		est->bstats = bstats;
+		est->last_bytes = bstats->bytes;
+		est->last_packets = bstats->packets;
+	}
+
 	idx = parm->interval + 2;
-	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
 	est->ewma_log = parm->ewma_log;
-	est->last_bytes = bstats->bytes;
 	est->avbps = rate_est->bps<<5;
-	est->last_packets = bstats->packets;
 	est->avpps = rate_est->pps<<10;
 
 	spin_lock_bh(&est_tree_lock);
@@ -258,14 +288,14 @@  EXPORT_SYMBOL(gen_new_estimator);
  * Note : Caller should respect an RCU grace period before freeing stats_lock
  */
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
+			struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			struct gnet_stats_rate_est64 *rate_est)
 {
 	struct gen_estimator *e;
 
 	spin_lock_bh(&est_tree_lock);
-	while ((e = gen_find_node(bstats, rate_est))) {
+	while ((e = gen_find_node(bstats, cpu_bstats, rate_est))) {
 		rb_erase(&e->node, &est_root);
-
 		write_lock(&est_lock);
 		e->bstats = NULL;
 		write_unlock(&est_lock);
@@ -290,11 +320,12 @@  EXPORT_SYMBOL(gen_kill_estimator);
  * Returns 0 on success or a negative error code.
  */
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
 			  spinlock_t *stats_lock, struct nlattr *opt)
 {
-	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+	gen_kill_estimator(bstats, cpu_bstats, rate_est);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
 
@@ -306,6 +337,7 @@  EXPORT_SYMBOL(gen_replace_estimator);
  * Returns true if estimator is active, and false if not.
  */
 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
+			  const struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  const struct gnet_stats_rate_est64 *rate_est)
 {
 	bool res;
@@ -313,7 +345,7 @@  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
 	ASSERT_RTNL();
 
 	spin_lock_bh(&est_tree_lock);
-	res = gen_find_node(bstats, rate_est) != NULL;
+	res = gen_find_node(bstats, cpu_bstats, rate_est) != NULL;
 	spin_unlock_bh(&est_tree_lock);
 
 	return res;
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 2ddbce4..e43b55f 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -128,6 +128,50 @@  gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
 }
 EXPORT_SYMBOL(gnet_stats_copy_basic);
 
+void
+__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+			struct gnet_stats_basic_packed *b)
+{
+	bstats->bytes = b->bytes;
+	bstats->packets = b->packets;
+}
+EXPORT_SYMBOL(__gnet_stats_copy_basic);
+
+void
+__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
+			    struct gnet_stats_basic_cpu __percpu *b)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(b, i);
+		unsigned int start;
+		__u64 bytes;
+		__u32 packets;
+
+		do {
+			start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+			bytes = bcpu->bstats.bytes;
+			packets = bcpu->bstats.packets;
+		} while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+
+		bstats->bytes += bcpu->bstats.bytes;
+		bstats->packets += bcpu->bstats.packets;
+	}
+}
+EXPORT_SYMBOL(__gnet_stats_copy_basic_cpu);
+
+int
+gnet_stats_copy_basic_cpu(struct gnet_dump *d,
+			  struct gnet_stats_basic_cpu __percpu *b)
+{
+	struct gnet_stats_basic_packed bstats = {0};
+
+	__gnet_stats_copy_basic_cpu(&bstats, b);
+	return gnet_stats_copy_basic(d, &bstats);
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic_cpu);
+
 /**
  * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
  * @d: dumping handle
@@ -143,12 +187,13 @@  EXPORT_SYMBOL(gnet_stats_copy_basic);
 int
 gnet_stats_copy_rate_est(struct gnet_dump *d,
 			 const struct gnet_stats_basic_packed *b,
+			 const struct gnet_stats_basic_cpu __percpu *cpu_b,
 			 struct gnet_stats_rate_est64 *r)
 {
 	struct gnet_stats_rate_est est;
 	int res;
 
-	if (b && !gen_estimator_active(b, r))
+	if ((b || cpu_b) && !gen_estimator_active(b, cpu_b, r))
 		return 0;
 
 	est.bps = min_t(u64, UINT_MAX, r->bps);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 370adf6..02e2532 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -64,7 +64,7 @@  void xt_rateest_put(struct xt_rateest *est)
 	mutex_lock(&xt_rateest_mutex);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
-		gen_kill_estimator(&est->bstats, &est->rstats);
+		gen_kill_estimator(&est->bstats, NULL, &est->rstats);
 		/*
 		 * gen_estimator est_timer() might access est->lock or bstats,
 		 * wait a RCU grace period before freeing 'est'
@@ -136,7 +136,7 @@  static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 	cfg.est.interval	= info->interval;
 	cfg.est.ewma_log	= info->ewma_log;
 
-	ret = gen_new_estimator(&est->bstats, &est->rstats,
+	ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
 				&est->lock, &cfg.opt);
 	if (ret < 0)
 		goto err2;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index ae32b5b..89504ea 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -35,7 +35,7 @@  void tcf_hash_destroy(struct tc_action *a)
 	spin_lock_bh(&hinfo->lock);
 	hlist_del(&p->tcfc_head);
 	spin_unlock_bh(&hinfo->lock);
-	gen_kill_estimator(&p->tcfc_bstats,
+	gen_kill_estimator(&p->tcfc_bstats, NULL,
 			   &p->tcfc_rate_est);
 	/*
 	 * gen_estimator est_timer() might access p->tcfc_lock
@@ -228,7 +228,7 @@  void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 {
 	struct tcf_common *pc = a->priv;
 	if (est)
-		gen_kill_estimator(&pc->tcfc_bstats,
+		gen_kill_estimator(&pc->tcfc_bstats, NULL,
 				   &pc->tcfc_rate_est);
 	kfree_rcu(pc, tcfc_rcu);
 }
@@ -252,7 +252,8 @@  int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
 	p->tcfc_tm.install = jiffies;
 	p->tcfc_tm.lastuse = jiffies;
 	if (est) {
-		int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est,
+		int err = gen_new_estimator(&p->tcfc_bstats, NULL,
+					    &p->tcfc_rate_est,
 					    &p->tcfc_lock, est);
 		if (err) {
 			kfree(p);
@@ -620,7 +621,7 @@  int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 		goto errout;
 
 	if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
+	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, NULL,
 				     &p->tcfc_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0)
 		goto errout;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index f32bcb0..26f4bb3 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -178,14 +178,14 @@  override:
 
 	spin_lock_bh(&police->tcf_lock);
 	if (est) {
-		err = gen_replace_estimator(&police->tcf_bstats,
+		err = gen_replace_estimator(&police->tcf_bstats, NULL,
 					    &police->tcf_rate_est,
 					    &police->tcf_lock, est);
 		if (err)
 			goto failure_unlock;
 	} else if (tb[TCA_POLICE_AVRATE] &&
 		   (ret == ACT_P_CREATED ||
-		    !gen_estimator_active(&police->tcf_bstats,
+		    !gen_estimator_active(&police->tcf_bstats, NULL,
 					  &police->tcf_rate_est))) {
 		err = -EINVAL;
 		goto failure_unlock;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ca62483..beb2064 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -942,6 +942,13 @@  qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 	sch->handle = handle;
 
 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
+		if (qdisc_is_lockless(sch)) {
+			sch->bstats_qdisc.cpu_bstats =
+				alloc_percpu(struct gnet_stats_basic_cpu);
+			if (!sch->bstats_qdisc.cpu_bstats)
+				goto err_out4;
+		}
+
 		if (tca[TCA_STAB]) {
 			stab = qdisc_get_stab(tca[TCA_STAB]);
 			if (IS_ERR(stab)) {
@@ -964,8 +971,18 @@  qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 			else
 				root_lock = qdisc_lock(sch);
 
-			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
-						root_lock, tca[TCA_RATE]);
+			if (qdisc_is_lockless(sch))
+				err = gen_new_estimator(NULL,
+							sch->bstats_qdisc.cpu_bstats,
+							&sch->rate_est,
+							root_lock,
+							tca[TCA_RATE]);
+			else
+				err = gen_new_estimator(&sch->bstats_qdisc.bstats,
+							NULL,
+							&sch->rate_est,
+							root_lock,
+							tca[TCA_RATE]);
 			if (err)
 				goto err_out4;
 		}
@@ -1022,9 +1039,11 @@  static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 		   because change can't be undone. */
 		if (sch->flags & TCQ_F_MQROOT)
 			goto out;
-		gen_replace_estimator(&sch->bstats, &sch->rate_est,
-					    qdisc_root_sleeping_lock(sch),
-					    tca[TCA_RATE]);
+		gen_replace_estimator(&sch->bstats_qdisc.bstats,
+				      sch->bstats_qdisc.cpu_bstats,
+				      &sch->rate_est,
+				      qdisc_root_sleeping_lock(sch),
+				      tca[TCA_RATE]);
 	}
 out:
 	return 0;
@@ -1304,6 +1323,7 @@  static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	unsigned char *b = skb_tail_pointer(skb);
 	struct gnet_dump d;
 	struct qdisc_size_table *stab;
+	int err;
 
 	cond_resched();
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
@@ -1334,10 +1354,25 @@  static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
 		goto nla_put_failure;
 
-	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+	if (qdisc_is_lockless(q)) {
+		err = gnet_stats_copy_basic_cpu(&d, q->bstats_qdisc.cpu_bstats);
+		if (err < 0)
+			goto nla_put_failure;
+		err = gnet_stats_copy_rate_est(&d, NULL,
+					       q->bstats_qdisc.cpu_bstats,
+					       &q->rate_est);
+	} else {
+		err = gnet_stats_copy_basic(&d, &q->bstats_qdisc.bstats);
+		if (err < 0)
+			goto nla_put_failure;
+		err = gnet_stats_copy_rate_est(&d,
+					       &q->bstats_qdisc.bstats, NULL,
+					       &q->rate_est);
+	}
+
+	if (err < 0 ||
 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
-		goto nla_put_failure;
+			goto nla_put_failure;
 
 	if (gnet_stats_finish_copy(&d) < 0)
 		goto nla_put_failure;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index a3244a8..208074d 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1602,7 +1602,7 @@  cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 		cl->xstats.undertime = cl->undertime - q->now;
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->bstats, NULL, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
 		return -1;
 
@@ -1671,7 +1671,7 @@  static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 	tcf_destroy_chain(&cl->filter_list);
 	qdisc_destroy(cl->q);
 	qdisc_put_rtab(cl->R_tab);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
 	if (cl != &q->link)
 		kfree(cl);
 }
@@ -1759,7 +1759,8 @@  cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 		}
 
 		if (tca[TCA_RATE]) {
-			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+			err = gen_replace_estimator(&cl->bstats, NULL,
+						    &cl->rate_est,
 						    qdisc_root_sleeping_lock(sch),
 						    tca[TCA_RATE]);
 			if (err) {
@@ -1852,7 +1853,7 @@  cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 		goto failure;
 
 	if (tca[TCA_RATE]) {
-		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
 					qdisc_root_sleeping_lock(sch),
 					tca[TCA_RATE]);
 		if (err) {
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index d8b5ccf..88a65f6 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -88,7 +88,8 @@  static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 	if (cl != NULL) {
 		if (tca[TCA_RATE]) {
-			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+			err = gen_replace_estimator(&cl->bstats, NULL,
+						    &cl->rate_est,
 						    qdisc_root_sleeping_lock(sch),
 						    tca[TCA_RATE]);
 			if (err)
@@ -116,7 +117,7 @@  static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		cl->qdisc = &noop_qdisc;
 
 	if (tca[TCA_RATE]) {
-		err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+		err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
 					    qdisc_root_sleeping_lock(sch),
 					    tca[TCA_RATE]);
 		if (err) {
@@ -138,7 +139,7 @@  static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
 {
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
 }
@@ -283,7 +284,7 @@  static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	}
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->bstats, NULL, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
 		return -1;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 346ef85..e3d203e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -632,6 +632,9 @@  static void qdisc_rcu_free(struct rcu_head *head)
 {
 	struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
 
+	if (qdisc_is_lockless(qdisc))
+		free_percpu(qdisc->bstats_qdisc.cpu_bstats);
+
 	kfree((char *) qdisc - qdisc->padded);
 }
 
@@ -648,7 +651,13 @@  void qdisc_destroy(struct Qdisc *qdisc)
 
 	qdisc_put_stab(rtnl_dereference(qdisc->stab));
 #endif
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+	if (qdisc_is_lockless(qdisc))
+		gen_kill_estimator(NULL, qdisc->bstats_qdisc.cpu_bstats,
+				   &qdisc->rate_est);
+	else
+		gen_kill_estimator(&qdisc->bstats_qdisc.bstats, NULL,
+				   &qdisc->rate_est);
+
 	if (ops->reset)
 		ops->reset(qdisc);
 	if (ops->destroy)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 04b0de4..3b112d2 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1014,9 +1014,12 @@  hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		cur_time = psched_get_time();
 
 		if (tca[TCA_RATE]) {
-			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
-					      qdisc_root_sleeping_lock(sch),
-					      tca[TCA_RATE]);
+			spinlock_t *lock = qdisc_root_sleeping_lock(sch);
+
+			err = gen_replace_estimator(&cl->bstats, NULL,
+						    &cl->rate_est,
+						    lock,
+						    tca[TCA_RATE]);
 			if (err)
 				return err;
 		}
@@ -1063,7 +1066,7 @@  hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		return -ENOBUFS;
 
 	if (tca[TCA_RATE]) {
-		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
 					qdisc_root_sleeping_lock(sch),
 					tca[TCA_RATE]);
 		if (err) {
@@ -1113,7 +1116,7 @@  hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
 
 	tcf_destroy_chain(&cl->filter_list);
 	qdisc_destroy(cl->qdisc);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
 	if (cl != &q->root)
 		kfree(cl);
 }
@@ -1375,7 +1378,7 @@  hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	xstats.rtwork  = cl->cl_cumul;
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->bstats, NULL, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
 		return -1;
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 6d16b9b..8067a82 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1145,7 +1145,7 @@  htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 	cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, NULL, NULL, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
 		return -1;
 
@@ -1235,7 +1235,7 @@  static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		WARN_ON(!cl->un.leaf.q);
 		qdisc_destroy(cl->un.leaf.q);
 	}
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
 	tcf_destroy_chain(&cl->filter_list);
 	kfree(cl);
 }
@@ -1402,7 +1402,8 @@  static int htb_change_class(struct Qdisc *sch, u32 classid,
 			goto failure;
 
 		if (htb_rate_est || tca[TCA_RATE]) {
-			err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+			err = gen_new_estimator(&cl->bstats, NULL,
+						&cl->rate_est,
 						qdisc_root_sleeping_lock(sch),
 						tca[TCA_RATE] ? : &est.nla);
 			if (err) {
@@ -1464,8 +1465,11 @@  static int htb_change_class(struct Qdisc *sch, u32 classid,
 			parent->children++;
 	} else {
 		if (tca[TCA_RATE]) {
-			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
-						    qdisc_root_sleeping_lock(sch),
+			spinlock_t *lock = qdisc_root_sleeping_lock(sch);
+
+			err = gen_replace_estimator(&cl->bstats, NULL,
+						    &cl->rate_est,
+						    lock,
 						    tca[TCA_RATE]);
 			if (err)
 				return err;
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index b351125..25302be 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -65,7 +65,7 @@  static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	result = tc_classify(skb, fl, &res);
 
-	qdisc_bstats_update(sch, skb);
+	qdisc_bstats_update_cpu(sch, skb);
 	switch (result) {
 	case TC_ACT_SHOT:
 		result = TC_ACT_SHOT;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index a8b2864..e96a41f 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -98,20 +98,22 @@  static void mq_attach(struct Qdisc *sch)
 
 static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
+	struct gnet_stats_basic_packed *bstats = &sch->bstats_qdisc.bstats;
 	struct net_device *dev = qdisc_dev(sch);
 	struct Qdisc *qdisc;
 	unsigned int ntx;
 
 	sch->q.qlen = 0;
-	memset(&sch->bstats, 0, sizeof(sch->bstats));
+	memset(bstats, 0, sizeof(sch->bstats_qdisc));
 	memset(&sch->qstats, 0, sizeof(sch->qstats));
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
 		spin_lock_bh(qdisc_lock(qdisc));
 		sch->q.qlen		+= qdisc->q.qlen;
-		sch->bstats.bytes	+= qdisc->bstats.bytes;
-		sch->bstats.packets	+= qdisc->bstats.packets;
+
+		bstats->bytes		+= qdisc->bstats_qdisc.bstats.bytes;
+		bstats->packets		+= qdisc->bstats_qdisc.bstats.packets;
 		sch->qstats.qlen	+= qdisc->qstats.qlen;
 		sch->qstats.backlog	+= qdisc->qstats.backlog;
 		sch->qstats.drops	+= qdisc->qstats.drops;
@@ -201,7 +203,7 @@  static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 	sch = dev_queue->qdisc_sleeping;
 	sch->qstats.qlen = sch->q.qlen;
-	if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+	if (gnet_stats_copy_basic(d, &sch->bstats_qdisc.bstats) < 0 ||
 	    gnet_stats_copy_queue(d, &sch->qstats) < 0)
 		return -1;
 	return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 37e7d25..6e3e4e9 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -219,6 +219,7 @@  static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 
 static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
+	struct gnet_stats_basic_packed *bstats = &sch->bstats_qdisc.bstats;
 	struct net_device *dev = qdisc_dev(sch);
 	struct mqprio_sched *priv = qdisc_priv(sch);
 	unsigned char *b = skb_tail_pointer(skb);
@@ -227,15 +228,15 @@  static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	unsigned int i;
 
 	sch->q.qlen = 0;
-	memset(&sch->bstats, 0, sizeof(sch->bstats));
+	memset(bstats, 0, sizeof(sch->bstats_qdisc.bstats));
 	memset(&sch->qstats, 0, sizeof(sch->qstats));
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
 		spin_lock_bh(qdisc_lock(qdisc));
 		sch->q.qlen		+= qdisc->q.qlen;
-		sch->bstats.bytes	+= qdisc->bstats.bytes;
-		sch->bstats.packets	+= qdisc->bstats.packets;
+		bstats->bytes		+= qdisc->bstats_qdisc.bstats.bytes;
+		bstats->packets		+= qdisc->bstats_qdisc.bstats.packets;
 		sch->qstats.qlen	+= qdisc->qstats.qlen;
 		sch->qstats.backlog	+= qdisc->qstats.backlog;
 		sch->qstats.drops	+= qdisc->qstats.drops;
@@ -344,8 +345,8 @@  static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 			qdisc = rtnl_dereference(q->qdisc);
 			spin_lock_bh(qdisc_lock(qdisc));
-			bstats.bytes      += qdisc->bstats.bytes;
-			bstats.packets    += qdisc->bstats.packets;
+			bstats.bytes      += qdisc->bstats_qdisc.bstats.bytes;
+			bstats.packets    += qdisc->bstats_qdisc.bstats.packets;
 			qstats.qlen       += qdisc->qstats.qlen;
 			qstats.backlog    += qdisc->qstats.backlog;
 			qstats.drops      += qdisc->qstats.drops;
@@ -363,7 +364,7 @@  static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 		sch = dev_queue->qdisc_sleeping;
 		sch->qstats.qlen = sch->q.qlen;
-		if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+		if (gnet_stats_copy_basic(d, &sch->bstats_qdisc.bstats) < 0 ||
 		    gnet_stats_copy_queue(d, &sch->qstats) < 0)
 			return -1;
 	}
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index c0466c1..d6430102 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -361,7 +361,7 @@  static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 	cl_q = q->queues[cl - 1];
 	cl_q->qstats.qlen = cl_q->q.qlen;
-	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+	if (gnet_stats_copy_basic(d, &cl_q->bstats_qdisc.bstats) < 0 ||
 	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
 		return -1;
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 03ef99e..9069aba 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -325,7 +325,7 @@  static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 	cl_q = q->queues[cl - 1];
 	cl_q->qstats.qlen = cl_q->q.qlen;
-	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+	if (gnet_stats_copy_basic(d, &cl_q->bstats_qdisc.bstats) < 0 ||
 	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
 		return -1;
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 602ea01..52a602d 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -459,7 +459,8 @@  static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 	if (cl != NULL) { /* modify existing class */
 		if (tca[TCA_RATE]) {
-			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+			err = gen_replace_estimator(&cl->bstats, NULL,
+						    &cl->rate_est,
 						    qdisc_root_sleeping_lock(sch),
 						    tca[TCA_RATE]);
 			if (err)
@@ -484,7 +485,8 @@  static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		cl->qdisc = &noop_qdisc;
 
 	if (tca[TCA_RATE]) {
-		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+		err = gen_new_estimator(&cl->bstats, NULL,
+					&cl->rate_est,
 					qdisc_root_sleeping_lock(sch),
 					tca[TCA_RATE]);
 		if (err)
@@ -505,7 +507,7 @@  set_change_agg:
 		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
 		if (new_agg == NULL) {
 			err = -ENOBUFS;
-			gen_kill_estimator(&cl->bstats, &cl->rate_est);
+			gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
 			goto destroy_class;
 		}
 		sch_tree_lock(sch);
@@ -530,7 +532,7 @@  static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 	struct qfq_sched *q = qdisc_priv(sch);
 
 	qfq_rm_from_agg(q, cl);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->bstats, NULL, &cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
 }
@@ -668,7 +670,7 @@  static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	xstats.lmax = cl->agg->lmax;
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->bstats, NULL, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
 		return -1;