Message ID | 1279077475-2956-1-git-send-email-xiaosuo@gmail.com |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
Le mercredi 14 juillet 2010 à 11:17 +0800, Changli Gao a écrit : > I want to know if I can assign the sk to skb as nf_tproxy_core does to avoid > the duplicate search later. Thanks. > I suggest we first correct bugs before adding new features. For me, tproxy is a high suspect at this moment, I would not use it on production machine. > act_cpu: packet distributing > > This TC action can be used to redirect packets to the CPU: > * specified by the cpuid option > * specified by the class minor ID obtained previously > * on which the corresponding application runs > > It supports the similar functions of RPS and RFS, but is more flexible. > Thins kind of claims is disgusting. No documentation, I have no idea how you setup the thing. RPS still misses documentation, but activating it is easy. > Signed-off-by: Changli Gao <xiaosuo@gmail.com> > ---- > include/linux/netdevice.h | 2 > include/linux/tc_act/tc_cpu.h | 31 +++++ > include/net/sock.h | 24 +++ > include/net/tc_act/tc_cpu.h | 18 ++ > net/core/dev.c | 4 > net/core/sock.c | 1 > net/sched/Kconfig | 12 + > net/sched/Makefile | 1 > net/sched/act_cpu.c | 260 ++++++++++++++++++++++++++++++++++++++++++ > 9 files changed, 350 insertions(+), 3 deletions(-) > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index c4fedf0..318d422 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -1435,6 +1435,8 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd, > > DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); > > +int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail); > + > #define HAVE_NETIF_QUEUE > > extern void __netif_schedule(struct Qdisc *q); > diff --git a/include/linux/tc_act/tc_cpu.h b/include/linux/tc_act/tc_cpu.h > new file mode 100644 > index 0000000..2704607 > --- /dev/null > +++ b/include/linux/tc_act/tc_cpu.h > @@ -0,0 +1,31 @@ > +#ifndef __LINUX_TC_CPU_H > +#define __LINUX_TC_CPU_H > + > +#include <linux/pkt_cls.h> > 
+#include <linux/types.h> > + > +#define TCA_ACT_CPU 12 > + > +enum { > + TCA_CPU_UNSPEC, > + TCA_CPU_PARMS, > + TCA_CPU_TM, > + __TCA_CPU_MAX > +}; > +#define TCA_CPU_MAX (__TCA_CPU_MAX - 1) > + > +enum { > + TCA_CPU_TYPE_MAP, > + TCA_CPU_TYPE_CPUID, > + TCA_CPU_TYPE_SOCKET, > + __TCA_CPU_TYPE_MAX > +}; > +#define TCA_CPU_TYPE_MAX (__TCA_CPU_TYPE_MAX - 1) > + > +struct tc_cpu { > + tc_gen; > + __u32 type; > + __u32 value; > +}; > + > +#endif /* __LINUX_TC_CPU_H */ > diff --git a/include/net/sock.h b/include/net/sock.h > index 3100e71..7913158 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -200,6 +200,7 @@ struct sock_common { > * @sk_rcvtimeo: %SO_RCVTIMEO setting > * @sk_sndtimeo: %SO_SNDTIMEO setting > * @sk_rxhash: flow hash received from netif layer > + * @sk_cpu: the CPU on which the corresponding process works. > * @sk_filter: socket filtering instructions > * @sk_protinfo: private area, net family specific, when not using slab > * @sk_timer: sock cleanup timer > @@ -284,6 +285,9 @@ struct sock { > #ifdef CONFIG_RPS > __u32 sk_rxhash; > #endif > +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE) > + int sk_cpu; > +#endif > unsigned long sk_flags; > unsigned long sk_lingertime; > struct sk_buff_head sk_error_queue; > @@ -639,7 +643,24 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) > return sk->sk_backlog_rcv(sk, skb); > } > > -static inline void sock_rps_record_flow(const struct sock *sk) > +static inline void sock_reset_cpu(struct sock *sk) > +{ > +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE) > + sk->sk_cpu = nr_cpumask_bits; > +#endif > +} > + > +static inline void sock_save_cpu(struct sock *sk) > +{ > +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE) > + int cpu = get_cpu(); > + if (sk->sk_cpu != cpu) > + sk->sk_cpu = cpu; > + put_cpu(); sk->sk_cpu = raw_smp_processor_id(); > +#endif > +} > + > +static inline void sock_rps_record_flow(struct sock 
*sk) > { > #ifdef CONFIG_RPS > struct rps_sock_flow_table *sock_flow_table; > @@ -649,6 +670,7 @@ static inline void sock_rps_record_flow(const struct sock *sk) > rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); > rcu_read_unlock(); > #endif > + sock_save_cpu(sk); > } > > static inline void sock_rps_reset_flow(const struct sock *sk) > diff --git a/include/net/tc_act/tc_cpu.h b/include/net/tc_act/tc_cpu.h > new file mode 100644 > index 0000000..0504bf0 > --- /dev/null > +++ b/include/net/tc_act/tc_cpu.h > @@ -0,0 +1,18 @@ > +#ifndef __NET_TC_CPU_H > +#define __NET_TC_CPU_H > + > +#include <linux/types.h> > +#include <net/act_api.h> > + > +struct tcf_cpu { > + struct tcf_common common; > + u32 type; > + u32 value; > +}; > + > +static inline struct tcf_cpu *to_tcf_cpu(struct tcf_common *pc) > +{ > + return container_of(pc, struct tcf_cpu, common); > +} > + > +#endif /* __NET_TC_CPU_H */ > diff --git a/net/core/dev.c b/net/core/dev.c > index e2b9fa2..45e8a21 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -2443,8 +2443,7 @@ static int rps_ipi_queued(struct softnet_data *sd) > * enqueue_to_backlog is called to queue an skb to a per CPU backlog > * queue (may be a remote CPU queue). 
> */ > -static int enqueue_to_backlog(struct sk_buff *skb, int cpu, > - unsigned int *qtail) > +int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) > { > struct softnet_data *sd; > unsigned long flags; > @@ -2482,6 +2481,7 @@ enqueue: > kfree_skb(skb); > return NET_RX_DROP; > } > +EXPORT_SYMBOL(enqueue_to_backlog); > > /** > * netif_rx - post buffer to the network code > diff --git a/net/core/sock.c b/net/core/sock.c > index 363bc26..7a71e76 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -1045,6 +1045,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, > if (!try_module_get(prot->owner)) > goto out_free_sec; > sk_tx_queue_clear(sk); > + sock_reset_cpu(sk); > } > > return sk; > diff --git a/net/sched/Kconfig b/net/sched/Kconfig > index 2f691fb..a826758 100644 > --- a/net/sched/Kconfig > +++ b/net/sched/Kconfig > @@ -427,6 +427,18 @@ config NET_CLS_ACT > A recent version of the iproute2 package is required to use > extended matches. > > +config NET_ACT_CPU > + tristate "Packet distributing" > + depends on NET_CLS_ACT > + depends on RPS > + ---help--- > + Say Y here to do packets distributing. With it, you can distribute > + the packets among the CPUs on the system in any way as you like. It > + only works in ingress. > + > + To compile this code as a module, choose M here: the > + module will be called act_cpu. 
> + > config NET_ACT_POLICE > tristate "Traffic Policing" > depends on NET_CLS_ACT > diff --git a/net/sched/Makefile b/net/sched/Makefile > index f14e71b..a9e0b96 100644 > --- a/net/sched/Makefile > +++ b/net/sched/Makefile > @@ -7,6 +7,7 @@ obj-y := sch_generic.o sch_mq.o > obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o > obj-$(CONFIG_NET_CLS) += cls_api.o > obj-$(CONFIG_NET_CLS_ACT) += act_api.o > +obj-$(CONFIG_NET_ACT_CPU) += act_cpu.o > obj-$(CONFIG_NET_ACT_POLICE) += act_police.o > obj-$(CONFIG_NET_ACT_GACT) += act_gact.o > obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o > diff --git a/net/sched/act_cpu.c b/net/sched/act_cpu.c > new file mode 100644 > index 0000000..6b7d7fc > --- /dev/null > +++ b/net/sched/act_cpu.c > @@ -0,0 +1,260 @@ > +/* > + * Packet distributing actions > + * > + * Copyright (c) 2010- Changli Gao <xiaosuo@gmail.com> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License as published by the Free > + * Software Foundation; either version 2 of the License, or (at your option) > + * any later version. 
> + */ > + > +#include <linux/types.h> > +#include <linux/kernel.h> > +#include <linux/string.h> > +#include <linux/errno.h> > +#include <linux/skbuff.h> > +#include <linux/rtnetlink.h> > +#include <linux/module.h> > +#include <linux/init.h> > +#include <linux/gfp.h> > +#include <net/net_namespace.h> > +#include <net/netlink.h> > +#include <net/pkt_sched.h> > +#include <net/tcp.h> > +#include <net/udp.h> > +#include <linux/tc_act/tc_cpu.h> > +#include <net/tc_act/tc_cpu.h> > + > +#define CPU_TAB_MASK 15 > +static struct tcf_common *tcf_cpu_ht[CPU_TAB_MASK + 1]; > +static u32 cpu_idx_gen; > +static DEFINE_RWLOCK(cpu_lock); > + > +static struct tcf_hashinfo cpu_hash_info = { > + .htab = tcf_cpu_ht, > + .hmask = CPU_TAB_MASK, > + .lock = &cpu_lock > +}; > + > +static const struct nla_policy cpu_policy[TCA_CPU_MAX + 1] = { > + [TCA_CPU_PARMS] = { .len = sizeof(struct tc_cpu) }, > +}; > + > +static int tcf_cpu_init(struct nlattr *nla, struct nlattr *est, > + struct tc_action *a, int ovr, int bind) > +{ > + struct nlattr *tb[TCA_CPU_MAX + 1]; > + struct tc_cpu *parm; > + struct tcf_cpu *p; > + struct tcf_common *pc; > + int ret; > + > + if (nla == NULL) > + return -EINVAL; > + ret = nla_parse_nested(tb, TCA_CPU_MAX, nla, cpu_policy); > + if (ret < 0) > + return ret; > + if (tb[TCA_CPU_PARMS] == NULL) > + return -EINVAL; > + parm = nla_data(tb[TCA_CPU_PARMS]); > + if (parm->type > TCA_CPU_TYPE_MAX || parm->action != TC_ACT_STOLEN) > + return -EINVAL; > + > + pc = tcf_hash_check(parm->index, a, bind, &cpu_hash_info); > + if (!pc) { > + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, > + &cpu_idx_gen, &cpu_hash_info); > + if (IS_ERR(pc)) > + return PTR_ERR(pc); > + ret = ACT_P_CREATED; > + } else { > + if (!ovr) { > + tcf_hash_release(pc, bind, &cpu_hash_info); > + return -EEXIST; > + } > + ret = 0; > + } > + p = to_tcf_cpu(pc); > + > + spin_lock_bh(&p->tcf_lock); > + p->type = parm->type; > + p->value = parm->value; > + p->tcf_action = parm->action; > + 
spin_unlock_bh(&p->tcf_lock); > + > + if (ret == ACT_P_CREATED) > + tcf_hash_insert(pc, &cpu_hash_info); > + > + return ret; > +} > + > +static int tcf_cpu_cleanup(struct tc_action *a, int bind) > +{ > + struct tcf_cpu *p = a->priv; > + > + return tcf_hash_release(&p->common, bind, &cpu_hash_info); > +} > + > +static int tcf_cpu_from_sock(struct sk_buff *skb) > +{ > + struct iphdr *iph; > + struct sock *sk; > + struct { > + __be16 source, dest; > + } *ports; > + int cpu; > + > + if (skb->dev == NULL || skb->protocol != __constant_htons(ETH_P_IP)) > + goto err; > + if (!pskb_may_pull(skb, sizeof(*iph))) > + goto err; > + iph = (struct iphdr *) skb->data; > + if (!pskb_may_pull(skb, iph->ihl * 4 + 4)) > + goto err; > + ports = (void *) (skb->data + iph->ihl * 4); Why doing the search again, in case skb->sk already set by another module before you, like tproxy ? > + switch (iph->protocol) { > + case IPPROTO_TCP: > + sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, > + ports->source, iph->daddr, ports->dest, > + skb->dev->ifindex); > + break; > + case IPPROTO_UDP: > + sk = udp4_lib_lookup(dev_net(skb->dev), iph->saddr, > + ports->source, iph->daddr, ports->dest, > + skb->dev->ifindex); > + break; > + default: > + goto err; > + } > + > + if (!sk) > + goto err; > + cpu = sk->sk_cpu; > + if (sk->sk_protocol == IPPROTO_TCP && sk->sk_state == TCP_TIME_WAIT) > + inet_twsk_put(inet_twsk(sk)); > + else > + sock_put(sk); > + > + return cpu; > + > +err: > + return smp_processor_id(); > +} > + > +static int tcf_cpu(struct sk_buff *skb, struct tc_action *a, > + struct tcf_result *res) > +{ > + struct tcf_cpu *p = a->priv; > + u32 type; > + u32 value; > + int cpu, action; > + struct sk_buff *nskb; > + unsigned int qtail; > + > + spin_lock(&p->tcf_lock); Ok, the big lock... We have a lockless TCP/UDP input path, and this modules adds a lock again. 
> + p->tcf_tm.lastuse = jiffies; > + p->tcf_bstats.bytes += qdisc_pkt_len(skb); > + p->tcf_bstats.packets++; > + type = p->type; > + value = p->value; > + action = p->tcf_action; > + spin_unlock(&p->tcf_lock); > + Please change all this crap (legacy crap, copied from other tc modules), by modern one, using RCU and no locking in hot path. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jul 14, 2010 at 11:41 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > Le mercredi 14 juillet 2010 à 11:17 +0800, Changli Gao a écrit : >> I want to know if I can assign the sk to skb as nf_tproxy_core does to avoid >> the duplicate search later. Thanks. >> > > I suggest we first correct bugs before adding new features. > > For me, tproxy is a high suspect at this moment, I would not use it on > production machine. > >> act_cpu: packet distributing >> >> This TC action can be used to redirect packets to the CPU: >> * specified by the cpuid option >> * specified by the class minor ID obtained previously >> * on which the corresponding application runs >> >> It supports the similar functions of RPS and RFS, but is more flexible. >> > > Thins kind of claims is disgusting. > > No documentation, I have no idea how you setup the thing. for example: redirect packets to the CPU on which the corresponding application runs tc qdisc add dev eth0 ingress tc filter add dev eth0 parent ffff:0 protocol ip basic action cpu socket redirect packets to special CPU. tc filter add dev eth0 parent ffff:0 protocol ip basic action cpu cpuid 1 sport hash packet distributing among CPU 0-1. tc filter add dev eth0 parent ffff:0 protocol ip handle 1 flow hash keys proto-src divisor 2 action cpu map 1 >> +static inline void sock_save_cpu(struct sock *sk) >> +{ >> +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE) >> + int cpu = get_cpu(); >> + if (sk->sk_cpu != cpu) >> + sk->sk_cpu = cpu; >> + put_cpu(); > > sk->sk_cpu = raw_smp_processor_id(); Thanks. > > Why doing the search again, in case skb->sk already set by another > module before you, like tproxy ? > Although it is unlikely skb->sk is non null, as tc is before netfilter, I will handle this case. Thanks. 
>> +static int tcf_cpu(struct sk_buff *skb, struct tc_action *a, >> + struct tcf_result *res) >> +{ >> + struct tcf_cpu *p = a->priv; >> + u32 type; >> + u32 value; >> + int cpu, action; >> + struct sk_buff *nskb; >> + unsigned int qtail; >> + >> + spin_lock(&p->tcf_lock); > > Ok, the big lock... > > We have a lockless TCP/UDP input path, and this modules adds a lock > again. > >> + p->tcf_tm.lastuse = jiffies; >> + p->tcf_bstats.bytes += qdisc_pkt_len(skb); >> + p->tcf_bstats.packets++; >> + type = p->type; >> + value = p->value; >> + action = p->tcf_action; >> + spin_unlock(&p->tcf_lock); >> + > > Please change all this crap (legacy crap, copied from other tc > modules), by modern one, using RCU and no locking in hot path. > Thanks, I'll try. It is a write critical section, and for me it is difficult to convert this lock to RCU. Could you show me some examples? Thanks.
Le mercredi 14 juillet 2010 à 12:17 +0800, Changli Gao a écrit : > On Wed, Jul 14, 2010 at 11:41 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > > > > Why doing the search again, in case skb->sk already set by another > > module before you, like tproxy ? > > > > Although it is unlikely skb->sk is non null, as tc is before > netfilter, I will handle this case. Thanks. > In this case, provide the skb->sk already set case on tproxy, not in act_cpu. But doing it on both will give a hint for future modules... > >> +static int tcf_cpu(struct sk_buff *skb, struct tc_action *a, > >> + struct tcf_result *res) > >> +{ > >> + struct tcf_cpu *p = a->priv; > >> + u32 type; > >> + u32 value; > >> + int cpu, action; > >> + struct sk_buff *nskb; > >> + unsigned int qtail; > >> + > >> + spin_lock(&p->tcf_lock); > > > > Ok, the big lock... > > > > We have a lockless TCP/UDP input path, and this modules adds a lock > > again. > > > >> + p->tcf_tm.lastuse = jiffies; > >> + p->tcf_bstats.bytes += qdisc_pkt_len(skb); > >> + p->tcf_bstats.packets++; > >> + type = p->type; > >> + value = p->value; > >> + action = p->tcf_action; > >> + spin_unlock(&p->tcf_lock); > >> + > > > > Please change all this crap (legacy crap, copied from other tc > > modules), by modern one, using RCU and no locking in hot path. > > > > Thanks, I'll try. It is a write critical section, and for me it is > difficult to convert this lock to RCU. Could you show me some > examples? We can convert bytes/packets stats to percpu stats for example. That might need a generic change. Then, a normal RCU protection should be enough to fetch type/value/action fields. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2010-07-14 at 12:17 +0800, Changli Gao wrote: > Thanks, I'll try. It is a write critical section, and for me it is > difficult to convert this lock to RCU. Could you show me some > examples? RCU may be a little trickier here Eric. Actions could be shared i.e. example, it is possible to have a policer action restricting rates for a group of flows across multiple netdevices etc. Since action stats get written to by different CPUs concurrently. It could be probably done if one was to implement per-cpu stats which get summed-up when user space asks. cheers, jamal -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le jeudi 15 juillet 2010 à 08:48 -0400, jamal a écrit : > On Wed, 2010-07-14 at 12:17 +0800, Changli Gao wrote: > > > Thanks, I'll try. It is a write critical section, and for me it is > > difficult to convert this lock to RCU. Could you show me some > > examples? > > RCU maybe a little trickier here Eric. Actions could be shared i.e. > example, it is possible to have a policer action restricting rates for a > group of flows across multiple netdevices etc. Since action stats get > written to by different CPUs concurrently. It could be probably done if > one was to implement per-cpu stats which get summed-up when user space > asks. It's certainly tricky, but is act_cpu useful in its current shape, based on an infrastructure that had to use a lock because of exact rates/accounting ? I dont understand how distributing packets to different cpus, if going through a central lock can be an improvement. Changli patches are most of the time not documented, and no performance data is provided. Even if we solve this locking problem, using percpu variables, act_cpu hits another problem : The socket refcount, taken by the 'master' cpu, and released by the consumer cpu. RFS provides sort of a lazy flow-based distribution without central lock or cache line ping pongs. Why Changli dont use this, we dont know yet. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jul 16, 2010 at 12:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > Le jeudi 15 juillet 2010 à 08:48 -0400, jamal a écrit : >> >> RCU maybe a little trickier here Eric. Actions could be shared i.e. >> example, it is possible to have a policer action restricting rates for a >> group of flows across multiple netdevices etc. Since action stats get >> written to by different CPUs concurrently. It could be probably done if >> one was to implement per-cpu stats which get summed-up when user space >> asks. > > It's certainly tricky, but is act_cpu useful in its current shape, based > on an infrastructure that had to use a lock because of exact > rates/accounting ? > > I dont understand how distributing packets to different cpus, if going > through a central lock can be an improvement. Changli patches are most > of the time not documented, and no performance data is provided. > The tcf_lock is per-instance, so it should not be an issue here if the corresponding NIC isn't a multiqueue NIC or the instance is per-rx-queue. I agree the performance data is necessary and I'll publish it in the formal patch. > Even if we solve this locking problem, using percpu variables, act_cpu > hits another problem : > > The socket refcount, taken by the 'master' cpu, and released by the > consumer cpu. This is why I asked if I can assign sk to skb. > > RFS provides sort of a lazy flow-based distribution without central lock > or cache line ping pongs. Why Changli dont use this, we dont know yet. > The fact is I don't know how to do this. I'll work on this issue later. Thanks.
Le vendredi 16 juillet 2010 à 07:30 +0800, Changli Gao a écrit : > On Fri, Jul 16, 2010 at 12:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > > Even if we solve this locking problem, using percpu variables, act_cpu > > hits another problem : > > > > The socket refcount, taken by the 'master' cpu, and released by the > > consumer cpu. > This is why I asked if I can assign sk to skb. > This assignement doesnt change the cache line ping pong problem. Let me explain for you : You lookup the TCP or UDP socket. This automatically takes a refcount on it (atomic operation on sk->sk_refcnt). Then you assign sk to skb. (skb->sk = sk) Then you transfert skb handling to another cpu (IPI cost already in RPS as we all know) This remote cpu finds skb->sk (and avoids the lookup/refcnt) and handles the packet through TCP/UDP stack. This remote cpu _releases_ the socket refcount. Conclusion : "The socket refcount, taken by the 'master' cpu, and released by the consumer cpu." -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2010-07-16 at 07:30 +0800, Changli Gao wrote: > The tcf_lock is per-instance, so I should not an issue here if the > corresponding NIC isn't an multiqueue NIC or the instance is > per-rx-queue. It has more to do with config. Actions can be explicitly pointed to using the index parameter. Example for configuring two flows using an aggregate rate with a policer instance 1: tc filter add dev eth0 ... u32 flow1 action police blah index 1 tc filter add dev eth2 .... u32 flow2 action police blah index 1 If you dont configure it such that multiple tc flows share the same action as above, then likely the same cpu will always grab the lock. It is still costly as Eric points out - but i will be very curious to find out the results. > I agree > the performance data is necessary and I'll publish it in the formal > patch. If you do this, please compare against rps and non-rps and then i feel motivated to volunteer to create a setup (not in July unfortunately) to do a similar test on your code and compare against rps on a nehalem-with-shitty-sky2 that i can get time on. cheers, jamal -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c4fedf0..318d422 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1435,6 +1435,8 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd, DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail); + #define HAVE_NETIF_QUEUE extern void __netif_schedule(struct Qdisc *q); diff --git a/include/linux/tc_act/tc_cpu.h b/include/linux/tc_act/tc_cpu.h new file mode 100644 index 0000000..2704607 --- /dev/null +++ b/include/linux/tc_act/tc_cpu.h @@ -0,0 +1,31 @@ +#ifndef __LINUX_TC_CPU_H +#define __LINUX_TC_CPU_H + +#include <linux/pkt_cls.h> +#include <linux/types.h> + +#define TCA_ACT_CPU 12 + +enum { + TCA_CPU_UNSPEC, + TCA_CPU_PARMS, + TCA_CPU_TM, + __TCA_CPU_MAX +}; +#define TCA_CPU_MAX (__TCA_CPU_MAX - 1) + +enum { + TCA_CPU_TYPE_MAP, + TCA_CPU_TYPE_CPUID, + TCA_CPU_TYPE_SOCKET, + __TCA_CPU_TYPE_MAX +}; +#define TCA_CPU_TYPE_MAX (__TCA_CPU_TYPE_MAX - 1) + +struct tc_cpu { + tc_gen; + __u32 type; + __u32 value; +}; + +#endif /* __LINUX_TC_CPU_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 3100e71..7913158 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -200,6 +200,7 @@ struct sock_common { * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_rxhash: flow hash received from netif layer + * @sk_cpu: the CPU on which the corresponding process works. 
* @sk_filter: socket filtering instructions * @sk_protinfo: private area, net family specific, when not using slab * @sk_timer: sock cleanup timer @@ -284,6 +285,9 @@ struct sock { #ifdef CONFIG_RPS __u32 sk_rxhash; #endif +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE) + int sk_cpu; +#endif unsigned long sk_flags; unsigned long sk_lingertime; struct sk_buff_head sk_error_queue; @@ -639,7 +643,24 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) return sk->sk_backlog_rcv(sk, skb); } -static inline void sock_rps_record_flow(const struct sock *sk) +static inline void sock_reset_cpu(struct sock *sk) +{ +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE) + sk->sk_cpu = nr_cpumask_bits; +#endif +} + +static inline void sock_save_cpu(struct sock *sk) +{ +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE) + int cpu = get_cpu(); + if (sk->sk_cpu != cpu) + sk->sk_cpu = cpu; + put_cpu(); +#endif +} + +static inline void sock_rps_record_flow(struct sock *sk) { #ifdef CONFIG_RPS struct rps_sock_flow_table *sock_flow_table; @@ -649,6 +670,7 @@ static inline void sock_rps_record_flow(const struct sock *sk) rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); rcu_read_unlock(); #endif + sock_save_cpu(sk); } static inline void sock_rps_reset_flow(const struct sock *sk) diff --git a/include/net/tc_act/tc_cpu.h b/include/net/tc_act/tc_cpu.h new file mode 100644 index 0000000..0504bf0 --- /dev/null +++ b/include/net/tc_act/tc_cpu.h @@ -0,0 +1,18 @@ +#ifndef __NET_TC_CPU_H +#define __NET_TC_CPU_H + +#include <linux/types.h> +#include <net/act_api.h> + +struct tcf_cpu { + struct tcf_common common; + u32 type; + u32 value; +}; + +static inline struct tcf_cpu *to_tcf_cpu(struct tcf_common *pc) +{ + return container_of(pc, struct tcf_cpu, common); +} + +#endif /* __NET_TC_CPU_H */ diff --git a/net/core/dev.c b/net/core/dev.c index e2b9fa2..45e8a21 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ 
-2443,8 +2443,7 @@ static int rps_ipi_queued(struct softnet_data *sd) * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ -static int enqueue_to_backlog(struct sk_buff *skb, int cpu, - unsigned int *qtail) +int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { struct softnet_data *sd; unsigned long flags; @@ -2482,6 +2481,7 @@ enqueue: kfree_skb(skb); return NET_RX_DROP; } +EXPORT_SYMBOL(enqueue_to_backlog); /** * netif_rx - post buffer to the network code diff --git a/net/core/sock.c b/net/core/sock.c index 363bc26..7a71e76 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1045,6 +1045,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); + sock_reset_cpu(sk); } return sk; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2f691fb..a826758 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -427,6 +427,18 @@ config NET_CLS_ACT A recent version of the iproute2 package is required to use extended matches. +config NET_ACT_CPU + tristate "Packet distributing" + depends on NET_CLS_ACT + depends on RPS + ---help--- + Say Y here to do packets distributing. With it, you can distribute + the packets among the CPUs on the system in any way as you like. It + only works in ingress. + + To compile this code as a module, choose M here: the + module will be called act_cpu. 
+ config NET_ACT_POLICE tristate "Traffic Policing" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index f14e71b..a9e0b96 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -7,6 +7,7 @@ obj-y := sch_generic.o sch_mq.o obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o +obj-$(CONFIG_NET_ACT_CPU) += act_cpu.o obj-$(CONFIG_NET_ACT_POLICE) += act_police.o obj-$(CONFIG_NET_ACT_GACT) += act_gact.o obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o diff --git a/net/sched/act_cpu.c b/net/sched/act_cpu.c new file mode 100644 index 0000000..6b7d7fc --- /dev/null +++ b/net/sched/act_cpu.c @@ -0,0 +1,260 @@ +/* + * Packet distributing actions + * + * Copyright (c) 2010- Changli Gao <xiaosuo@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/gfp.h> +#include <net/net_namespace.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/tc_act/tc_cpu.h> +#include <net/tc_act/tc_cpu.h> + +#define CPU_TAB_MASK 15 +static struct tcf_common *tcf_cpu_ht[CPU_TAB_MASK + 1]; +static u32 cpu_idx_gen; +static DEFINE_RWLOCK(cpu_lock); + +static struct tcf_hashinfo cpu_hash_info = { + .htab = tcf_cpu_ht, + .hmask = CPU_TAB_MASK, + .lock = &cpu_lock +}; + +static const struct nla_policy cpu_policy[TCA_CPU_MAX + 1] = { + [TCA_CPU_PARMS] = { .len = sizeof(struct tc_cpu) }, +}; + +static int tcf_cpu_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_CPU_MAX + 1]; + struct tc_cpu *parm; + struct tcf_cpu *p; + struct tcf_common *pc; + int ret; + + if (nla == NULL) + return -EINVAL; + ret = nla_parse_nested(tb, TCA_CPU_MAX, nla, cpu_policy); + if (ret < 0) + return ret; + if (tb[TCA_CPU_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_CPU_PARMS]); + if (parm->type > TCA_CPU_TYPE_MAX || parm->action != TC_ACT_STOLEN) + return -EINVAL; + + pc = tcf_hash_check(parm->index, a, bind, &cpu_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, + &cpu_idx_gen, &cpu_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_hash_release(pc, bind, &cpu_hash_info); + return -EEXIST; + } + ret = 0; + } + p = to_tcf_cpu(pc); + + spin_lock_bh(&p->tcf_lock); + p->type = parm->type; + p->value = parm->value; + p->tcf_action = parm->action; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &cpu_hash_info); + + return ret; +} + +static int tcf_cpu_cleanup(struct 
tc_action *a, int bind) +{ + struct tcf_cpu *p = a->priv; + + return tcf_hash_release(&p->common, bind, &cpu_hash_info); +} + +static int tcf_cpu_from_sock(struct sk_buff *skb) +{ + struct iphdr *iph; + struct sock *sk; + struct { + __be16 source, dest; + } *ports; + int cpu; + + if (skb->dev == NULL || skb->protocol != __constant_htons(ETH_P_IP)) + goto err; + if (!pskb_may_pull(skb, sizeof(*iph))) + goto err; + iph = (struct iphdr *) skb->data; + if (!pskb_may_pull(skb, iph->ihl * 4 + 4)) + goto err; + ports = (void *) (skb->data + iph->ihl * 4); + switch (iph->protocol) { + case IPPROTO_TCP: + sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, + ports->source, iph->daddr, ports->dest, + skb->dev->ifindex); + break; + case IPPROTO_UDP: + sk = udp4_lib_lookup(dev_net(skb->dev), iph->saddr, + ports->source, iph->daddr, ports->dest, + skb->dev->ifindex); + break; + default: + goto err; + } + + if (!sk) + goto err; + cpu = sk->sk_cpu; + if (sk->sk_protocol == IPPROTO_TCP && sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put(inet_twsk(sk)); + else + sock_put(sk); + + return cpu; + +err: + return smp_processor_id(); +} + +static int tcf_cpu(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_cpu *p = a->priv; + u32 type; + u32 value; + int cpu, action; + struct sk_buff *nskb; + unsigned int qtail; + + spin_lock(&p->tcf_lock); + p->tcf_tm.lastuse = jiffies; + p->tcf_bstats.bytes += qdisc_pkt_len(skb); + p->tcf_bstats.packets++; + type = p->type; + value = p->value; + action = p->tcf_action; + spin_unlock(&p->tcf_lock); + + if (G_TC_AT(skb->tc_verd) & AT_EGRESS) { + if (net_ratelimit()) + pr_notice("act_cpu only works in ingress!\n"); + goto drop; + } + + nskb = skb_act_clone(skb, GFP_ATOMIC, action); + if (nskb == NULL) + goto drop; + nskb->tc_verd = 0; + nskb->tc_verd = SET_TC_NCLS(nskb->tc_verd); + + switch (type) { + case TCA_CPU_TYPE_MAP: + cpu = TC_H_MIN(res->classid) - value; + break; + case TCA_CPU_TYPE_CPUID: + cpu = 
value; + break; + case TCA_CPU_TYPE_SOCKET: + cpu = tcf_cpu_from_sock(nskb); + break; + default: + kfree_skb(nskb); + goto drop; + } + if (cpu >= nr_cpumask_bits || !cpu_online(cpu)) + cpu = smp_processor_id(); + + if (enqueue_to_backlog(nskb, cpu, &qtail) != NET_RX_SUCCESS) + goto drop; + + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_cpu_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_cpu *p = a->priv; + struct tc_cpu opt; + struct tcf_t t; + + opt.index = p->tcf_index; + opt.action = p->tcf_action; + opt.refcnt = p->tcf_refcnt - ref; + opt.bindcnt = p->tcf_bindcnt - bind; + opt.type = p->type; + opt.value = p->value; + NLA_PUT(skb, TCA_CPU_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + NLA_PUT(skb, TCA_CPU_TM, sizeof(t), &t); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_cpu_ops = { + .kind = "cpu", + .hinfo = &cpu_hash_info, + .type = TCA_ACT_CPU, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_cpu, + .dump = tcf_cpu_dump, + .cleanup = tcf_cpu_cleanup, + .lookup = tcf_hash_search, + .init = tcf_cpu_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Changli Gao <xiaosuo@gmail.com>"); +MODULE_DESCRIPTION("Packet distributing actions"); +MODULE_LICENSE("GPL"); + +static int __init cpu_init_module(void) +{ + return tcf_register_action(&act_cpu_ops); +} + +static void __exit cpu_cleanup_module(void) +{ + tcf_unregister_action(&act_cpu_ops); +} + +module_init(cpu_init_module); +module_exit(cpu_cleanup_module);
I want to know if I can assign the sk to skb as nf_tproxy_core does to avoid the duplicate search later. Thanks. act_cpu: packet distributing This TC action can be used to redirect packets to the CPU: * specified by the cpuid option * specified by the class minor ID obtained previously * on which the corresponding application runs It supports the similar functions of RPS and RFS, but is more flexible. Signed-off-by: Changli Gao <xiaosuo@gmail.com> ---- include/linux/netdevice.h | 2 include/linux/tc_act/tc_cpu.h | 31 +++++ include/net/sock.h | 24 +++ include/net/tc_act/tc_cpu.h | 18 ++ net/core/dev.c | 4 net/core/sock.c | 1 net/sched/Kconfig | 12 + net/sched/Makefile | 1 net/sched/act_cpu.c | 260 ++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 350 insertions(+), 3 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html