Message ID | 1321476666-8225-2-git-send-email-nhorman@tuxdriver.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Wed, 16 Nov 2011 15:51:05 -0500, Neil Horman wrote: > +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) > +{ > + struct cgroup_netprio_state *cs; > + struct net_device *dev; > + > + cs = cgrp_netprio_state(cgrp); > + rtnl_lock(); > + for_each_netdev(&init_net, dev) { > + if (dev->priomap) > + dev->priomap->priomap[cs->prioidx] = 0; > + } > + rtnl_unlock(); > + put_prioidx(cs->prioidx); > +out_free: > + kfree(cs); > +} 'out_free' is unused. > + > +static int write_priomap(struct cgroup *cgrp, struct cftype *cft, > + const char *buffer) > +{ > + char *devname = kstrdup(buffer, GFP_KERNEL); > + int ret = -EINVAL; > + u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; > + unsigned long priority; > + char *priostr; > + struct net_device *dev; > + > + devname = kstrdup(buffer, GFP_KERNEL); kstrdup() is called twice... Thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/16/2011 12:51 PM, Neil Horman wrote: > This patch adds in the infrastructure code to create the network priority > cgroup. The cgroup, in addition to the standard processes file creates two > control files: > > 1) prioidx - This is a read-only file that exports the index of this cgroup. > This is a value that is both arbitrary and unique to a cgroup in this subsystem, > and is used to index the per-device priority map > > 2) priomap - This is a writeable file. On read it reports a table of 2-tuples > <name:priority> where name is the name of a network interface and priority is > indicates the priority assigned to frames egresessing on the named interface and > originating from a pid in this cgroup > > This cgroup allows for skb priority to be set prior to a root qdisc getting > selected. This is benenficial for DCB enabled systems, in that it allows for any > application to use dcb configured priorities so without application modification > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > Signed-off-by: John Fastabend <john.r.fastabend@intel.com> > CC: Robert Love <robert.w.love@intel.com> > CC: "David S. Miller" <davem@davemloft.net> > --- > include/linux/cgroup_subsys.h | 8 + > include/linux/netdevice.h | 4 + > include/net/netprio_cgroup.h | 66 ++++++++ > include/net/sock.h | 3 + > net/Kconfig | 7 + > net/core/Makefile | 1 + > net/core/dev.c | 13 ++ > net/core/netprio_cgroup.c | 340 +++++++++++++++++++++++++++++++++++++++++ > net/core/sock.c | 22 +++- > net/socket.c | 2 + > 10 files changed, 465 insertions(+), 1 deletions(-) > create mode 100644 include/net/netprio_cgroup.h > create mode 100644 net/core/netprio_cgroup.c > Hi Neil, couple more comments. > diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h > index ac663c1..0bd390c 100644 > --- a/include/linux/cgroup_subsys.h > +++ b/include/linux/cgroup_subsys.h > @@ -59,8 +59,16 @@ SUBSYS(net_cls) > SUBSYS(blkio) > #endif > > +/* */ > + > #ifdef CONFIG_CGROUP_PERF > SUBSYS(perf) > #endif > > /* */ > + > +#ifdef CONFIG_NETPRIO_CGROUP > +SUBSYS(net_prio) > +#endif > + > +/* */ > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 0db1f5f..86e8c3f 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -50,6 +50,7 @@ > #ifdef CONFIG_DCB > #include <net/dcbnl.h> > #endif > +#include <net/netprio_cgroup.h> > > struct vlan_group; > struct netpoll_info; > @@ -1312,6 +1313,9 @@ struct net_device { > /* max exchange id for FCoE LRO by ddp */ > unsigned int fcoe_ddp_xid; > #endif > +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) > + struct netprio_map *priomap; > +#endif Any reason not to annotate this? 'struct netprio_map __rcu *priomap;' and then use rcu_dereference_protected() throughout. > /* phy device may attach itself for hardware timestamping */ > struct phy_device *phydev; > > diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h > new file mode 100644 > index 0000000..6b65936 > --- /dev/null > +++ b/include/net/netprio_cgroup.h > @@ -0,0 +1,66 @@ > +/* > + * netprio_cgroup.h Control Group Priority set > + * > + * > + * Authors: Neil Horman <nhorman@tuxdriver.com> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License as published by the Free > + * Software Foundation; either version 2 of the License, or (at your option) > + * any later version. > + * > + */ > + > +#ifndef _NETPRIO_CGROUP_H > +#define _NETPRIO_CGROUP_H > +#include <linux/module.h> > +#include <linux/cgroup.h> > +#include <linux/hardirq.h> > +#include <linux/rcupdate.h> > + > +struct cgroup_netprio_state > +{ > + struct cgroup_subsys_state css; > + u32 prioidx; > +}; > + > +struct netprio_map { > + struct rcu_head rcu; > + u32 priomap_len; > + u32 priomap[]; > +}; > + > +#ifdef CONFIG_CGROUPS > + > +#ifndef CONFIG_NETPRIO_CGROUP > +extern int net_prio_subsys_id; > +#endif > + > +extern void sock_update_netprioidx(struct sock *sk); > +extern void skb_update_prio(struct sk_buff *skb); > + > +static inline struct cgroup_netprio_state > + *task_netprio_state(struct task_struct *p) > +{ > +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) > + return container_of(task_subsys_state(p, net_prio_subsys_id), > + struct cgroup_netprio_state, css); > +#else > + return NULL; > +#endif > +} > + > +#else > + > +#define sock_update_netprioidx(sk) > +#define skb_update_prio(skb) > + > +static inline struct cgroup_netprio_state > + *task_netprio_state(struct task_struct *p) > +{ > + return NULL; > +} > + > +#endif > + > +#endif /* _NET_CLS_CGROUP_H */ > diff --git a/include/net/sock.h b/include/net/sock.h > index 5ac682f..87b24aa 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -321,6 +321,9 @@ struct sock { > unsigned short sk_ack_backlog; > unsigned short sk_max_ack_backlog; > __u32 sk_priority; > +#ifdef CONFIG_CGROUPS > + __u32 sk_cgrp_prioidx; > +#endif > struct pid *sk_peer_pid; > const struct cred *sk_peer_cred; > long sk_rcvtimeo; > diff --git a/net/Kconfig b/net/Kconfig > index a073148..63d2c5d 100644 > --- a/net/Kconfig > +++ b/net/Kconfig > @@ -232,6 +232,13 @@ config XPS > depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS > default y > > +config NETPRIO_CGROUP > + tristate "Network priority cgroup" > + depends on CGROUPS > + ---help--- > + Cgroup subsystem for use in assigning processes to network priorities on > + a per-interface basis > + > config HAVE_BPF_JIT > bool > > diff --git a/net/core/Makefile b/net/core/Makefile > index 0d357b1..3606d40 100644 > --- a/net/core/Makefile > +++ b/net/core/Makefile > @@ -19,3 +19,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o > obj-$(CONFIG_TRACEPOINTS) += net-traces.o > obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o > obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o > +obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o > diff --git a/net/core/dev.c b/net/core/dev.c > index b7ba81a..a1dca83 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -2456,6 +2456,17 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, > return rc; > } > > +#ifdef CONFIG_CGROUPS > +void skb_update_prio(struct sk_buff *skb) > +{ > + struct netprio_map *map = rcu_dereference(skb->dev->priomap); > + > + if ((!skb->priority) && (skb->sk) && map) > + skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; > +} > +EXPORT_SYMBOL_GPL(skb_update_prio); > +#endif > + > static DEFINE_PER_CPU(int, xmit_recursion); > #define RECURSION_LIMIT 10 > > @@ -2496,6 +2507,8 @@ int dev_queue_xmit(struct sk_buff *skb) > */ > rcu_read_lock_bh(); > > + skb_update_prio(skb); > + > txq = dev_pick_tx(dev, skb); > q = rcu_dereference_bh(txq->qdisc); > > diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c > new file mode 100644 > index 0000000..14e896c > --- /dev/null > +++ b/net/core/netprio_cgroup.c > @@ -0,0 +1,340 @@ > +/* > + * net/sched/cls_cgroup.c Control Group Classifier ^^^^^^^^^^^^^^^^^^ should be netprio_cgroup.c presumably. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + * > + * Authors: Thomas Graf <tgraf@suug.ch> > + */ > + > +#include <linux/module.h> > +#include <linux/slab.h> > +#include <linux/types.h> > +#include <linux/string.h> > +#include <linux/errno.h> > +#include <linux/skbuff.h> > +#include <linux/cgroup.h> > +#include <linux/rcupdate.h> > +#include <linux/atomic.h> > +#include <net/rtnetlink.h> > +#include <net/pkt_cls.h> > +#include <net/sock.h> > +#include <net/netprio_cgroup.h> > + > +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, > + struct cgroup *cgrp); > +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); > +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); > + > +struct cgroup_subsys net_prio_subsys = { > + .name = "net_prio", > + .create = cgrp_create, > + .destroy = cgrp_destroy, > + .populate = cgrp_populate, > +#ifdef CONFIG_NETPRIO_CGROUP > + .subsys_id = net_prio_subsys_id, > +#endif > + .module = THIS_MODULE > +}; > + > +#define PRIOIDX_SZ 128 > + > +static unsigned long prioidx_map[PRIOIDX_SZ]; > +static DEFINE_SPINLOCK(prioidx_map_lock); > +static atomic_t max_prioidx = ATOMIC_INIT(0); > + > +static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) > +{ > + return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), > + struct cgroup_netprio_state, css); > +} > + > +static int get_prioidx(u32 *prio) > +{ > + unsigned long flags; > + u32 prioidx; > + > + spin_lock_irqsave(&prioidx_map_lock, flags); > + prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); > + set_bit(prioidx, prioidx_map); > + spin_unlock_irqrestore(&prioidx_map_lock, flags); > + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) > + return -ENOSPC; > + > + atomic_set(&max_prioidx, prioidx); > + *prio = prioidx; > + return 0; > +} > + > +static void put_prioidx(u32 idx) > +{ > + unsigned long flags; > + spin_lock_irqsave(&prioidx_map_lock, flags); > + clear_bit(idx, prioidx_map); > + spin_unlock_irqrestore(&prioidx_map_lock, flags); > +} > + > +static void extend_netdev_table(struct net_device *dev, u32 new_len) > +{ > + size_t new_size = sizeof(struct netprio_map) + > + ((sizeof(u32) * new_len)); > + struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); > + struct netprio_map *old_priomap; > + int i; > + > + old_priomap = rcu_dereference_protected(dev->priomap, 1); This always has rtnl_lock hence the '1' so use rtnl_dereference(). > + also extra white line. > + > + if (!new_priomap) { > + printk(KERN_WARNING "Unable to alloc new priomap!\n"); > + return; > + } > + > + for (i = 0; > + dev->priomap && (i < dev->priomap->priomap_len); ^^^^^^^^^^^^ ^^^^^^^^^^^^^ probably should use old_priomap here. > + i++) > + new_priomap->priomap[i] = dev->priomap->priomap[i]; > + > + new_priomap->priomap_len = new_len; > + > + rcu_assign_pointer(dev->priomap, new_priomap); > + if (old_priomap) > + kfree_rcu(old_priomap, rcu); > + remove extra line. > +} > + > +static void update_netdev_tables(void) > +{ > + struct net_device *dev; > + u32 max_len = atomic_read(&max_prioidx); > + > + rtnl_lock(); > + > + for_each_netdev(&init_net, dev) { > + if ((!dev->priomap) || > + (dev->priomap->priomap_len < max_len)) > + extend_netdev_table(dev, max_len); With __rcu change add rtnl_dereference() > + } > + > + rtnl_unlock(); > +} > + > +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, > + struct cgroup *cgrp) > +{ > + struct cgroup_netprio_state *cs; > + int ret; > + > + cs = kzalloc(sizeof(*cs), GFP_KERNEL); > + if (!cs) > + return ERR_PTR(-ENOMEM); > + > + if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) > + return ERR_PTR(-EINVAL); > + > + ret = get_prioidx(&cs->prioidx); > + if (ret != 0) { > + printk(KERN_WARNING "No space in priority index array\n"); > + return ERR_PTR(ret); > + } > + > + return &cs->css; > +} > + > +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) > +{ > + struct cgroup_netprio_state *cs; > + struct net_device *dev; > + > + cs = cgrp_netprio_state(cgrp); > + rtnl_lock(); > + for_each_netdev(&init_net, dev) { > + if (dev->priomap) ditto rtnl_dereference() > + dev->priomap->priomap[cs->prioidx] = 0; > + } > + rtnl_unlock(); > + put_prioidx(cs->prioidx); > +out_free: > + kfree(cs); > +} > + > +static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) > +{ > + return (u64)cgrp_netprio_state(cgrp)->prioidx; > +} > + > +static int read_priomap(struct cgroup *cont, struct cftype *cft, > + struct cgroup_map_cb *cb) > +{ > + struct net_device *dev; > + u32 prioidx = cgrp_netprio_state(cont)->prioidx; > + u32 priority; > + > + /* > + * Stub until I add the per-interface priority map > + */ we have a per interface map now... so drop the comments. > + rcu_read_lock(); > + for_each_netdev_rcu(&init_net, dev) { > + priority = dev->priomap ? dev->priomap->priomap[prioidx] : 0; This really needs a rcu_dereference() AFAICT. > + cb->fill(cb, dev->name, priority); > + } > + rcu_read_unlock(); > + return 0; > +} > + > +static int write_priomap(struct cgroup *cgrp, struct cftype *cft, > + const char *buffer) > +{ > + char *devname = kstrdup(buffer, GFP_KERNEL); > + int ret = -EINVAL; > + u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; > + unsigned long priority; > + char *priostr; > + struct net_device *dev; > + > + devname = kstrdup(buffer, GFP_KERNEL); > + if (!devname) > + return -ENOMEM; > + > + /* > + * Minimally sized valid priomap string > + */ > + if (strlen(devname) < 3) > + goto out_free_devname; > + > + priostr = strstr(devname, " "); > + if (!priostr) > + goto out_free_devname; > + > + /* > + *Separate the devname from the associated priority > + *and advance the priostr poitner to the priority value > + */ > + *priostr = '\0'; > + priostr++; > + > + /* > + * If the priostr points to NULL, we're at the end of the passed > + * in string, and its not a valid write > + */ > + if (*priostr == '\0') > + goto out_free_devname; > + > + ret = kstrtoul(priostr, 10, &priority); > + if (ret < 0) > + goto out_free_devname; > + > + ret = -ENODEV; > + > + dev = dev_get_by_name(&init_net, devname); > + if (!dev) > + goto out_free_devname; > + > + update_netdev_tables(); > + ret = 0; > + if (dev->priomap) > + dev->priomap->priomap[prioidx] = priority; > + Whats protecting this? This needs a dereference and rcu_read_lock() I think. > + dev_put(dev); > + > +out_free_devname: > + kfree(devname); > + return ret; > +} > + > +static struct cftype ss_files[] = { > + { > + .name = "prioidx", > + .read_u64 = read_prioidx, > + }, > + { > + .name = "ifpriomap", > + .read_map = read_priomap, > + .write_string = write_priomap, > + }, > +}; > + > +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) > +{ > + return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); > +} > + > +static int netprio_device_event(struct notifier_block *unused, > + unsigned long event, void *ptr) > +{ > + struct net_device *dev = ptr; > + struct netprio_map *old; > + u32 max_len = atomic_read(&max_prioidx); > + > + old = rcu_dereference_protected(dev->priomap, 1); > + /* > + * Note this is called with rtnl_lock held so we have update side > + * protection on our rcu assignments > + */ > + > + switch (event) { > + > + case NETDEV_REGISTER: > + if (max_len) > + extend_netdev_table(dev, max_len); > + break; > + case NETDEV_UNREGISTER: > + rcu_assign_pointer(dev->priomap, NULL); > + if (old) > + kfree_rcu(old, rcu); > + break; > + } > + return NOTIFY_DONE; > +} > + > +static struct notifier_block netprio_device_notifier = { > + .notifier_call = netprio_device_event > +}; > + > +static int __init init_cgroup_netprio(void) > +{ > + int ret; > + > + ret = cgroup_load_subsys(&net_prio_subsys); > + if (ret) > + goto out; > +#ifndef CONFIG_NETPRIO_CGROUP > + smp_wmb(); > + net_prio_subsys_id = net_prio_subsys.subsys_id; > +#endif > + > + register_netdevice_notifier(&netprio_device_notifier); > + > +out: > + return ret; > +} > + > +static void __exit exit_cgroup_netprio(void) > +{ > + struct netprio_map *old; > + struct net_device *dev; > + > + unregister_netdevice_notifier(&netprio_device_notifier); > + > + cgroup_unload_subsys(&net_prio_subsys); > + > +#ifndef CONFIG_NETPRIO_CGROUP > + net_prio_subsys_id = -1; > + synchronize_rcu(); > +#endif > + > + rtnl_lock(); > + for_each_netdev(&init_net, dev) { > + old = dev->priomap; rtnl_dereference() here also if adding __rcu > + rcu_assign_pointer(dev->priomap, NULL); > + if (old) > + kfree_rcu(old, rcu); > + } > + rtnl_unlock(); > +} > + > +module_init(init_cgroup_netprio); > +module_exit(exit_cgroup_netprio); > +MODULE_LICENSE("GPL v2"); > diff --git a/net/core/sock.c b/net/core/sock.c > index 5a08762..77a4888 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -125,6 +125,7 @@ > #include <net/xfrm.h> > #include <linux/ipsec.h> > #include <net/cls_cgroup.h> > +#include <net/netprio_cgroup.h> > > #include <linux/filter.h> > > @@ -221,10 +222,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; > int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); > EXPORT_SYMBOL(sysctl_optmem_max); > > -#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) > +#if defined(CONFIG_CGROUPS) > +#if !defined(CONFIG_NET_CLS_CGROUP) > int net_cls_subsys_id = -1; > EXPORT_SYMBOL_GPL(net_cls_subsys_id); > #endif > +#if !defined(CONFIG_NETPRIO_CGROUP) > +int net_prio_subsys_id = -1; > +EXPORT_SYMBOL_GPL(net_prio_subsys_id); > +#endif > +#endif > > static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) > { > @@ -1111,6 +1118,18 @@ void sock_update_classid(struct sock *sk) > sk->sk_classid = classid; > } > EXPORT_SYMBOL(sock_update_classid); > + > +void sock_update_netprioidx(struct sock *sk) > +{ > + struct cgroup_netprio_state *state; > + if (in_interrupt()) > + return; > + rcu_read_lock(); > + state = task_netprio_state(current); > + sk->sk_cgrp_prioidx = state ? state->prioidx : 0; > + rcu_read_unlock(); > +} > +EXPORT_SYMBOL_GPL(sock_update_netprioidx); > #endif > > /** > @@ -1138,6 +1157,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, > atomic_set(&sk->sk_wmem_alloc, 1); > > sock_update_classid(sk); > + sock_update_netprioidx(sk); > } > > return sk; > diff --git a/net/socket.c b/net/socket.c > index 2877647..108716f 100644 > --- a/net/socket.c > +++ b/net/socket.c > @@ -549,6 +549,8 @@ static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, > > sock_update_classid(sock->sk); > > + sock_update_netprioidx(sock->sk); > + > si->sock = sock; > si->scm = NULL; > si->msg = msg; > -- > 1.7.6.4 > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Nov 17, 2011 at 03:25:58AM -0800, John Fastabend wrote: > On 11/16/2011 12:51 PM, Neil Horman wrote: > > This patch adds in the infrastructure code to create the network priority > > cgroup. The cgroup, in addition to the standard processes file creates two > > control files: > > > > 1) prioidx - This is a read-only file that exports the index of this cgroup. > > This is a value that is both arbitrary and unique to a cgroup in this subsystem, > > and is used to index the per-device priority map > > > > 2) priomap - This is a writeable file. On read it reports a table of 2-tuples > > <name:priority> where name is the name of a network interface and priority is > > indicates the priority assigned to frames egresessing on the named interface and > > originating from a pid in this cgroup > > > > This cgroup allows for skb priority to be set prior to a root qdisc getting > > selected. This is benenficial for DCB enabled systems, in that it allows for any > > application to use dcb configured priorities so without application modification > > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > > Signed-off-by: John Fastabend <john.r.fastabend@intel.com> > > CC: Robert Love <robert.w.love@intel.com> > > CC: "David S. Miller" <davem@davemloft.net> > > --- > > include/linux/cgroup_subsys.h | 8 + > > include/linux/netdevice.h | 4 + > > include/net/netprio_cgroup.h | 66 ++++++++ > > include/net/sock.h | 3 + > > net/Kconfig | 7 + > > net/core/Makefile | 1 + > > net/core/dev.c | 13 ++ > > net/core/netprio_cgroup.c | 340 +++++++++++++++++++++++++++++++++++++++++ > > net/core/sock.c | 22 +++- > > net/socket.c | 2 + > > 10 files changed, 465 insertions(+), 1 deletions(-) > > create mode 100644 include/net/netprio_cgroup.h > > create mode 100644 net/core/netprio_cgroup.c > > > > Hi Neil, > > couple more comments. > Thanks John, I'll fix these points up and post a v2 later today. Neil > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ac663c1..0bd390c 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -59,8 +59,16 @@ SUBSYS(net_cls) SUBSYS(blkio) #endif +/* */ + #ifdef CONFIG_CGROUP_PERF SUBSYS(perf) #endif /* */ + +#ifdef CONFIG_NETPRIO_CGROUP +SUBSYS(net_prio) +#endif + +/* */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0db1f5f..86e8c3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -50,6 +50,7 @@ #ifdef CONFIG_DCB #include <net/dcbnl.h> #endif +#include <net/netprio_cgroup.h> struct vlan_group; struct netpoll_info; @@ -1312,6 +1313,9 @@ struct net_device { /* max exchange id for FCoE LRO by ddp */ unsigned int fcoe_ddp_xid; #endif +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) + struct netprio_map *priomap; +#endif /* phy device may attach itself for hardware timestamping */ struct phy_device *phydev; diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h new file mode 100644 index 0000000..6b65936 --- /dev/null +++ b/include/net/netprio_cgroup.h @@ -0,0 +1,66 @@ +/* + * netprio_cgroup.h Control Group Priority set + * + * + * Authors: Neil Horman <nhorman@tuxdriver.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#ifndef _NETPRIO_CGROUP_H +#define _NETPRIO_CGROUP_H +#include <linux/module.h> +#include <linux/cgroup.h> +#include <linux/hardirq.h> +#include <linux/rcupdate.h> + +struct cgroup_netprio_state +{ + struct cgroup_subsys_state css; + u32 prioidx; +}; + +struct netprio_map { + struct rcu_head rcu; + u32 priomap_len; + u32 priomap[]; +}; + +#ifdef CONFIG_CGROUPS + +#ifndef CONFIG_NETPRIO_CGROUP +extern int net_prio_subsys_id; +#endif + +extern void sock_update_netprioidx(struct sock *sk); +extern void skb_update_prio(struct sk_buff *skb); + +static inline struct cgroup_netprio_state + *task_netprio_state(struct task_struct *p) +{ +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) + return container_of(task_subsys_state(p, net_prio_subsys_id), + struct cgroup_netprio_state, css); +#else + return NULL; +#endif +} + +#else + +#define sock_update_netprioidx(sk) +#define skb_update_prio(skb) + +static inline struct cgroup_netprio_state + *task_netprio_state(struct task_struct *p) +{ + return NULL; +} + +#endif + +#endif /* _NET_CLS_CGROUP_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 5ac682f..87b24aa 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -321,6 +321,9 @@ struct sock { unsigned short sk_ack_backlog; unsigned short sk_max_ack_backlog; __u32 sk_priority; +#ifdef CONFIG_CGROUPS + __u32 sk_cgrp_prioidx; +#endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; diff --git a/net/Kconfig b/net/Kconfig index a073148..63d2c5d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -232,6 +232,13 @@ config XPS depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS default y +config NETPRIO_CGROUP + tristate "Network priority cgroup" + depends on CGROUPS + ---help--- + Cgroup subsystem for use in assigning processes to network priorities on + a per-interface basis + config HAVE_BPF_JIT bool diff --git a/net/core/Makefile b/net/core/Makefile index 0d357b1..3606d40 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o +obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o diff --git a/net/core/dev.c b/net/core/dev.c index b7ba81a..a1dca83 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2456,6 +2456,17 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +#ifdef CONFIG_CGROUPS +void skb_update_prio(struct sk_buff *skb) +{ + struct netprio_map *map = rcu_dereference(skb->dev->priomap); + + if ((!skb->priority) && (skb->sk) && map) + skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; +} +EXPORT_SYMBOL_GPL(skb_update_prio); +#endif + static DEFINE_PER_CPU(int, xmit_recursion); #define RECURSION_LIMIT 10 @@ -2496,6 +2507,8 @@ int dev_queue_xmit(struct sk_buff *skb) */ rcu_read_lock_bh(); + skb_update_prio(skb); + txq = dev_pick_tx(dev, skb); q = rcu_dereference_bh(txq->qdisc); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c new file mode 100644 index 0000000..14e896c --- /dev/null +++ b/net/core/netprio_cgroup.c @@ -0,0 +1,340 @@ +/* + * net/sched/cls_cgroup.c Control Group Classifier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/cgroup.h> +#include <linux/rcupdate.h> +#include <linux/atomic.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/netprio_cgroup.h> + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, + struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); + +struct cgroup_subsys net_prio_subsys = { + .name = "net_prio", + .create = cgrp_create, + .destroy = cgrp_destroy, + .populate = cgrp_populate, +#ifdef CONFIG_NETPRIO_CGROUP + .subsys_id = net_prio_subsys_id, +#endif + .module = THIS_MODULE +}; + +#define PRIOIDX_SZ 128 + +static unsigned long prioidx_map[PRIOIDX_SZ]; +static DEFINE_SPINLOCK(prioidx_map_lock); +static atomic_t max_prioidx = ATOMIC_INIT(0); + +static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), + struct cgroup_netprio_state, css); +} + +static int get_prioidx(u32 *prio) +{ + unsigned long flags; + u32 prioidx; + + spin_lock_irqsave(&prioidx_map_lock, flags); + prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); + set_bit(prioidx, prioidx_map); + spin_unlock_irqrestore(&prioidx_map_lock, flags); + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) + return -ENOSPC; + + atomic_set(&max_prioidx, prioidx); + *prio = prioidx; + return 0; +} + +static void put_prioidx(u32 idx) +{ + unsigned long flags; + spin_lock_irqsave(&prioidx_map_lock, flags); + clear_bit(idx, prioidx_map); + spin_unlock_irqrestore(&prioidx_map_lock, flags); +} + +static void extend_netdev_table(struct net_device *dev, u32 new_len) +{ + size_t new_size = sizeof(struct netprio_map) + + ((sizeof(u32) * new_len)); + struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); + struct netprio_map *old_priomap; + int i; + + old_priomap = rcu_dereference_protected(dev->priomap, 1); + + + if (!new_priomap) { + printk(KERN_WARNING "Unable to alloc new priomap!\n"); + return; + } + + for (i = 0; + dev->priomap && (i < dev->priomap->priomap_len); + i++) + new_priomap->priomap[i] = dev->priomap->priomap[i]; + + new_priomap->priomap_len = new_len; + + rcu_assign_pointer(dev->priomap, new_priomap); + if (old_priomap) + kfree_rcu(old_priomap, rcu); + +} + +static void update_netdev_tables(void) +{ + struct net_device *dev; + u32 max_len = atomic_read(&max_prioidx); + + rtnl_lock(); + + for_each_netdev(&init_net, dev) { + if ((!dev->priomap) || + (dev->priomap->priomap_len < max_len)) + extend_netdev_table(dev, max_len); + } + + rtnl_unlock(); +} + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, + struct cgroup *cgrp) +{ + struct cgroup_netprio_state *cs; + int ret; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) + return ERR_PTR(-EINVAL); + + ret = get_prioidx(&cs->prioidx); + if (ret != 0) { + printk(KERN_WARNING "No space in priority index array\n"); + return ERR_PTR(ret); + } + + return &cs->css; +} + +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cgroup_netprio_state *cs; + struct net_device *dev; + + cs = cgrp_netprio_state(cgrp); + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if (dev->priomap) + dev->priomap->priomap[cs->prioidx] = 0; + } + rtnl_unlock(); + put_prioidx(cs->prioidx); +out_free: + kfree(cs); +} + +static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +{ + return (u64)cgrp_netprio_state(cgrp)->prioidx; +} + +static int read_priomap(struct cgroup *cont, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct net_device *dev; + u32 prioidx = cgrp_netprio_state(cont)->prioidx; + u32 priority; + + /* + * Stub until I add the per-interface priority map + */ + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + priority = dev->priomap ? dev->priomap->priomap[prioidx] : 0; + cb->fill(cb, dev->name, priority); + } + rcu_read_unlock(); + return 0; +} + +static int write_priomap(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + char *devname = kstrdup(buffer, GFP_KERNEL); + int ret = -EINVAL; + u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; + unsigned long priority; + char *priostr; + struct net_device *dev; + + devname = kstrdup(buffer, GFP_KERNEL); + if (!devname) + return -ENOMEM; + + /* + * Minimally sized valid priomap string + */ + if (strlen(devname) < 3) + goto out_free_devname; + + priostr = strstr(devname, " "); + if (!priostr) + goto out_free_devname; + + /* + *Separate the devname from the associated priority + *and advance the priostr poitner to the priority value + */ + *priostr = '\0'; + priostr++; + + /* + * If the priostr points to NULL, we're at the end of the passed + * in string, and its not a valid write + */ + if (*priostr == '\0') + goto out_free_devname; + + ret = kstrtoul(priostr, 10, &priority); + if (ret < 0) + goto out_free_devname; + + ret = -ENODEV; + + dev = dev_get_by_name(&init_net, devname); + if (!dev) + goto out_free_devname; + + update_netdev_tables(); + ret = 0; + if (dev->priomap) + dev->priomap->priomap[prioidx] = priority; + + dev_put(dev); + +out_free_devname: + kfree(devname); + return ret; +} + +static struct cftype ss_files[] = { + { + .name = "prioidx", + .read_u64 = read_prioidx, + }, + { + .name = "ifpriomap", + .read_map = read_priomap, + .write_string = write_priomap, + }, +}; + +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); +} + +static int netprio_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct netprio_map *old; + u32 max_len = atomic_read(&max_prioidx); + + old = rcu_dereference_protected(dev->priomap, 1); + /* + * Note this is called with rtnl_lock held so we have update side + * protection on our rcu assignments + */ + + switch (event) { + + case NETDEV_REGISTER: + if (max_len) + extend_netdev_table(dev, max_len); + break; + case NETDEV_UNREGISTER: + rcu_assign_pointer(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block netprio_device_notifier = { + .notifier_call = netprio_device_event +}; + +static int __init init_cgroup_netprio(void) +{ + int ret; + + ret = cgroup_load_subsys(&net_prio_subsys); + if (ret) + goto out; +#ifndef CONFIG_NETPRIO_CGROUP + smp_wmb(); + net_prio_subsys_id = net_prio_subsys.subsys_id; +#endif + + register_netdevice_notifier(&netprio_device_notifier); + +out: + return ret; +} + +static void __exit exit_cgroup_netprio(void) +{ + struct netprio_map *old; + struct net_device *dev; + + unregister_netdevice_notifier(&netprio_device_notifier); + + cgroup_unload_subsys(&net_prio_subsys); + +#ifndef CONFIG_NETPRIO_CGROUP + net_prio_subsys_id = -1; + synchronize_rcu(); +#endif + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + old = dev->priomap; + rcu_assign_pointer(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + } + rtnl_unlock(); +} + +module_init(init_cgroup_netprio); +module_exit(exit_cgroup_netprio); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/sock.c b/net/core/sock.c index 5a08762..77a4888 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -125,6 +125,7 @@ #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> +#include <net/netprio_cgroup.h> #include <linux/filter.h> @@ -221,10 +222,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); -#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) +#if defined(CONFIG_CGROUPS) +#if !defined(CONFIG_NET_CLS_CGROUP) int net_cls_subsys_id = -1; EXPORT_SYMBOL_GPL(net_cls_subsys_id); #endif +#if !defined(CONFIG_NETPRIO_CGROUP) +int net_prio_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_prio_subsys_id); +#endif +#endif static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { @@ -1111,6 +1118,18 @@ void sock_update_classid(struct sock *sk) sk->sk_classid = classid; } EXPORT_SYMBOL(sock_update_classid); + +void sock_update_netprioidx(struct sock *sk) +{ + struct cgroup_netprio_state *state; + if (in_interrupt()) + return; + rcu_read_lock(); + state = task_netprio_state(current); + sk->sk_cgrp_prioidx = state ? state->prioidx : 0; + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif /** @@ -1138,6 +1157,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); + sock_update_netprioidx(sk); } return sk; diff --git a/net/socket.c b/net/socket.c index 2877647..108716f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -549,6 +549,8 @@ static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, sock_update_classid(sock->sk); + sock_update_netprioidx(sock->sk); + si->sock = sock; si->scm = NULL; si->msg = msg;