Message ID | 1271022140-3917-1-git-send-email-xiaosuo@gmail.com |
---|---|
State | Rejected, archived |
Delegated to: | David Miller |
Headers | show |
Le lundi 12 avril 2010 à 05:42 +0800, Changli Gao a écrit : > add rps flow director support > > with rps flow director, users can do weighted packet dispatching among CPUs. > For example, CPU0:CPU1 is 1:3 for eth0's rx-0: > > localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows > localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0 > localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1 > localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2 > localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3 > > Signed-off-by: Changli Gao <xiaosuo@gmail.com> > ---- Changli I am a bit disappointed to find so many bugs in your patch. I believe this is over engineering at this stage, we yet have to get some benches or real world results. Plus it conflicts with the much more interesting upcoming stuff (RFS). You name this patch 'flow director', to get our attention, but it's an old idea of you, to get different weights on cpus, that RPS is not yet able to perform. Maybe this is the reason you forgot to CC Tom Herbert (and me) ? Consider now : 1) echo 65000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0 possible crash, dereferencing a smaller cpumap. 2) echo 3000000000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0 probable crash, because of overflow in RPS_MAP_SIZE(flows) 3) How can rps_flow_attribute & rps_flow_attribute_size be static (one instance for whole kernel), if your intent is to have a per rxqueue attributes ? (/sys/class/net/eth0/queues/rx-0/ ...). Or the first lines of update_rps_flow_files() are completely wrong... echo 10 > /sys/class/net/eth0/queues/rx-0/rps_flows echo 2 > /sys/class/net/eth1/queues/rx-0/rps_flows cat /sys/class/net/eth0/queues/rx-0/rps_flow_9 4) Lack of atomic changes of the RPS flows -> many packet reordering can occur. 5) Many possible memory leaks in update_rps_flow_files(), you obviously were very lazy. 
We try to build a bug-free kernel, not only a 'cool kernel', and if you are lazy, your patches won't be accepted.
old_map->len : 0; > + int map_len = map ? map->len : 0; > + > + if (old_map_len >= map_len) { > + for (i = map_len; i < old_map_len; i++) > + sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr); Removing attributes for this rxqueue, while anothe might need them ? > + return 0; > + } > + > + if (map_len > rps_flow_attribute_size) { > + struct rx_queue_attribute **attrs; > + char name[sizeof("rps_flow_4294967295")]; > + char *pname; > + > + attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *), > + GFP_KERNEL); > + if (attrs == NULL) > + return -ENOMEM; > + rps_flow_attribute = attrs; > + for (i = rps_flow_attribute_size; i < map_len; i++) { > + rps_flow_attribute[i] = kmalloc(sizeof(**attrs), > + GFP_KERNEL); > + if (rps_flow_attribute[i] == NULL) > + break; > + sprintf(name, "rps_flow_%d", i); > + pname = kstrdup(name, GFP_KERNEL); > + if (pname == NULL) { > + kfree(rps_flow_attribute[i]); > + break; > + } > + rps_flow_attribute[i]->attr.name = pname; > + rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR; > + rps_flow_attribute[i]->show = show_rps_flow; > + rps_flow_attribute[i]->store = store_rps_flow; > + } > + rps_flow_attribute_size = i; > + if (i != map_len) > + return -ENOMEM; > + } > + > + for (i = old_map_len; i < map_len; i++) { > + if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) { > + while (--i >= old_map_len) > + sysfs_remove_file(kobj, > + &rps_flow_attribute[i]->attr); No changes to rps_flow_atribute_size ? 
> + return -ENOMEM; > + } > + } > + > + return 0; > +} > + > static ssize_t show_rps_map(struct netdev_rx_queue *queue, > struct rx_queue_attribute *attribute, char *buf) > { > @@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, > struct rps_map *old_map, *map; > cpumask_var_t mask; > int err, cpu, i; > - static DEFINE_SPINLOCK(rps_map_lock); > > if (!capable(CAP_NET_ADMIN)) > return -EPERM; > @@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, > map = NULL; > } > > - spin_lock(&rps_map_lock); > + mutex_lock(&rps_map_lock); > old_map = queue->rps_map; > - rcu_assign_pointer(queue->rps_map, map); > - spin_unlock(&rps_map_lock); > + err = update_rps_flow_files(&queue->kobj, old_map, map); > + if (!err) > + rcu_assign_pointer(queue->rps_map, map); > + mutex_unlock(&rps_map_lock); > + > + if (err) > + return err; > > if (old_map) > call_rcu(&old_map->rcu, rps_map_release); > @@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, > static struct rx_queue_attribute rps_cpus_attribute = > __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); > > +static ssize_t show_rps_flows(struct netdev_rx_queue *queue, > + struct rx_queue_attribute *attribute, char *buf) > +{ > + struct rps_map *map; > + unsigned int len; > + > + rcu_read_lock(); > + map = rcu_dereference(queue->rps_map); > + len = map ? map->len : 0; > + rcu_read_unlock(); > + return sprintf(buf, "%u\n", len); > +} > + > +static ssize_t store_rps_flows(struct netdev_rx_queue *queue, > + struct rx_queue_attribute *attribute, > + const char *buf, size_t len) > +{ > + struct rps_map *old_map, *map; > + unsigned long flows; > + int err; > + > + if (!capable(CAP_NET_ADMIN)) > + return -EPERM; > + > + if (strict_strtoul(buf, 0, &flows)) > + return -EINVAL; Are you aware RPS_MAP_SIZE(0x80000000) can overflow ? 
> + if (flows != 0) { > + map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows), > + L1_CACHE_BYTES), GFP_KERNEL); > + if (map == NULL) > + return -ENOMEM; > + map->len = flows; > + } else { > + map = NULL; > + } > + > + mutex_lock(&rps_map_lock); > + old_map = queue->rps_map; > + err = update_rps_flow_files(&queue->kobj, old_map, map); > + if (!err) { > + if (old_map && map) > + memcpy(map->cpus, old_map->cpus, > + sizeof(map->cpus[0]) * > + min_t(unsigned int, flows, old_map->len)); > + rcu_assign_pointer(queue->rps_map, map); > + } > + mutex_unlock(&rps_map_lock); > + > + if (err) > + return err; > + > + if (old_map) > + call_rcu(&old_map->rcu, rps_map_release); > + > + return len; > +} > + > +static struct rx_queue_attribute rps_flows_attribute = > + __ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows); > + > static struct attribute *rx_queue_default_attrs[] = { > &rps_cpus_attribute.attr, > + &rps_flows_attribute.attr, > NULL > }; > > -- -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 12, 2010 at 12:05 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > Le lundi 12 avril 2010 à 05:42 +0800, Changli Gao a écrit : > > Changli > > I am a bit disappointed to find so many bugs in your patch. :(. Thanks for your patiently review. > > I believe this is over engineering at this stage, we yet have to get > some benches or real world results. OK. Leave this patch here for testing. And I'll post benchmarking after I do. > > Plus it conflicts with the much more interesting upcoming stuff (RFS). > You name this patch 'flow director', to get our attention, but it's an > old idea of you, to get different weights on cpus, that RPS is not yet > able to perform. > I don't think it conflicts with RFS. RFS is for applications, and RPS flow director is for firewalls and routers. Because softirqs aren't under scheduler control, in order to do softirq load balancing, we have to scheduler flows internally. At the same time, RPS flow director exposes more than the old design, and keeps the interface rps_cpus there. > Maybe this is the reason you forgot to CC Tom Herbert (and me) ? > Sorry for forgetting adding you and Tom to CC list. > Consider now : > > 1) echo 65000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0 > possible crash, dereferencing a smaller cpumap. Yea, I supposed cpu_online() would check cpu was valid or not. After checking the code, I found that you were right. OK, I'll add this sanity check. if (cpu >= nr_cpumask_bits) return -EINVAL; > > 2) echo 3000000000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0 > probable crash, because of overflow in RPS_MAP_SIZE(flows) did you mean rps_flows? Yea, RPS_MAP_SIZE may overflow. Maybe I need check the upper boundary. if (flows > USHORT_MAX) return -EINVAL; > > 3) How can rps_flow_attribute & rps_flow_attribute_size be static (one > instance for whole kernel), if your intent is to have a per rxqueue > attributes ? (/sys/class/net/eth0/queues/rx-0/ ...). 
Or the first lines > of update_rps_flow_files() are completely wrong... > > echo 10 > /sys/class/net/eth0/queues/rx-0/rps_flows > echo 2 > /sys/class/net/eth1/queues/rx-0/rps_flows > cat /sys/class/net/eth0/queues/rx-0/rps_flow_9 Yea, each rxqueue has his own attributes. rps_flow_attribute is much like rps_cpus_attribute. As sysfs_create_file() and sysfs_remove_file() don't modify them, and only use them constantly, we can make them static for all the rxqueues. > > 4) Lack of atomic changes of the RPS flows -> many packet reordering can > occur. Yea, if you do packet dispatching dynamically. I don't know how to avoid packet reordering totally. If OOO doesn't occur frequently, it isn't a real problem. > > 5) Many possible memory leaks in update_rps_flow_files(), you obviously > were very lazy. We try to build a bug-free kernel, not only a 'cool > kernel', and if you are lazy, your patches wont be accepted. > Yea, I was lazy, and didn't shrink the rps_flow_attribute. I'll add reference counters for rps_flow_attributes to trace them usage, and free them when they aren't needed.
On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote: > add rps flow director support > > with rps flow director, users can do weighted packet dispatching among CPUs. > For example, CPU0:CPU1 is 1:3 for eth0's rx-0: > "Flow director" is a misnomer here in that it has no per flow awareness, that is what RFS provides. Please use a different name. > localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows > localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0 > localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1 > localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2 > localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3 > It might be better to put this in its own directory and also do it per CPU instead of hash entry. This should result in a lot fewer entries and I'm not sure how you would deal with holes in the hash table for unspecified entries. Also, it would be nice not to have to specify a number of entries. Maybe something like: localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0 localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1 To specify CPU 0 with weight 1, CPU 1 with weight 3. 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com> > ---- > net/core/net-sysfs.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 172 insertions(+), 4 deletions(-) > diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c > index 1e7fdd6..d904610 100644 > --- a/net/core/net-sysfs.c > +++ b/net/core/net-sysfs.c > @@ -511,6 +511,109 @@ static struct sysfs_ops rx_queue_sysfs_ops = { > .store = rx_queue_attr_store, > }; > > +static DEFINE_MUTEX(rps_map_lock); > + > +static ssize_t show_rps_flow(struct netdev_rx_queue *queue, > + struct rx_queue_attribute *attribute, char *buf) > +{ > + unsigned long flowid; > + struct rps_map *map; > + u16 cpu; > + > + strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid); > + rcu_read_lock(); > + map = rcu_dereference(queue->rps_map); > + if (map && flowid < map->len) > + cpu = map->cpus[flowid]; > + else > + cpu = 0; > + rcu_read_unlock(); > + return sprintf(buf, "%hu\n", cpu); > +} > + > +static ssize_t store_rps_flow(struct netdev_rx_queue *queue, > + struct rx_queue_attribute *attribute, > + const char *buf, size_t len) > +{ > + unsigned long flowid, cpu; > + struct rps_map *map; > + > + if (!capable(CAP_NET_ADMIN)) > + return -EPERM; > + > + if (strict_strtoul(buf, 0, &cpu)) > + return -EINVAL; > + strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid); > + > + mutex_lock(&rps_map_lock); > + map = queue->rps_map; > + if (map && flowid < map->len) > + map->cpus[flowid] = cpu; > + mutex_unlock(&rps_map_lock); > + > + return len; > +} > + > +static struct rx_queue_attribute **rps_flow_attribute; > +static int rps_flow_attribute_size; > + > +/* must be called with rps_map_lock locked */ > +static int update_rps_flow_files(struct kobject *kobj, > + struct rps_map *old_map, struct rps_map *map) > +{ > + int i; > + int old_map_len = old_map ? old_map->len : 0; > + int map_len = map ? 
map->len : 0; > + > + if (old_map_len >= map_len) { > + for (i = map_len; i < old_map_len; i++) > + sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr); > + return 0; > + } > + > + if (map_len > rps_flow_attribute_size) { > + struct rx_queue_attribute **attrs; > + char name[sizeof("rps_flow_4294967295")]; > + char *pname; > + > + attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *), > + GFP_KERNEL); > + if (attrs == NULL) > + return -ENOMEM; > + rps_flow_attribute = attrs; > + for (i = rps_flow_attribute_size; i < map_len; i++) { > + rps_flow_attribute[i] = kmalloc(sizeof(**attrs), > + GFP_KERNEL); > + if (rps_flow_attribute[i] == NULL) > + break; > + sprintf(name, "rps_flow_%d", i); > + pname = kstrdup(name, GFP_KERNEL); > + if (pname == NULL) { > + kfree(rps_flow_attribute[i]); > + break; > + } > + rps_flow_attribute[i]->attr.name = pname; > + rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR; > + rps_flow_attribute[i]->show = show_rps_flow; > + rps_flow_attribute[i]->store = store_rps_flow; > + } > + rps_flow_attribute_size = i; > + if (i != map_len) > + return -ENOMEM; > + } > + > + for (i = old_map_len; i < map_len; i++) { > + if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) { > + while (--i >= old_map_len) > + sysfs_remove_file(kobj, > + &rps_flow_attribute[i]->attr); > + return -ENOMEM; > + } > + } > + > + return 0; > +} > + > static ssize_t show_rps_map(struct netdev_rx_queue *queue, > struct rx_queue_attribute *attribute, char *buf) > { > @@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, > struct rps_map *old_map, *map; > cpumask_var_t mask; > int err, cpu, i; > - static DEFINE_SPINLOCK(rps_map_lock); > > if (!capable(CAP_NET_ADMIN)) > return -EPERM; > @@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, > map = NULL; > } > > - spin_lock(&rps_map_lock); > + mutex_lock(&rps_map_lock); > old_map = queue->rps_map; > - rcu_assign_pointer(queue->rps_map, map); > - spin_unlock(&rps_map_lock); > 
+ err = update_rps_flow_files(&queue->kobj, old_map, map); > + if (!err) > + rcu_assign_pointer(queue->rps_map, map); > + mutex_unlock(&rps_map_lock); > + > + if (err) > + return err; > > if (old_map) > call_rcu(&old_map->rcu, rps_map_release); > @@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, > static struct rx_queue_attribute rps_cpus_attribute = > __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); > > +static ssize_t show_rps_flows(struct netdev_rx_queue *queue, > + struct rx_queue_attribute *attribute, char *buf) > +{ > + struct rps_map *map; > + unsigned int len; > + > + rcu_read_lock(); > + map = rcu_dereference(queue->rps_map); > + len = map ? map->len : 0; > + rcu_read_unlock(); > + return sprintf(buf, "%u\n", len); > +} > + > +static ssize_t store_rps_flows(struct netdev_rx_queue *queue, > + struct rx_queue_attribute *attribute, > + const char *buf, size_t len) > +{ > + struct rps_map *old_map, *map; > + unsigned long flows; > + int err; > + > + if (!capable(CAP_NET_ADMIN)) > + return -EPERM; > + > + if (strict_strtoul(buf, 0, &flows)) > + return -EINVAL; > + if (flows != 0) { > + map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows), > + L1_CACHE_BYTES), GFP_KERNEL); > + if (map == NULL) > + return -ENOMEM; > + map->len = flows; > + } else { > + map = NULL; > + } > + > + mutex_lock(&rps_map_lock); > + old_map = queue->rps_map; > + err = update_rps_flow_files(&queue->kobj, old_map, map); > + if (!err) { > + if (old_map && map) > + memcpy(map->cpus, old_map->cpus, > + sizeof(map->cpus[0]) * > + min_t(unsigned int, flows, old_map->len)); > + rcu_assign_pointer(queue->rps_map, map); > + } > + mutex_unlock(&rps_map_lock); > + > + if (err) > + return err; > + > + if (old_map) > + call_rcu(&old_map->rcu, rps_map_release); > + > + return len; > +} > + > +static struct rx_queue_attribute rps_flows_attribute = > + __ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows); > + > static struct attribute 
*rx_queue_default_attrs[] = { > &rps_cpus_attribute.attr, > + &rps_flows_attribute.attr, > NULL > }; > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 12, 2010 at 9:34 PM, Tom Herbert <therbert@google.com> wrote: > On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote: >> add rps flow director support >> >> with rps flow director, users can do weighted packet dispatching among CPUs. >> For example, CPU0:CPU1 is 1:3 for eth0's rx-0: >> > "Flow director" is a misnomer here in that it has no per flow > awareness, that is what RFS provides. Please use a different name. Flow here is a bundle of flow, not the original meaning. How about "rps_buckets" and "rps_bucket_x"? > >> localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows >> localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0 >> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1 >> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2 >> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3 >> > It might be better to put this in its own directory I have thought that before, but since they control the same data in kernel as rps_cpus does, I put them in the same directory. > and also do it per > CPU instead of hash entry. This should result in a lot fewer entries > and I'm not sure how you would deal with holes in the hash table for > unspecified entries. Also, it would be nice not to have to specify a > number of entries. Maybe something like: > > localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0 > localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1 > > To specify CPU 0 with weight 1, CPU 1 with weight 3. > Your way is more simple and straightforward. My idea has it own advantage: 1. control the rate precision through rps_flows. 2. do dynamic weighted packet dispatching by migrating some flows from some CPUs to other CPUs. During this operations, only the flows migrated are affected, and OOO only occurs in these flows.
On Mon, Apr 12, 2010 at 7:27 AM, Changli Gao <xiaosuo@gmail.com> wrote: > On Mon, Apr 12, 2010 at 9:34 PM, Tom Herbert <therbert@google.com> wrote: >> On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote: >>> add rps flow director support >>> >>> with rps flow director, users can do weighted packet dispatching among CPUs. >>> For example, CPU0:CPU1 is 1:3 for eth0's rx-0: >>> >> "Flow director" is a misnomer here in that it has no per flow >> awareness, that is what RFS provides. Please use a different name. > > Flow here is a bundle of flow, not the original meaning. How about > "rps_buckets" and "rps_bucket_x"? > Ideally, this should replace rps_cpus if it's a better interface.... right now these would be conflicting interfaces. >> >>> localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows >>> localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0 >>> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1 >>> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2 >>> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3 >>> >> It might be better to put this in its own directory > > I have thought that before, but since they control the same data in > kernel as rps_cpus does, I put them in the same directory. > >> and also do it per >> CPU instead of hash entry. This should result in a lot fewer entries >> and I'm not sure how you would deal with holes in the hash table for >> unspecified entries. Also, it would be nice not to have to specify a >> number of entries. Maybe something like: >> >> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0 >> localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1 >> >> To specify CPU 0 with weight 1, CPU 1 with weight 3. >> > > Your way is more simple and straightforward. My idea has it own advantage: > 1. control the rate precision through rps_flows. > 2. 
do dynamic weighted packet dispatching by migrating some flows from > some CPUs to other CPUs. During this operations, only the flows > migrated are affected, and OOO only occurs in these flows. It's probably a little more work, but the CPU->weight mappings could be implemented to cause minimal disruption in the rps_map. Also, if OOO is an issue, then the mitigation technique in RFS could be applied (this will work best when hash table is larger I believe). Tom > > -- > Regards, > Changli Gao(xiaosuo@gmail.com) > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Apr 13, 2010 at 1:13 AM, Tom Herbert <therbert@google.com> wrote: > On Mon, Apr 12, 2010 at 7:27 AM, Changli Gao <xiaosuo@gmail.com> wrote: >> On Mon, Apr 12, 2010 at 9:34 PM, Tom Herbert <therbert@google.com> wrote: >> > > Ideally, this should replace rps_cpus if it's a better interface.... > right now these would be conflicting interfaces. > How about sw-rxs and sw-rx-$ (SoftWare Receive queue). It is a software emulation of hardware receive queue, as softnet to NIC. sw-rx-$ is equivalent of /proc/irq/$/smp_affinity, only cpuid is instead of cpumask. Does anyone support this interface replacing the current rps_cpus? I do. :) > > It's probably a little more work, but the CPU->weight mappings could > be implemented to cause minimal disruption in the rps_map. Also, if > OOO is an issue, then the mitigation technique in RFS could be applied > (this will work best when hash table is larger I believe). > I am thinking about the cost of keeping packets in order. Is it really worth for this kind of random migration?
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1e7fdd6..d904610 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -511,6 +511,109 @@ static struct sysfs_ops rx_queue_sysfs_ops = { .store = rx_queue_attr_store, }; +static DEFINE_MUTEX(rps_map_lock); + +static ssize_t show_rps_flow(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, char *buf) +{ + unsigned long flowid; + struct rps_map *map; + u16 cpu; + + strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid); + rcu_read_lock(); + map = rcu_dereference(queue->rps_map); + if (map && flowid < map->len) + cpu = map->cpus[flowid]; + else + cpu = 0; + rcu_read_unlock(); + return sprintf(buf, "%hu\n", cpu); +} + +static ssize_t store_rps_flow(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, + const char *buf, size_t len) +{ + unsigned long flowid, cpu; + struct rps_map *map; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (strict_strtoul(buf, 0, &cpu)) + return -EINVAL; + strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid); + + mutex_lock(&rps_map_lock); + map = queue->rps_map; + if (map && flowid < map->len) + map->cpus[flowid] = cpu; + mutex_unlock(&rps_map_lock); + + return len; +} + +static struct rx_queue_attribute **rps_flow_attribute; +static int rps_flow_attribute_size; + +/* must be called with rps_map_lock locked */ +static int update_rps_flow_files(struct kobject *kobj, + struct rps_map *old_map, struct rps_map *map) +{ + int i; + int old_map_len = old_map ? old_map->len : 0; + int map_len = map ? 
map->len : 0; + + if (old_map_len >= map_len) { + for (i = map_len; i < old_map_len; i++) + sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr); + return 0; + } + + if (map_len > rps_flow_attribute_size) { + struct rx_queue_attribute **attrs; + char name[sizeof("rps_flow_4294967295")]; + char *pname; + + attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *), + GFP_KERNEL); + if (attrs == NULL) + return -ENOMEM; + rps_flow_attribute = attrs; + for (i = rps_flow_attribute_size; i < map_len; i++) { + rps_flow_attribute[i] = kmalloc(sizeof(**attrs), + GFP_KERNEL); + if (rps_flow_attribute[i] == NULL) + break; + sprintf(name, "rps_flow_%d", i); + pname = kstrdup(name, GFP_KERNEL); + if (pname == NULL) { + kfree(rps_flow_attribute[i]); + break; + } + rps_flow_attribute[i]->attr.name = pname; + rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR; + rps_flow_attribute[i]->show = show_rps_flow; + rps_flow_attribute[i]->store = store_rps_flow; + } + rps_flow_attribute_size = i; + if (i != map_len) + return -ENOMEM; + } + + for (i = old_map_len; i < map_len; i++) { + if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) { + while (--i >= old_map_len) + sysfs_remove_file(kobj, + &rps_flow_attribute[i]->attr); + return -ENOMEM; + } + } + + return 0; +} + static ssize_t show_rps_map(struct netdev_rx_queue *queue, struct rx_queue_attribute *attribute, char *buf) { @@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, struct rps_map *old_map, *map; cpumask_var_t mask; int err, cpu, i; - static DEFINE_SPINLOCK(rps_map_lock); if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, map = NULL; } - spin_lock(&rps_map_lock); + mutex_lock(&rps_map_lock); old_map = queue->rps_map; - rcu_assign_pointer(queue->rps_map, map); - spin_unlock(&rps_map_lock); + err = update_rps_flow_files(&queue->kobj, old_map, map); + if (!err) + rcu_assign_pointer(queue->rps_map, map); + 
mutex_unlock(&rps_map_lock); + + if (err) + return err; if (old_map) call_rcu(&old_map->rcu, rps_map_release); @@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, static struct rx_queue_attribute rps_cpus_attribute = __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); +static ssize_t show_rps_flows(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, char *buf) +{ + struct rps_map *map; + unsigned int len; + + rcu_read_lock(); + map = rcu_dereference(queue->rps_map); + len = map ? map->len : 0; + rcu_read_unlock(); + return sprintf(buf, "%u\n", len); +} + +static ssize_t store_rps_flows(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct rps_map *old_map, *map; + unsigned long flows; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (strict_strtoul(buf, 0, &flows)) + return -EINVAL; + if (flows != 0) { + map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows), + L1_CACHE_BYTES), GFP_KERNEL); + if (map == NULL) + return -ENOMEM; + map->len = flows; + } else { + map = NULL; + } + + mutex_lock(&rps_map_lock); + old_map = queue->rps_map; + err = update_rps_flow_files(&queue->kobj, old_map, map); + if (!err) { + if (old_map && map) + memcpy(map->cpus, old_map->cpus, + sizeof(map->cpus[0]) * + min_t(unsigned int, flows, old_map->len)); + rcu_assign_pointer(queue->rps_map, map); + } + mutex_unlock(&rps_map_lock); + + if (err) + return err; + + if (old_map) + call_rcu(&old_map->rcu, rps_map_release); + + return len; +} + +static struct rx_queue_attribute rps_flows_attribute = + __ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows); + static struct attribute *rx_queue_default_attrs[] = { &rps_cpus_attribute.attr, + &rps_flows_attribute.attr, NULL };
add rps flow director support with rps flow director, users can do weighted packet dispatching among CPUs. For example, CPU0:CPU1 is 1:3 for eth0's rx-0: localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3 Signed-off-by: Changli Gao <xiaosuo@gmail.com> ---- net/core/net-sysfs.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 172 insertions(+), 4 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html