rps: add flow director support

Message ID 1271022140-3917-1-git-send-email-xiaosuo@gmail.com
State Rejected, archived
Delegated to: David Miller

Commit Message

Changli Gao April 11, 2010, 9:42 p.m. UTC
add rps flow director support

with rps flow director, users can do weighted packet dispatching among CPUs.
For example, CPU0:CPU1 is 1:3 for eth0's rx-0:

 localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows  
 localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 net/core/net-sysfs.c |  176 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 172 insertions(+), 4 deletions(-)

Comments

Eric Dumazet April 11, 2010, 4:05 p.m. UTC | #1
On Monday, April 12, 2010 at 05:42 +0800, Changli Gao wrote:
> add rps flow director support
> 
> with rps flow director, users can do weighted packet dispatching among CPUs.
> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
> 
>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows  
>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ---

Changli

I am a bit disappointed to find so many bugs in your patch.

I believe this is over-engineering at this stage; we have yet to see
any benchmarks or real-world results.

Plus, it conflicts with the much more interesting upcoming work (RFS).
You named this patch 'flow director' to get our attention, but it is an
old idea of yours: giving different weights to CPUs, which RPS is not
yet able to do.

Maybe this is the reason you forgot to CC Tom Herbert (and me)?

Consider now :

1) echo 65000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0
   possible crash, dereferencing a smaller cpumap.

2) echo 3000000000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0
   probable crash, because of overflow in RPS_MAP_SIZE(flows)

3) How can rps_flow_attribute & rps_flow_attribute_size be static (one
instance for the whole kernel) if your intent is to have per-rxqueue
attributes (/sys/class/net/eth0/queues/rx-0/...)? Otherwise the first
lines of update_rps_flow_files() are completely wrong...

echo 10 > /sys/class/net/eth0/queues/rx-0/rps_flows
echo 2 > /sys/class/net/eth1/queues/rx-0/rps_flows
cat /sys/class/net/eth0/queues/rx-0/rps_flow_9 

4) Lack of atomic changes to the RPS flows -> a lot of packet
reordering can occur.

5) Many possible memory leaks in update_rps_flow_files(); you obviously
were very lazy. We try to build a bug-free kernel, not only a 'cool
kernel', and if you are lazy, your patches won't be accepted.



>  net/core/net-sysfs.c |  176 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 172 insertions(+), 4 deletions(-)
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 1e7fdd6..d904610 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -511,6 +511,109 @@ static struct sysfs_ops rx_queue_sysfs_ops = {
>  	.store = rx_queue_attr_store,
>  };
>  
> +static DEFINE_MUTEX(rps_map_lock);
> +
> +static ssize_t show_rps_flow(struct netdev_rx_queue *queue,
> +			     struct rx_queue_attribute *attribute, char *buf)
> +{
> +	unsigned long flowid;
> +	struct rps_map *map;
> +	u16 cpu;
> +
> +	strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +	rcu_read_lock();
> +	map = rcu_dereference(queue->rps_map);
> +	if (map && flowid < map->len)
> +		cpu = map->cpus[flowid];
> +	else
> +		cpu = 0;
> +	rcu_read_unlock();
> +	return sprintf(buf, "%hu\n", cpu);
> +}
> +
> +static ssize_t store_rps_flow(struct netdev_rx_queue *queue,
> +			      struct rx_queue_attribute *attribute,
> +			      const char *buf, size_t len)
> +{
> +	unsigned long flowid, cpu;
> +	struct rps_map *map;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	if (strict_strtoul(buf, 0, &cpu))
> +		return -EINVAL;
> +	strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +
> +	mutex_lock(&rps_map_lock);
> +	map = queue->rps_map;
> +	if (map && flowid < map->len)
> +		map->cpus[flowid] = cpu;

What can happen if cpu=65000 and NR_CPUS=32?

> +	mutex_unlock(&rps_map_lock);
> +
> +	return len;
> +}
> +
> +static struct rx_queue_attribute **rps_flow_attribute;
> +static int rps_flow_attribute_size;
> +
> +/* must be called with rps_map_lock locked */
> +static int update_rps_flow_files(struct kobject *kobj,
> +				 struct rps_map *old_map, struct rps_map *map)
> +{
> +	int i;
> +	int old_map_len = old_map ? old_map->len : 0;
> +	int map_len = map ? map->len : 0;
> +
> +	if (old_map_len >= map_len) {
> +		for (i = map_len; i < old_map_len; i++)
> +			sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr);
	Removing attributes for this rxqueue, while another rxqueue might still need them?

> +		return 0;
> +	}
> +
> +	if (map_len > rps_flow_attribute_size) {
> +		struct rx_queue_attribute **attrs;
> +		char name[sizeof("rps_flow_4294967295")];
> +		char *pname;
> +
> +		attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *),
> +				 GFP_KERNEL);
> +		if (attrs == NULL)
> +			return -ENOMEM;
> +		rps_flow_attribute = attrs;
> +		for (i = rps_flow_attribute_size; i < map_len; i++) {
> +			rps_flow_attribute[i] = kmalloc(sizeof(**attrs),
> +							GFP_KERNEL);
> +			if (rps_flow_attribute[i] == NULL)
> +				break;
> +			sprintf(name, "rps_flow_%d", i);
> +			pname = kstrdup(name, GFP_KERNEL);
> +			if (pname == NULL) {
> +				kfree(rps_flow_attribute[i]);
> +				break;
> +			}
> +			rps_flow_attribute[i]->attr.name = pname;
> +			rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR;
> +			rps_flow_attribute[i]->show = show_rps_flow;
> +			rps_flow_attribute[i]->store = store_rps_flow;
> +		}
> +		rps_flow_attribute_size = i;
> +		if (i != map_len)
> +			return -ENOMEM;
> +	}
> +
> +	for (i = old_map_len; i < map_len; i++) {
> +		if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) {
> +			while (--i >= old_map_len)
> +				sysfs_remove_file(kobj,
> +						  &rps_flow_attribute[i]->attr);

			No changes to rps_flow_attribute_size?

> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static ssize_t show_rps_map(struct netdev_rx_queue *queue,
>  			    struct rx_queue_attribute *attribute, char *buf)
>  {
> @@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  	struct rps_map *old_map, *map;
>  	cpumask_var_t mask;
>  	int err, cpu, i;
> -	static DEFINE_SPINLOCK(rps_map_lock);
>  
>  	if (!capable(CAP_NET_ADMIN))
>  		return -EPERM;
> @@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  		map = NULL;
>  	}
>  
> -	spin_lock(&rps_map_lock);
> +	mutex_lock(&rps_map_lock);
>  	old_map = queue->rps_map;
> -	rcu_assign_pointer(queue->rps_map, map);
> -	spin_unlock(&rps_map_lock);
> +	err = update_rps_flow_files(&queue->kobj, old_map, map);
> +	if (!err)
> +		rcu_assign_pointer(queue->rps_map, map);
> +	mutex_unlock(&rps_map_lock);
> +
> +	if (err)
> +		return err;
>  
>  	if (old_map)
>  		call_rcu(&old_map->rcu, rps_map_release);
> @@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  static struct rx_queue_attribute rps_cpus_attribute =
>  	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
>  
> +static ssize_t show_rps_flows(struct netdev_rx_queue *queue,
> +		struct rx_queue_attribute *attribute, char *buf)
> +{
> +	struct rps_map *map;
> +	unsigned int len;
> +
> +	rcu_read_lock();
> +	map = rcu_dereference(queue->rps_map);
> +	len = map ? map->len : 0;
> +	rcu_read_unlock();
> +	return sprintf(buf, "%u\n", len);
> +}
> +
> +static ssize_t store_rps_flows(struct netdev_rx_queue *queue,
> +			       struct rx_queue_attribute *attribute,
> +			       const char *buf, size_t len)
> +{
> +	struct rps_map *old_map, *map;
> +	unsigned long flows;
> +	int err;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	if (strict_strtoul(buf, 0, &flows))
> +		return -EINVAL;

Are you aware that RPS_MAP_SIZE(0x80000000) can overflow?

> +	if (flows != 0) {
> +		map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows),
> +				    L1_CACHE_BYTES), GFP_KERNEL);
> +		if (map == NULL)
> +			return -ENOMEM;
> +		map->len = flows;
> +	} else {
> +		map = NULL;
> +	}
> +
> +	mutex_lock(&rps_map_lock);
> +	old_map = queue->rps_map;
> +	err = update_rps_flow_files(&queue->kobj, old_map, map);
> +	if (!err) {
> +		if (old_map && map)
> +			memcpy(map->cpus, old_map->cpus,
> +			       sizeof(map->cpus[0]) *
> +			       min_t(unsigned int, flows, old_map->len));
> +		rcu_assign_pointer(queue->rps_map, map);
> +	}
> +	mutex_unlock(&rps_map_lock);
> +
> +	if (err)
> +		return err;
> +
> +	if (old_map)
> +		call_rcu(&old_map->rcu, rps_map_release);
> +
> +	return len;
> +}
> +
> +static struct rx_queue_attribute rps_flows_attribute =
> +	__ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows);
> +
>  static struct attribute *rx_queue_default_attrs[] = {
>  	&rps_cpus_attribute.attr,
> +	&rps_flows_attribute.attr,
>  	NULL
>  };
>  
> --



Changli Gao April 12, 2010, 3:58 a.m. UTC | #2
On Mon, Apr 12, 2010 at 12:05 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Monday, April 12, 2010 at 05:42 +0800, Changli Gao wrote:
>
> Changli
>
> I am a bit disappointed to find so many bugs in your patch.

:(. Thanks for your patient review.

>
> I believe this is over-engineering at this stage; we have yet to see
> any benchmarks or real-world results.

OK. Leave this patch here for testing; I'll post benchmark results once I have them.

>
> Plus, it conflicts with the much more interesting upcoming work (RFS).
> You named this patch 'flow director' to get our attention, but it is an
> old idea of yours: giving different weights to CPUs, which RPS is not
> yet able to do.
>

I don't think it conflicts with RFS. RFS is for applications, and the
RPS flow director is for firewalls and routers. Because softirqs aren't
under scheduler control, in order to do softirq load balancing, we
have to schedule flows internally. At the same time, the RPS flow
director exposes more than the old design, and keeps the rps_cpus
interface intact.

> Maybe this is the reason you forgot to CC Tom Herbert (and me)?
>

Sorry for forgetting to add you and Tom to the CC list.

> Consider now :
>
> 1) echo 65000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0
>   possible crash, dereferencing a smaller cpumap.

Yeah, I assumed cpu_online() would check whether the cpu was valid.
After checking the code, I found that you were right. OK, I'll add this
sanity check:

if (cpu >= nr_cpumask_bits)
      return -EINVAL;

>
> 2) echo 3000000000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0
>   probable crash, because of overflow in RPS_MAP_SIZE(flows)

Did you mean rps_flows? Yeah, RPS_MAP_SIZE may overflow. Maybe I need
to check the upper bound:

if (flows > USHORT_MAX)
     return -EINVAL;
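
Combined, the two checks would sit at the top of the patch's store
handlers, something like this (just a sketch against the patch below):

	/* store_rps_flow(): reject CPU ids beyond the cpumask */
	if (strict_strtoul(buf, 0, &cpu))
		return -EINVAL;
	if (cpu >= nr_cpumask_bits)
		return -EINVAL;

	/* store_rps_flows(): keep RPS_MAP_SIZE(flows) from overflowing */
	if (strict_strtoul(buf, 0, &flows))
		return -EINVAL;
	if (flows > USHORT_MAX)
		return -EINVAL;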

>
> 3) How can rps_flow_attribute & rps_flow_attribute_size be static (one
> instance for the whole kernel) if your intent is to have per-rxqueue
> attributes (/sys/class/net/eth0/queues/rx-0/...)? Otherwise the first
> lines of update_rps_flow_files() are completely wrong...
>
> echo 10 > /sys/class/net/eth0/queues/rx-0/rps_flows
> echo 2 > /sys/class/net/eth1/queues/rx-0/rps_flows
> cat /sys/class/net/eth0/queues/rx-0/rps_flow_9

Yeah, each rxqueue has its own attributes. rps_flow_attribute is much
like rps_cpus_attribute. As sysfs_create_file() and
sysfs_remove_file() don't modify them and only access them read-only,
we can share them statically among all the rxqueues.

>
> 4) Lack of atomic changes to the RPS flows -> a lot of packet
> reordering can occur.

Yeah, if you change the packet dispatching dynamically. I don't know
how to avoid packet reordering entirely. If OOO doesn't occur
frequently, it isn't a real problem.

>
> 5) Many possible memory leaks in update_rps_flow_files(); you obviously
> were very lazy. We try to build a bug-free kernel, not only a 'cool
> kernel', and if you are lazy, your patches won't be accepted.
>

Yeah, I was lazy and didn't shrink rps_flow_attribute. I'll add
reference counters to the rps_flow_attribute entries to track their
usage, and free them when they aren't needed.
Tom Herbert April 12, 2010, 1:34 p.m. UTC | #3
On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> add rps flow director support
>
> with rps flow director, users can do weighted packet dispatching among CPUs.
> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
>
"Flow director" is a misnomer here in that it has no per flow
awareness, that is what RFS provides.  Please use a different name.

>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows
>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
>
It might be better to put this in its own directory and also do it per
CPU instead of per hash entry.  This should result in a lot fewer
entries, and I'm not sure how you would deal with holes in the hash
table for unspecified entries.  Also, it would be nice not to have to
specify a number of entries.  Maybe something like:

localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0
localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1

To specify CPU 0 with weight 1, CPU 1 with weight 3.
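
Internally, the weights could simply be expanded into the existing
rps_map, so the fast path doesn't change.  A rough sketch
(expand_weights is hypothetical; rps_map as in the current RPS code):

	/* hypothetical helper: emit one bucket per unit of weight */
	static int expand_weights(struct rps_map *map, const u16 *cpus,
				  const u16 *weights, int n)
	{
		int i, w, len = 0;

		for (i = 0; i < n; i++)
			for (w = 0; w < weights[i]; w++)
				map->cpus[len++] = cpus[i];
		map->len = len;
		return len;
	}

With cpus = {0, 1} and weights = {1, 3}, this yields cpus[] =
{0, 1, 1, 1}, the same map as in the example above.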

Changli Gao April 12, 2010, 2:27 p.m. UTC | #4
On Mon, Apr 12, 2010 at 9:34 PM, Tom Herbert <therbert@google.com> wrote:
> On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote:
>> add rps flow director support
>>
>> with rps flow director, users can do weighted packet dispatching among CPUs.
>> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
>>
> "Flow director" is a misnomer here in that it has no per flow
> awareness, that is what RFS provides.  Please use a different name.

"Flow" here means a bundle of flows, not a single flow in the usual
sense. How about "rps_buckets" and "rps_bucket_x"?

>
>>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows
>>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
>>
> It might be better to put this in its own directory

I thought about that before, but since these files control the same
in-kernel data as rps_cpus does, I put them in the same directory.

> and also do it per
> CPU instead of per hash entry.  This should result in a lot fewer
> entries, and I'm not sure how you would deal with holes in the hash
> table for unspecified entries.  Also, it would be nice not to have to
> specify a number of entries.  Maybe something like:
>
> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0
> localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1
>
> To specify CPU 0 with weight 1, CPU 1 with weight 3.
>

Your way is simpler and more straightforward. My idea has its own
advantages:
1. the ratio precision can be controlled through rps_flows.
2. dynamic weighted packet dispatching can be done by migrating some
flows from one CPU to another. During this operation, only the migrated
flows are affected, and OOO only occurs in those flows.
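
For example, starting from the 1:3 setup above, moving only bucket 3
back to CPU0 changes the ratio to 2:2, and only the flows hashing into
bucket 3 can be reordered:

 localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_3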
Tom Herbert April 12, 2010, 5:13 p.m. UTC | #5
On Mon, Apr 12, 2010 at 7:27 AM, Changli Gao <xiaosuo@gmail.com> wrote:
> On Mon, Apr 12, 2010 at 9:34 PM, Tom Herbert <therbert@google.com> wrote:
>> On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote:
>>> add rps flow director support
>>>
>>> with rps flow director, users can do weighted packet dispatching among CPUs.
>>> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
>>>
>> "Flow director" is a misnomer here in that it has no per flow
>> awareness, that is what RFS provides.  Please use a different name.
>
> "Flow" here means a bundle of flows, not a single flow in the usual
> sense. How about "rps_buckets" and "rps_bucket_x"?
>

Ideally, this should replace rps_cpus if it's a better interface....
right now these would be conflicting interfaces.

>>
>>>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows
>>>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
>>>
>> It might be better to put this in its own directory
>
> I thought about that before, but since these files control the same
> in-kernel data as rps_cpus does, I put them in the same directory.
>
>> and also do it per
>> CPU instead of per hash entry.  This should result in a lot fewer
>> entries, and I'm not sure how you would deal with holes in the hash
>> table for unspecified entries.  Also, it would be nice not to have to
>> specify a number of entries.  Maybe something like:
>>
>> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0
>> localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1
>>
>> To specify CPU 0 with weight 1, CPU 1 with weight 3.
>>
>
> Your way is more simple and straightforward. My idea has it own advantage:
> 1. control the rate precision through rps_flows.
> 2. do dynamic weighted packet dispatching by migrating some flows from
> some CPUs to other CPUs. During this operations, only the flows
> migrated are affected, and OOO only occurs in these flows.

It's probably a little more work, but the CPU->weight mappings could
be implemented to cause minimal disruption in the rps_map.  Also, if
OOO is an issue, then the mitigation technique in RFS could be applied
(this will work best when the hash table is larger, I believe).
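
Roughly, the RFS mitigation records per flow how much the old CPU still
has queued, and only moves the flow once that backlog has drained.  A
sketch (names follow the RFS patches; not the exact code):

	/* steer to next_cpu only once the old cpu can't reorder us */
	if (tcpu != next_cpu &&
	    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
	     (int)(per_cpu(softnet_data, tcpu).input_queue_head -
		   rflow->last_qtail) >= 0))
		tcpu = rflow->cpu = next_cpu;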

Tom

>
> --
> Regards,
> Changli Gao(xiaosuo@gmail.com)
>
Changli Gao April 13, 2010, 3:11 a.m. UTC | #6
On Tue, Apr 13, 2010 at 1:13 AM, Tom Herbert <therbert@google.com> wrote:
> On Mon, Apr 12, 2010 at 7:27 AM, Changli Gao <xiaosuo@gmail.com> wrote:
>> On Mon, Apr 12, 2010 at 9:34 PM, Tom Herbert <therbert@google.com> wrote:
>>
>
> Ideally, this should replace rps_cpus if it's a better interface....
> right now these would be conflicting interfaces.
>

How about sw-rxs and sw-rx-$ (SoftWare Receive queue)? It is a
software emulation of a hardware receive queue, as softnet is to a NIC.
sw-rx-$ is the equivalent of /proc/irq/$/smp_affinity, except that a
cpuid is used instead of a cpumask.
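
For example (hypothetical names, mirroring the rps_flows example above):

 localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/sw-rxs
 localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/sw-rx-0
 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/sw-rx-1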

Does anyone support this interface replacing the current rps_cpus? I do. :)

>
> It's probably a little more work, but the CPU->weight mappings could
> be implemented to cause minimal disruption in the rps_map.  Also, if
> OOO is an issue, then the mitigation technique in RFS could be applied
> (this will work best when the hash table is larger, I believe).
>

I am thinking about the cost of keeping packets in order. Is it really
worth it for this kind of random migration?

Patch

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1e7fdd6..d904610 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -511,6 +511,109 @@  static struct sysfs_ops rx_queue_sysfs_ops = {
 	.store = rx_queue_attr_store,
 };
 
+static DEFINE_MUTEX(rps_map_lock);
+
+static ssize_t show_rps_flow(struct netdev_rx_queue *queue,
+			     struct rx_queue_attribute *attribute, char *buf)
+{
+	unsigned long flowid;
+	struct rps_map *map;
+	u16 cpu;
+
+	strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
+	rcu_read_lock();
+	map = rcu_dereference(queue->rps_map);
+	if (map && flowid < map->len)
+		cpu = map->cpus[flowid];
+	else
+		cpu = 0;
+	rcu_read_unlock();
+	return sprintf(buf, "%hu\n", cpu);
+}
+
+static ssize_t store_rps_flow(struct netdev_rx_queue *queue,
+			      struct rx_queue_attribute *attribute,
+			      const char *buf, size_t len)
+{
+	unsigned long flowid, cpu;
+	struct rps_map *map;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (strict_strtoul(buf, 0, &cpu))
+		return -EINVAL;
+	strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
+
+	mutex_lock(&rps_map_lock);
+	map = queue->rps_map;
+	if (map && flowid < map->len)
+		map->cpus[flowid] = cpu;
+	mutex_unlock(&rps_map_lock);
+
+	return len;
+}
+
+static struct rx_queue_attribute **rps_flow_attribute;
+static int rps_flow_attribute_size;
+
+/* must be called with rps_map_lock locked */
+static int update_rps_flow_files(struct kobject *kobj,
+				 struct rps_map *old_map, struct rps_map *map)
+{
+	int i;
+	int old_map_len = old_map ? old_map->len : 0;
+	int map_len = map ? map->len : 0;
+
+	if (old_map_len >= map_len) {
+		for (i = map_len; i < old_map_len; i++)
+			sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr);
+		return 0;
+	}
+
+	if (map_len > rps_flow_attribute_size) {
+		struct rx_queue_attribute **attrs;
+		char name[sizeof("rps_flow_4294967295")];
+		char *pname;
+
+		attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *),
+				 GFP_KERNEL);
+		if (attrs == NULL)
+			return -ENOMEM;
+		rps_flow_attribute = attrs;
+		for (i = rps_flow_attribute_size; i < map_len; i++) {
+			rps_flow_attribute[i] = kmalloc(sizeof(**attrs),
+							GFP_KERNEL);
+			if (rps_flow_attribute[i] == NULL)
+				break;
+			sprintf(name, "rps_flow_%d", i);
+			pname = kstrdup(name, GFP_KERNEL);
+			if (pname == NULL) {
+				kfree(rps_flow_attribute[i]);
+				break;
+			}
+			rps_flow_attribute[i]->attr.name = pname;
+			rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR;
+			rps_flow_attribute[i]->show = show_rps_flow;
+			rps_flow_attribute[i]->store = store_rps_flow;
+		}
+		rps_flow_attribute_size = i;
+		if (i != map_len)
+			return -ENOMEM;
+	}
+
+	for (i = old_map_len; i < map_len; i++) {
+		if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) {
+			while (--i >= old_map_len)
+				sysfs_remove_file(kobj,
+						  &rps_flow_attribute[i]->attr);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 static ssize_t show_rps_map(struct netdev_rx_queue *queue,
 			    struct rx_queue_attribute *attribute, char *buf)
 {
@@ -555,7 +658,6 @@  ssize_t store_rps_map(struct netdev_rx_queue *queue,
 	struct rps_map *old_map, *map;
 	cpumask_var_t mask;
 	int err, cpu, i;
-	static DEFINE_SPINLOCK(rps_map_lock);
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -588,10 +690,15 @@  ssize_t store_rps_map(struct netdev_rx_queue *queue,
 		map = NULL;
 	}
 
-	spin_lock(&rps_map_lock);
+	mutex_lock(&rps_map_lock);
 	old_map = queue->rps_map;
-	rcu_assign_pointer(queue->rps_map, map);
-	spin_unlock(&rps_map_lock);
+	err = update_rps_flow_files(&queue->kobj, old_map, map);
+	if (!err)
+		rcu_assign_pointer(queue->rps_map, map);
+	mutex_unlock(&rps_map_lock);
+
+	if (err)
+		return err;
 
 	if (old_map)
 		call_rcu(&old_map->rcu, rps_map_release);
@@ -603,8 +710,69 @@  ssize_t store_rps_map(struct netdev_rx_queue *queue,
 static struct rx_queue_attribute rps_cpus_attribute =
 	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
 
+static ssize_t show_rps_flows(struct netdev_rx_queue *queue,
+		struct rx_queue_attribute *attribute, char *buf)
+{
+	struct rps_map *map;
+	unsigned int len;
+
+	rcu_read_lock();
+	map = rcu_dereference(queue->rps_map);
+	len = map ? map->len : 0;
+	rcu_read_unlock();
+	return sprintf(buf, "%u\n", len);
+}
+
+static ssize_t store_rps_flows(struct netdev_rx_queue *queue,
+			       struct rx_queue_attribute *attribute,
+			       const char *buf, size_t len)
+{
+	struct rps_map *old_map, *map;
+	unsigned long flows;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (strict_strtoul(buf, 0, &flows))
+		return -EINVAL;
+	if (flows != 0) {
+		map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows),
+				    L1_CACHE_BYTES), GFP_KERNEL);
+		if (map == NULL)
+			return -ENOMEM;
+		map->len = flows;
+	} else {
+		map = NULL;
+	}
+
+	mutex_lock(&rps_map_lock);
+	old_map = queue->rps_map;
+	err = update_rps_flow_files(&queue->kobj, old_map, map);
+	if (!err) {
+		if (old_map && map)
+			memcpy(map->cpus, old_map->cpus,
+			       sizeof(map->cpus[0]) *
+			       min_t(unsigned int, flows, old_map->len));
+		rcu_assign_pointer(queue->rps_map, map);
+	}
+	mutex_unlock(&rps_map_lock);
+
+	if (err)
+		return err;
+
+	if (old_map)
+		call_rcu(&old_map->rcu, rps_map_release);
+
+	return len;
+}
+
+static struct rx_queue_attribute rps_flows_attribute =
+	__ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows);
+
 static struct attribute *rx_queue_default_attrs[] = {
 	&rps_cpus_attribute.attr,
+	&rps_flows_attribute.attr,
 	NULL
 };