diff mbox

rps: make distributing packets fairly among all the online CPUs default

Message ID 4BA85EA2.1090304@gmail.com
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

Changli Gao March 23, 2010, 6:24 a.m. UTC
make distributing packets fairly among all the online CPUs default.

Make distributing packets fairly among all the online CPUs default, then
users don't need any explicit configuration to get the benefit of RPS.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
net/core/dev.c | 34 ++++++++++++++++++++++++++++++++--
1 file changed, 32 insertions(+), 2 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric Dumazet March 23, 2010, 7:20 a.m. UTC | #1
Le mardi 23 mars 2010 à 14:24 +0800, Changli Gao a écrit :
> make distributing packets fairly among all the online CPUs default.
> 
> Make distributing packets fairly among all the online CPUs default, then
> users don't need any explicit configuration to get the benefit of RPS.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
> net/core/dev.c | 34 ++++++++++++++++++++++++++++++++--
> 1 file changed, 32 insertions(+), 2 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index c0e2608..a4246f1 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5234,6 +5234,24 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
>  }
>  EXPORT_SYMBOL(netif_stacked_transfer_operstate);
>  
> +static struct rps_map* alloc_rps_map(void)
> +{
> +	struct rps_map *map;
> +	int i, cpu;
> +
> +	map = kzalloc(max_t(unsigned,
> +			    RPS_MAP_SIZE(cpumask_weight(cpu_online_mask)),
> +			    L1_CACHE_BYTES), GFP_KERNEL);
> +	if (map == NULL)
> +		return NULL;
> +	i = 0;
> +	for_each_online_cpu(cpu)
> +		map->cpus[i++] = cpu;
> +	map->len = i;
> +
> +	return map;
> +}
> +
>  /**
>   *	register_netdevice	- register a network device
>   *	@dev: device to register
> @@ -5282,7 +5300,13 @@ int register_netdevice(struct net_device *dev)
>  			ret = -ENOMEM;
>  			goto out;
>  		}
> -
> +		dev->_rx->rps_map = alloc_rps_map();
> +		if (dev->_rx->rps_map == NULL) {
> +			kfree(dev->_rx);
> +			dev->_rx = NULL;
> +			ret = -ENOMEM;
> +			goto out;
> +		}
>  		dev->_rx->first = dev->_rx;
>  		atomic_set(&dev->_rx->count, 1);
>  		dev->num_rx_queues = 1;
> @@ -5688,8 +5712,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	 * Set a pointer to first element in the array which holds the
>  	 * reference count.
>  	 */
> -	for (i = 0; i < queue_count; i++)
> +	for (i = 0; i < queue_count; i++) {
>  		rx[i].first = rx;
> +		rx[i].rps_map = alloc_rps_map();
> +		if (rx[i].rps_map == NULL)
> +			goto free_rx;
> +	}
>  
>  	dev = PTR_ALIGN(p, NETDEV_ALIGN);
>  	dev->padded = (char *)dev - (char *)p;
> @@ -5723,6 +5751,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	return dev;
>  
>  free_rx:
> +	for (i = 0; i < queue_count; i++)
> +		kfree(rx[i].rps_map);
>  	kfree(rx);
>  free_tx:
>  	kfree(tx);
> 
> 
> --

You cannot do this like this; these allocations won't be freed.

Better would be to take a look at net/core/net-sysfs.c



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet March 23, 2010, 7:27 a.m. UTC | #2
Le mardi 23 mars 2010 à 14:24 +0800, Changli Gao a écrit :
> make distributing packets fairly among all the online CPUs default.
> 
> Make distributing packets fairly among all the online CPUs default, then
> users don't need any explicit configuration to get the benefit of RPS.
> 

Some people want to dedicate a cpu to network interrupts, cache hits
only, to get low latencies.
RPS is not a win for all workloads.

I think we should not change default without a knob.

Didn't we discuss all this during the last few months?


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao March 23, 2010, 9:56 a.m. UTC | #3
On Tue, Mar 23, 2010 at 3:20 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 23 mars 2010 à 14:24 +0800, Changli Gao a écrit :
>
> You cannot do this like this, these allocations wont be freed.
>
> Better would be to take a look at net/core/net-sysfs.c
>

Sorry, but I don't know why these allocations won't be freed.
Changli Gao March 23, 2010, 10:03 a.m. UTC | #4
On Tue, Mar 23, 2010 at 3:27 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 23 mars 2010 à 14:24 +0800, Changli Gao a écrit :
>> make distributing packets fairly among all the online CPUs default.
>>
>> Make distributing packets fairly among all the online CPUs default, then
>> users don't need any explicit configuration to get the benefit of RPS.
>>
>
> Some people want to dedicate a cpu to network interrupts, cache hits
> only, to get low latencies.
> RPS is not a win for all workloads.
>
> I think we should not change default without a knob.
>
> Didnt we discussed all this during last months ?
>

Ben Hutchings raised this question, but nobody replied with any message.

I do think distributing packets fairly among all the online CPUs will
help most users. I remember the smp_affinity of an IRQ is the
online_cpu_mask by default.
Eric Dumazet March 23, 2010, 11:23 a.m. UTC | #5
Le mardi 23 mars 2010 à 17:56 +0800, Changli Gao a écrit :
> On Tue, Mar 23, 2010 at 3:20 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Le mardi 23 mars 2010 à 14:24 +0800, Changli Gao a écrit :
> >
> > You cannot do this like this, these allocations wont be freed.
> >
> > Better would be to take a look at net/core/net-sysfs.c
> >
> 
> Sorry, but I don't know why these allocations won't be freed.
> 
> 

Hint : This will be freed only if CONFIG_SYSFS is set



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao March 23, 2010, 11:27 a.m. UTC | #6
On Tue, Mar 23, 2010 at 7:23 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 23 mars 2010 à 17:56 +0800, Changli Gao a écrit :
>> On Tue, Mar 23, 2010 at 3:20 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > Le mardi 23 mars 2010 à 14:24 +0800, Changli Gao a écrit :
>> >
>> > You cannot do this like this, these allocations wont be freed.
>> >
>> > Better would be to take a look at net/core/net-sysfs.c
>> >
>>
>> Sorry, but I don't know why these allocations won't be freed.
>>
>>
>
> Hint : This will be freed only if CONFIG_SYSFS is set
>

got it, thanks. But I just wonder whether it is right that the life cycle of
the rps_map is maintained by sysfs rather than by netdev itself.
Eric Dumazet March 23, 2010, 11:38 a.m. UTC | #7
Le mardi 23 mars 2010 à 18:03 +0800, Changli Gao a écrit :

> 
> Ben Hutchings raised this question, but nobody replied with any message.
> 
> I do think distributing packets fairly among all the online CPUs will
> helps most of users. I remember the smp_affinity of IRQ is the
> online_cpu_mask be default.
> 

I know _many_ applications that perform better when all network IRQS are
directed to one CPU only. Single threaded UDP server for example, with
fast answer, or routers, or ...

Many sysadmins tuned their machine to meet this requirement. Installing
2.6.35 with RPS enabled by default will make them unhappy, especially if
CONFIG_SYSFS is not set, since they won't be able to change RPS settings.

If RPS was good for all workloads, activating it would make sense, but
this is not the case.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao March 23, 2010, 11:58 a.m. UTC | #8
On Tue, Mar 23, 2010 at 7:38 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 23 mars 2010 à 18:03 +0800, Changli Gao a écrit :
>
>
> I know _many_ applications that perform better when all network IRQS are
> directed to one CPU only. Single threaded UDP server for example, with
> fast answer, or routers, or ...

Yea, single-threaded applications can't get much from RPS, but
multi-threaded applications such as apache do. And more and more
applications are programmed with the assumption that there is more than
one CPU/core.

>
> Many sysadmins tuned their machine to meet this requirement. Installing
> 2.6.35, if RPS enabled by default will make them unhappy, especially if
> CONFIG_SYSFS is not set, since they wont be able to change RPS settings.
>
> If RPS was good for all workloads, activating it would make sense, but
> this is not the case.
>

The sysadmins you mentioned above are Linux experts, and they know
how to tune the system to get the best performance, but the default
configuration should be for the ordinary users, who don't know Linux
much.
Eric Dumazet March 23, 2010, 1:06 p.m. UTC | #9
Le mardi 23 mars 2010 à 19:58 +0800, Changli Gao a écrit :

> The sysadmins you mentioned above are Linux experters, and they know
> how to tunning the system to get the best performance, but the default
> configuration should be for the ordinary users, who don't know Linux
> much.
> 

I strongly NACK your change at this very moment, many devices run
without a linux expert to tune them, thanks.

Ordinary users dont need RPS magic in 2.6.35.

Maybe in 2.6.38+ we can reconsider this, if RPS deployment happens to be
successful, thanks to experts doing their job and reports.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andi Kleen March 23, 2010, 2:05 p.m. UTC | #10
Changli Gao <xiaosuo@gmail.com> writes:
>
> I do think distributing packets fairly among all the online CPUs will
> helps most of users. I remember the smp_affinity of IRQ is the
> online_cpu_mask be default.

The established wisdom is that most apps prefer locality.

The cost of data transfer between CPUs is worse than the 
overhead of processing the packet header in the stack.

That's why no system does interrupt RR by default anymore.

Besides if you do such a change you would need extensive benchmarking
to verify that it's a good idea.

-Andi
David Miller March 23, 2010, 7:09 p.m. UTC | #11
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 23 Mar 2010 14:06:20 +0100

> Le mardi 23 mars 2010 à 19:58 +0800, Changli Gao a écrit :
> 
>> The sysadmins you mentioned above are Linux experters, and they know
>> how to tunning the system to get the best performance, but the default
>> configuration should be for the ordinary users, who don't know Linux
>> much.
>> 
> 
> I strongly NACK your change at this very moment, many devices run
> without a linux expert to tune them, thanks.

Right, we're not GNOME, we don't screw knowledgable users of
our software by default.

I agree with Eric 100% on this.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/core/dev.c b/net/core/dev.c
index c0e2608..a4246f1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5234,6 +5234,24 @@  void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
+static struct rps_map* alloc_rps_map(void)
+{
+	struct rps_map *map;
+	int i, cpu;
+
+	map = kzalloc(max_t(unsigned,
+			    RPS_MAP_SIZE(cpumask_weight(cpu_online_mask)),
+			    L1_CACHE_BYTES), GFP_KERNEL);
+	if (map == NULL)
+		return NULL;
+	i = 0;
+	for_each_online_cpu(cpu)
+		map->cpus[i++] = cpu;
+	map->len = i;
+
+	return map;
+}
+
 /**
  *	register_netdevice	- register a network device
  *	@dev: device to register
@@ -5282,7 +5300,13 @@  int register_netdevice(struct net_device *dev)
 			ret = -ENOMEM;
 			goto out;
 		}
-
+		dev->_rx->rps_map = alloc_rps_map();
+		if (dev->_rx->rps_map == NULL) {
+			kfree(dev->_rx);
+			dev->_rx = NULL;
+			ret = -ENOMEM;
+			goto out;
+		}
 		dev->_rx->first = dev->_rx;
 		atomic_set(&dev->_rx->count, 1);
 		dev->num_rx_queues = 1;
@@ -5688,8 +5712,12 @@  struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	 * Set a pointer to first element in the array which holds the
 	 * reference count.
 	 */
-	for (i = 0; i < queue_count; i++)
+	for (i = 0; i < queue_count; i++) {
 		rx[i].first = rx;
+		rx[i].rps_map = alloc_rps_map();
+		if (rx[i].rps_map == NULL)
+			goto free_rx;
+	}
 
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
@@ -5723,6 +5751,8 @@  struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	return dev;
 
 free_rx:
+	for (i = 0; i < queue_count; i++)
+		kfree(rx[i].rps_map);
 	kfree(rx);
 free_tx:
 	kfree(tx);