[1/2] rps: core implementation

Message ID 65634d660911102253o2b4f7a19kfed5849e5c88bfe1@mail.gmail.com
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Tom Herbert Nov. 11, 2009, 6:53 a.m. UTC
Third version of RPS.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/linux/interrupt.h |    1 +
 include/linux/netdevice.h |   18 ++++
 include/linux/skbuff.h    |    2 +
 net/core/dev.c            |  227 ++++++++++++++++++++++++++++++++++++++-------
 net/core/net-sysfs.c      |  135 +++++++++++++++++++++++++++
 5 files changed, 348 insertions(+), 35 deletions(-)

Comments

Eric Dumazet Nov. 11, 2009, 8:20 a.m. UTC | #1
Tom Herbert wrote:
> Third version of RPS.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>  include/linux/interrupt.h |    1 +
>  include/linux/netdevice.h |   18 ++++
>  include/linux/skbuff.h    |    2 +
>  net/core/dev.c            |  227 ++++++++++++++++++++++++++++++++++++++-------
>  net/core/net-sysfs.c      |  135 +++++++++++++++++++++++++++
>  5 files changed, 348 insertions(+), 35 deletions(-)
> 

I must say this is really exciting :)

> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index b78cf81..fa91194 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -345,6 +345,7 @@ enum
>  	TIMER_SOFTIRQ,
>  	NET_TX_SOFTIRQ,
>  	NET_RX_SOFTIRQ,
> +	NET_RPS_SOFTIRQ,
>  	BLOCK_SOFTIRQ,
>  	BLOCK_IOPOLL_SOFTIRQ,
>  	TASKLET_SOFTIRQ,
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 8380009..c1b1bbb 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -639,6 +639,18 @@ struct net_device_ops {
>  };
> 
>  /*
> + * Structure for Receive Packet Steering.  Length of map and array of CPU ID's.
> + */
> +struct rps_map {
> +	int len;
> +	u16 map[0];
> +};
> +
> +/* Maximum size of RPS map (for allocation) */
> +#define RPS_MAP_SIZE (sizeof(struct rps_map) + \
> +    (num_possible_cpus() * sizeof(u16)))
> +

Problem of possible cpus is the number can be very large on some arches,
but yet few cpus online....

In this kind of situation, get_rps_cpu() will return -1 most of the time,
defeating goal of RPS ?


> +/*
>   *	The DEVICE structure.
>   *	Actually, this whole structure is a big mistake.  It mixes I/O
>   *	data with strictly "high-level" data, and it has to know about
> @@ -807,6 +819,9 @@ struct net_device
>  	void			*ax25_ptr;	/* AX.25 specific data */
>  	struct wireless_dev	*ieee80211_ptr;	/* IEEE 802.11 specific data,
>  						   assign before registering */
> +	void			*rps_maps;	/* Array of per-NAPI maps for
> +						   receive packet steeing */
> +	int			rps_num_maps;	/* Number of RPS maps */
> 
>  /*
>   * Cache line mostly used on receive path (including eth_type_trans())
> @@ -1217,6 +1232,9 @@ struct softnet_data
>  	struct Qdisc		*output_queue;
>  	struct sk_buff_head	input_pkt_queue;
>  	struct list_head	poll_list;
> +
> +	struct call_single_data	csd;
> +
>  	struct sk_buff		*completion_queue;
> 
>  	struct napi_struct	backlog;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 0c68fbd..95feac7 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -396,6 +396,8 @@ struct sk_buff {
> 
>  	__u16			vlan_tci;
> 
> +	__u32			rxhash;
> +
>  	sk_buff_data_t		transport_header;
>  	sk_buff_data_t		network_header;
>  	sk_buff_data_t		mac_header;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 28b0b9e..735e7e3 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1976,6 +1976,162 @@ int weight_p __read_mostly = 64;            /*
> old backlog weight */
> 
>  DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
> 
> +static u32 simple_hashrnd;
> +
> +/**
> + * get_rps_cpu is called from netif_receive_skb and returns the target
> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
> + */
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> +	u32 addr1, addr2, ports;
> +	struct ipv6hdr *ip6;
> +	struct iphdr *ip;
> +	u32 hash, ihl;
> +	u8 ip_proto;
> +	int cpu;
> +	struct rps_map *map = NULL;
> +
> +	if (dev->rps_num_maps) {
> +		/*
> +		 * Locate the map corresponding to the NAPI queue that
> +		 * the packet was received on.
> +		 */
> +		int index = skb_get_rx_queue(skb);
> +		if (index < 0 || index >= dev->rps_num_maps)
> +			index = 0;
> +
> +		map = (struct rps_map *)
> +		    (dev->rps_maps + (RPS_MAP_SIZE * index));
> +		if (!map->len)
> +			map = NULL;
> +	}
> +
> +	if (!map)
> +		return -1;
> +
> +	hash = skb->rxhash;
> +	if (hash)
> +		goto got_hash; /* Skip hash computation on packet header */
> +
> +	switch (skb->protocol) {
> +	case __constant_htons(ETH_P_IP):
> +		if (!pskb_may_pull(skb, sizeof(*ip)))
> +			return -1;
> +
> +		ip = (struct iphdr *) skb->data;
> +		ip_proto = ip->protocol;
> +		addr1 = ip->saddr;
> +		addr2 = ip->daddr;
> +		ihl = ip->ihl;
> +		break;
> +	case __constant_htons(ETH_P_IPV6):
> +		if (!pskb_may_pull(skb, sizeof(*ip6)))
> +			return -1;
> +
> +		ip6 = (struct ipv6hdr *) skb->data;
> +		ip_proto = ip6->nexthdr;
> +		addr1 = ip6->saddr.s6_addr32[3];
> +		addr2 = ip6->daddr.s6_addr32[3];
> +		ihl = (40 >> 2);
> +		break;
> +	default:
> +		return -1;
> +	}
> +	ports = 0;
> +	switch (ip_proto) {
> +	case IPPROTO_TCP:
> +	case IPPROTO_UDP:
> +	case IPPROTO_DCCP:
> +	case IPPROTO_ESP:
> +	case IPPROTO_AH:
> +	case IPPROTO_SCTP:
> +	case IPPROTO_UDPLITE:
> +		if (pskb_may_pull(skb, (ihl * 4) + 4))
> +			ports = *((u32 *) (skb->data + (ihl * 4)));
> +		break;
> +
> +	default:
> +		break;
> +	}
> +
> +	hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);

I wonder if you tried to exchange addr1/addr2 and port1/port2 so that conntracking/routing
is also sped up...

I.e., make sure the hash is the same regardless of the direction of the packet.

union {
	u32 port;
	u16 ports[2];
} p;

p.port = ports;

if (addr1 < addr2)
	swap(addr1, addr2);

if (p.ports[0] < p.ports[1])
	swap(p.ports[0], p.ports[1]);

hash = jhash_3words(addr1, addr2, p.port, simple_hashrnd);


I think I'll try to extend your patches with TX completion recycling too.

Ie record in skb the cpu number of original sender, and queue skb to
remote queue for destruction (sock_wfree() call and expensive scheduler calls...)

(This probably needs driver cooperation, instead of calling consume_skb(),
use a different function)

Thanks
Tom Herbert Nov. 11, 2009, 4:28 p.m. UTC | #2
> I must say this is really exciting :)
>
Thanks!


>> +/* Maximum size of RPS map (for allocation) */
>> +#define RPS_MAP_SIZE (sizeof(struct rps_map) + \
>> +    (num_possible_cpus() * sizeof(u16)))
>> +
>
> Problem of possible cpus is the number can be very large on some arches,
> but yet few cpus online....
>
> In this kind of situation, get_rps_cpu() will return -1 most of the time,
> defeating goal of RPS ?
>
I suppose it would make sense to either use num_online_cpus or simply
put a reasonable limit on it (HW RSS hash tables are 128 entries,
I believe).
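
A minimal sketch of that second option (the RPS_MAP_MAX_CPUS name and the
128-entry bound are assumptions here, borrowed from typical hardware RSS
indirection table sizes, not something the patch defines):

/* Hypothetical cap on per-map entries; not part of this patch. */
#define RPS_MAP_MAX_CPUS	128

#define RPS_MAP_SIZE (sizeof(struct rps_map) + \
    (min_t(unsigned int, num_possible_cpus(), RPS_MAP_MAX_CPUS) * sizeof(u16)))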


>> +     hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);
>
> I wonder if you tried to exchange addr1/addr2  port1/port2 so that conntracking/routing
> is also speedup ...
>
> ie make sure hash will be the same regardless of the direction of packet.
>
> union {
>        u32 port;
>        u16 ports[2];
> } p;
>
> if (addr1 < addr2)
>        swap(addr1, addr2);
>
> if (p.ports[0] < p.ports[1])
>        swap(p.ports[0], p.ports[1]);
>
I have not considered that.  How much of a win would this be?

> hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);
>
Another possibility we considered was to call inet_hashfn and
inet6_ehashfn directly to get the hash, store that value in
skb->rxhash, and reuse it later for the connection lookup in tcp_v4_rcv to
eliminate another jhash.  This has some benefit, but it doesn't
help if we get a different type of hash from the HW (using that is a much
bigger win), and it also meant pulling more IP header files into
dev.c.

>
> I think I'll try to extend your patches with TX completion recycling too.
>
> Ie record in skb the cpu number of original sender, and queue skb to
> remote queue for destruction (sock_wfree() call and expensive scheduler calls...)
>
We have also implemented a form of that, if you are interested.  In
dev_kfree_skb, put the skb on the completion list of the origin CPU of
the skb (where it was allocated) and use the remote softirq to schedule
processing.

Tom
Randy.Dunlap Nov. 11, 2009, 4:49 p.m. UTC | #3
On Tue, 10 Nov 2009 22:53:17 -0800 Tom Herbert wrote:

> Third version of RPS.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>  include/linux/interrupt.h |    1 +
>  include/linux/netdevice.h |   18 ++++
>  include/linux/skbuff.h    |    2 +
>  net/core/dev.c            |  227 ++++++++++++++++++++++++++++++++++++++-------
>  net/core/net-sysfs.c      |  135 +++++++++++++++++++++++++++
>  5 files changed, 348 insertions(+), 35 deletions(-)
> 

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 0c68fbd..95feac7 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -396,6 +396,8 @@ struct sk_buff {
> 
>  	__u16			vlan_tci;
> 
> +	__u32			rxhash;
> +

@rxhash needs to be added to the kernel-doc for struct sk_buff.

>  	sk_buff_data_t		transport_header;
>  	sk_buff_data_t		network_header;
>  	sk_buff_data_t		mac_header;

> diff --git a/net/core/dev.c b/net/core/dev.c
> index 28b0b9e..735e7e3 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1976,6 +1976,162 @@ int weight_p __read_mostly = 64;            /*
> old backlog weight */
> 
>  DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
> 
> +static u32 simple_hashrnd;
> +
> +/**
> + * get_rps_cpu is called from netif_receive_skb and returns the target
> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
> + */

"/**" in kernel source code means "begin kernel-doc notation", but that
is not kernel-doc notation, so please make it be kernel-doc, or don't
use "/**" to begin comment blocks.

(in several functions here...)

> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> +	u32 addr1, addr2, ports;
> +	struct ipv6hdr *ip6;
> +	struct iphdr *ip;
> +	u32 hash, ihl;
> +	u8 ip_proto;
> +	int cpu;
> +	struct rps_map *map = NULL;
> +
> +	if (dev->rps_num_maps) {
> +		/*
> +		 * Locate the map corresponding to the NAPI queue that
> +		 * the packet was received on.
> +		 */
> +		int index = skb_get_rx_queue(skb);
> +		if (index < 0 || index >= dev->rps_num_maps)
> +			index = 0;
> +
> +		map = (struct rps_map *)
> +		    (dev->rps_maps + (RPS_MAP_SIZE * index));
> +		if (!map->len)
> +			map = NULL;
> +	}
> +
> +	if (!map)
> +		return -1;
> +
> +	hash = skb->rxhash;
> +	if (hash)
> +		goto got_hash; /* Skip hash computation on packet header */
> +
> +	switch (skb->protocol) {
> +	case __constant_htons(ETH_P_IP):
> +		if (!pskb_may_pull(skb, sizeof(*ip)))
> +			return -1;
> +
> +		ip = (struct iphdr *) skb->data;
> +		ip_proto = ip->protocol;
> +		addr1 = ip->saddr;
> +		addr2 = ip->daddr;
> +		ihl = ip->ihl;
> +		break;
> +	case __constant_htons(ETH_P_IPV6):
> +		if (!pskb_may_pull(skb, sizeof(*ip6)))
> +			return -1;
> +
> +		ip6 = (struct ipv6hdr *) skb->data;
> +		ip_proto = ip6->nexthdr;
> +		addr1 = ip6->saddr.s6_addr32[3];
> +		addr2 = ip6->daddr.s6_addr32[3];
> +		ihl = (40 >> 2);
> +		break;
> +	default:
> +		return -1;
> +	}
> +	ports = 0;
> +	switch (ip_proto) {
> +	case IPPROTO_TCP:
> +	case IPPROTO_UDP:
> +	case IPPROTO_DCCP:
> +	case IPPROTO_ESP:
> +	case IPPROTO_AH:
> +	case IPPROTO_SCTP:
> +	case IPPROTO_UDPLITE:
> +		if (pskb_may_pull(skb, (ihl * 4) + 4))
> +			ports = *((u32 *) (skb->data + (ihl * 4)));
> +		break;
> +
> +	default:
> +		break;
> +	}
> +
> +	hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);
> +
> +got_hash:
> +	cpu = map->map[((u64) hash * map->len) >> 32];
> +
> +	return cpu_online(cpu) ? cpu : -1;
> +}

> +/**
> + * net_rps_action is called from NET_RPS_SOFTIRQ to do IPIs to schedule RX
> + * softirq on remote CPUs.  Called in a separate softirq to allow for
> + * coalescing.
> + */
> +static void net_rps_action(struct softirq_action *h)
> +{
> +	int cpu;
> +
> +	local_irq_disable();
> +
> +	for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
> +		struct softnet_data *queue = &per_cpu(softnet_data, cpu);
> +		__smp_call_function_single(cpu, &queue->csd, 0);
> +	}
> +	cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
> +
> +	local_irq_enable();
> +}
> +
> +/**
> + * enqueue_to_backlog is called to queue an skb to a per CPU backlog
> + * queue (may be a remote CPU queue).
> + */
> +static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
> +{


> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 753c420..ca250f6 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -18,6 +18,9 @@
>  #include <linux/wireless.h>
>  #include <net/wext.h>
> 
> +#include <linux/string.h>
> +#include <linux/ctype.h>
> +
>  #include "net-sysfs.h"
> 
>  #ifdef CONFIG_SYSFS
> @@ -249,6 +252,137 @@ static ssize_t store_tx_queue_len(struct device *dev,
>  	return netdev_store(dev, attr, buf, len, change_tx_queue_len);
>  }
> 
> +static char *
> +get_token(const char **cp, size_t *len)
> +{

nit: not kernel style.

---
~Randy
Andi Kleen Nov. 11, 2009, 9:43 p.m. UTC | #4
Tom Herbert <therbert@google.com> writes:

> Third version of RPS.

This really needs the text from 0/2 here as git changelog, otherwise
you make David's life hard if he wants to merge this.

> +
> +/*
>   *	The DEVICE structure.
>   *	Actually, this whole structure is a big mistake.  It mixes I/O
>   *	data with strictly "high-level" data, and it has to know about
> @@ -807,6 +819,9 @@ struct net_device
>  	void			*ax25_ptr;	/* AX.25 specific data */
>  	struct wireless_dev	*ieee80211_ptr;	/* IEEE 802.11 specific data,
>  						   assign before registering */
> +	void			*rps_maps;	/* Array of per-NAPI maps for
> +						   receive packet steeing */

Why is this void * here? This should be a real type.



> +	int			rps_num_maps;	/* Number of RPS maps */

This has a 4 byte hole on 64bit. Better move it somewhere else
where that isn't the case.

>
>  /*
>   * Cache line mostly used on receive path (including eth_type_trans())

Also make sure you don't destroy these cache line optimizations.


> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 0c68fbd..95feac7 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -396,6 +396,8 @@ struct sk_buff {
>
>  	__u16			vlan_tci;
>
> +	__u32			rxhash;

Similarly here.


> + * get_rps_cpu is called from netif_receive_skb and returns the target
> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
> + */
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> +	u32 addr1, addr2, ports;
> +	struct ipv6hdr *ip6;
> +	struct iphdr *ip;
> +	u32 hash, ihl;
> +	u8 ip_proto;
> +	int cpu;
> +	struct rps_map *map = NULL;
> +
> +	if (dev->rps_num_maps) {
> +		/*
> +		 * Locate the map corresponding to the NAPI queue that
> +		 * the packet was received on.
> +		 */
> +		int index = skb_get_rx_queue(skb);
> +		if (index < 0 || index >= dev->rps_num_maps)
> +			index = 0;
> +
> +		map = (struct rps_map *)
> +		    (dev->rps_maps + (RPS_MAP_SIZE * index));
> +		if (!map->len)
> +			map = NULL;

Can this really happen? Better might be to move this out of the fast path.

> +	switch (skb->protocol) {
> +	case __constant_htons(ETH_P_IP):
> +		if (!pskb_may_pull(skb, sizeof(*ip)))
> +			return -1;
> +
> +		ip = (struct iphdr *) skb->data;
> +		ip_proto = ip->protocol;
> +		addr1 = ip->saddr;
> +		addr2 = ip->daddr;
> +		ihl = ip->ihl;
> +		break;
> +	case __constant_htons(ETH_P_IPV6):
> +		if (!pskb_may_pull(skb, sizeof(*ip6)))
> +			return -1;
> +
> +		ip6 = (struct ipv6hdr *) skb->data;
> +		ip_proto = ip6->nexthdr;
> +		addr1 = ip6->saddr.s6_addr32[3];
> +		addr2 = ip6->daddr.s6_addr32[3];

Why only [3] ? Is this future proof?

> +/**
> + * net_rps_action is called from NET_RPS_SOFTIRQ to do IPIs to schedule RX
> + * softirq on remote CPUs.  Called in a separate softirq to allow for
> + * coalescing.
> + */
> +static void net_rps_action(struct softirq_action *h)
> +{
> +	int cpu;
> +
> +	local_irq_disable();
> +
> +	for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
> +		struct softnet_data *queue = &per_cpu(softnet_data, cpu);
> +		__smp_call_function_single(cpu, &queue->csd, 0);

How do you get around the standard deadlocks with IPI called from
irq disabled section?

And why are interrupts disabled here anyway?

> @@ -2696,21 +2842,24 @@ static int process_backlog(struct napi_struct
> *napi, int quota)
>  	int work = 0;
>  	struct softnet_data *queue = &__get_cpu_var(softnet_data);
>  	unsigned long start_time = jiffies;
> +	unsigned long flags;
>
>  	napi->weight = weight_p;
>  	do {
>  		struct sk_buff *skb;
>
>  		local_irq_disable();
> +		spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);

When you just disabled the irqs you obviously don't need an irqsave
in the next line.

>  		skb = __skb_dequeue(&queue->input_pkt_queue);
>  		if (!skb) {
>  			__napi_complete(napi);
> -			local_irq_enable();
> +			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
> +			    flags);
>  			break;

This will actually not turn on interrupts again because you only
saved them after disabling them.

>  		}
> -		local_irq_enable();
> +		spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);

Same

>
> -		netif_receive_skb(skb);
> +		__netif_receive_skb(skb);
>  	} while (++work < quota && jiffies == start_time);
>
>  	return work;
> @@ -5205,6 +5354,8 @@ void free_netdev(struct net_device *dev)
>  	/* Flush device addresses */
>  	dev_addr_flush(dev);
>
> +	kfree(dev->rps_maps);
> +
>  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>  		netif_napi_del(p);
>
> @@ -5644,6 +5795,10 @@ static int __init net_dev_init(void)
>  		queue->completion_queue = NULL;
>  		INIT_LIST_HEAD(&queue->poll_list);
>
> +		queue->csd.func = trigger_softirq;
> +		queue->csd.info = queue;
> +		queue->csd.flags = 0;
> +
>  		queue->backlog.poll = process_backlog;
>  		queue->backlog.weight = weight_p;
>  		queue->backlog.gro_list = NULL;
> @@ -5669,7 +5824,9 @@ static int __init net_dev_init(void)
>
>  	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
>  	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
> +	open_softirq(NET_RPS_SOFTIRQ, net_rps_action);
>
> +	get_random_bytes(&simple_hashrnd, 4);

It's a standard pet peeve of mine, but it's quite unlikely you'll
get any useful entropy at this time of kernel startup.

Normally it's always the same.
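
One common workaround, sketched minimally here (the rps_hashrnd() wrapper
and the seeded flag are assumptions, not something this thread proposes
verbatim), is to seed the key lazily on first use instead of at boot, and
drop the get_random_bytes() call from net_dev_init():

static bool simple_hashrnd_seeded;

static u32 rps_hashrnd(void)
{
	/*
	 * Seed the existing simple_hashrnd once traffic actually flows and
	 * the entropy pool has had time to mix.  A race between two CPUs
	 * here is harmless for steering purposes.
	 */
	if (unlikely(!simple_hashrnd_seeded)) {
		get_random_bytes(&simple_hashrnd, sizeof(simple_hashrnd));
		simple_hashrnd_seeded = true;
	}
	return simple_hashrnd;
}

get_rps_cpu() would then pass rps_hashrnd() to jhash_3words() instead of
reading simple_hashrnd directly.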


> +static char *
> +get_token(const char **cp, size_t *len)
> +{
> +	const char *bp = *cp, *start;
> +
> +	while (isspace(*bp))
> +		bp++;
> +
> +	start = bp;
> +	while (!isspace(*bp) && *bp != '\0')
> +		bp++;
> +
> +	if (start != bp)
> +		*len = bp - start;
> +	else
> +		start = NULL;
> +
> +	*cp = bp;
> +	return start;
> +}
> +
> +static ssize_t store_rps_cpus(struct device *dev,
> +    struct device_attribute *attr, const char *buf, size_t len)
> +{
> +	struct net_device *net = to_net_dev(dev);
> +	struct napi_struct *napi;
> +	cpumask_t mask;
> +	int err, cpu, index, i;
> +	int cnt = 0;
> +	char *token;
> +	const char *cp = buf;
> +	size_t tlen;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	/*
> +	 * Pre-check that tokens parse properly before we commit to making
> +	 * any changes.
> +	 */
> +	while ((token = get_token(&cp, &tlen)))
> +		err = bitmap_parse(token, tlen, cpumask_bits(&mask),
> +		    nr_cpumask_bits);
> +
> +	if (err)
> +		return err;
> +
> +	rtnl_lock();

It seems weird to do user parsing while holding that lock.
Better first set up and allocate and then finally initialize global state.

> +	if (dev_isalive(net)) {

Especially since the device is alive. So what happens to interrupts
coming in in parallel? That seems racy.

+
+	queue = &per_cpu(softnet_data, cpu);
+	spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
+
+	__get_cpu_var(netdev_rx_stat).total++;
+	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {


It seems weird to do the local counter increase after grabbing
the global lock. Also, does anything count this on the real receiver
anyway? That might be better; right now the count would be a bit
misleading.

-Andi
Andi Kleen Nov. 11, 2009, 9:44 p.m. UTC | #5
Eric Dumazet <eric.dumazet@gmail.com> writes:
>
> Problem of possible cpus is the number can be very large on some arches,
> but yet few cpus online....

Actually that should be rare. Especially not very large compared to online.

The only case left is virtualization, but there the ratio of online
cores vs possible is not that large typically.

-Andi
David Miller Nov. 12, 2009, 2:32 a.m. UTC | #6
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 11 Nov 2009 22:44:25 +0100

> Eric Dumazet <eric.dumazet@gmail.com> writes:
>>
>> Problem of possible cpus is the number can be very large on some arches,
>> but yet few cpus online....
> 
> Actually that should be rare. Especially not very large compared to online.

The ratio can be as high as 256/1 on Niagara boxes.
Eric Dumazet Nov. 12, 2009, 8:23 p.m. UTC | #7
Tom Herbert wrote:
> Third version of RPS.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>  include/linux/interrupt.h |    1 +
>  include/linux/netdevice.h |   18 ++++
>  include/linux/skbuff.h    |    2 +
>  net/core/dev.c            |  227 ++++++++++++++++++++++++++++++++++++++-------
>  net/core/net-sysfs.c      |  135 +++++++++++++++++++++++++++
>  5 files changed, 348 insertions(+), 35 deletions(-)
> 



@@ -2696,21 +2842,24 @@ static int process_backlog(struct napi_struct
*napi, int quota)
 	int work = 0;
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
+	unsigned long flags;

 	napi->weight = weight_p;
 	do {
 		struct sk_buff *skb;

<<HERE>>	local_irq_disable();
+		spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
 		skb = __skb_dequeue(&queue->input_pkt_queue);
 		if (!skb) {
 			__napi_complete(napi);
-			local_irq_enable();
+			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+			    flags);
 			break;
 		}
-		local_irq_enable();
+		spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);

-		netif_receive_skb(skb);
+		__netif_receive_skb(skb);
 	} while (++work < quota && jiffies == start_time);

 	return work;

Not sure you still want the local_irq_disable() before 
spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);  ?
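
For reference, a minimal sketch of that dequeue with the redundant irq
toggling dropped (surrounding loop elided); spin_lock_irqsave() already
disables local interrupts on its own:

		spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
		skb = __skb_dequeue(&queue->input_pkt_queue);
		if (!skb) {
			__napi_complete(napi);
			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
			    flags);
			break;
		}
		spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);

		__netif_receive_skb(skb);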

David Miller Nov. 16, 2009, 11:15 a.m. UTC | #8
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Nov 2009 09:20:42 +0100

> I think I'll try to extend your patches with TX completion recycling too.
>
> Ie record in skb the cpu number of original sender, and queue skb to
> remote queue for destruction (sock_wfree() call and expensive
> scheduler calls...)
>
> (This probably needs driver cooperation, instead of calling consume_skb(),
> use a different function)

You can add a new argument to consume_skb() which indicates to remote
schedule a local free.

I would also suggest recording the TX cpu at dev_hard_start_xmit()
time, rather than somewhere higher up such as the socket layer.
Otherwise you'll mess up routing/netfilter cases, and also mishandle
task migration.

But a very excellent idea.

David Miller Nov. 16, 2009, 11:19 a.m. UTC | #9
From: Tom Herbert <therbert@google.com>
Date: Tue, 10 Nov 2009 22:53:17 -0800

> +		/* Schedule NAPI for backlog device */
> +		if (napi_schedule_prep(&queue->backlog)) {
> +			if (cpu != smp_processor_id()) {
> +				cpu_set(cpu,
> +				    get_cpu_var(rps_remote_softirq_cpus));
> +				__raise_softirq_irqoff(NET_RPS_SOFTIRQ);
> +			} else
> +				__napi_schedule(&queue->backlog);
> +		}
> +		goto enqueue;

{,__}send_remote_softirq() doesn't work? :-)
Tom Herbert Nov. 16, 2009, 4:43 p.m. UTC | #10
On Mon, Nov 16, 2009 at 3:19 AM, David Miller <davem@davemloft.net> wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Tue, 10 Nov 2009 22:53:17 -0800
>
>> +             /* Schedule NAPI for backlog device */
>> +             if (napi_schedule_prep(&queue->backlog)) {
>> +                     if (cpu != smp_processor_id()) {
>> +                             cpu_set(cpu,
>> +                                 get_cpu_var(rps_remote_softirq_cpus));
>> +                             __raise_softirq_irqoff(NET_RPS_SOFTIRQ);
>> +                     } else
>> +                             __napi_schedule(&queue->backlog);
>> +             }
>> +             goto enqueue;
>
> {,__}send_remote_softirq() doesn't work? :-)
>
NET_RPS_SOFTIRQ is intended to provide coalescing of IPIs.

send_remote_softirq could be used, but we would also need to get the
napi structure on the remote cpu poll list so that would probably need
to be protected by locks (something like __napi_schedule_oncpu could
be defined).  Would this be better to do?
Tom Herbert Nov. 16, 2009, 5:02 p.m. UTC | #11
>> +     case __constant_htons(ETH_P_IPV6):
>> +             if (!pskb_may_pull(skb, sizeof(*ip6)))
>> +                     return -1;
>> +
>> +             ip6 = (struct ipv6hdr *) skb->data;
>> +             ip_proto = ip6->nexthdr;
>> +             addr1 = ip6->saddr.s6_addr32[3];
>> +             addr2 = ip6->daddr.s6_addr32[3];
>
> Why only [3] ? Is this future proof?
>
No.  But it's the same as inet6_ehashfn :-)
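
For what it's worth, a minimal sketch of a more future-proof variant that
folds both full 128-bit addresses into the flow hash with jhash2() (this
is not what the patch, or inet6_ehashfn, does today):

	/* IPv6 case of get_rps_cpu(), hashing the whole addresses. */
	hash = jhash2(ip6->saddr.s6_addr32, 4, simple_hashrnd);
	hash = jhash2(ip6->daddr.s6_addr32, 4, hash);
	hash = jhash_3words(hash, ports, ip_proto, simple_hashrnd);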

>> +     for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
>> +             struct softnet_data *queue = &per_cpu(softnet_data, cpu);
>> +             __smp_call_function_single(cpu, &queue->csd, 0);
>
> How do you get around the standard deadlocks with IPI called from
> irq disabled section?
>

What are the standard deadlocks?  Looks like __send_remote_softirq
will call __smp_call_function with irq's disabled...

> And why are the interrupts are disabled here anyways?
>

Protects rps_remote_softirq_cpus.


> It's a standard pet peeve of me, but it's quite unlikely you'll
> get any useful entropy at this time of kernel startup.
>
> Normally it's always the same.
>
Would it make sense to just use skb_tx_hashrnd for the receive hash
key also (renaming it to be more general)?


>> +     if (err)
>> +             return err;
>> +
>> +     rtnl_lock();
>
> It seems weird to do user parsing while holding that lock.
> Better first set up and allocate and then finally initialize global state.

Yes.  We could build a new map each time and then insert it into the
device structure under RCU (which I hope would be sufficient
locking).
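
Roughly, a sketch of that update path (the matching read side in
get_rps_cpu() would use rcu_read_lock()/rcu_dereference(); none of this is
in the current patch):

	void *new_maps, *old_maps;

	new_maps = kzalloc(RPS_MAP_SIZE * cnt, GFP_KERNEL);
	if (!new_maps)
		return -ENOMEM;
	/* ... parse the sysfs input into new_maps, outside any lock ... */

	old_maps = net->rps_maps;
	rcu_assign_pointer(net->rps_maps, new_maps);
	net->rps_num_maps = cnt;
	synchronize_rcu();	/* wait for readers still using old_maps */
	kfree(old_maps);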

Thanks,
Tom
Jarek Poplawski Nov. 17, 2009, 9:32 p.m. UTC | #12
Tom Herbert wrote, On 11/11/2009 07:53 AM:

> Third version of RPS.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
...
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> +	u32 addr1, addr2, ports;
> +	struct ipv6hdr *ip6;
> +	struct iphdr *ip;
> +	u32 hash, ihl;
> +	u8 ip_proto;
> +	int cpu;
> +	struct rps_map *map = NULL;
> +
> +	if (dev->rps_num_maps) {
> +		/*
> +		 * Locate the map corresponding to the NAPI queue that
> +		 * the packet was received on.
> +		 */
> +		int index = skb_get_rx_queue(skb);
> +		if (index < 0 || index >= dev->rps_num_maps)

skb_get_rx_queue() returns u16, so 'index < 0' seems wrong here.

Jarek P.
David Miller Nov. 18, 2009, 7:21 a.m. UTC | #13
From: Tom Herbert <therbert@google.com>
Date: Mon, 16 Nov 2009 08:43:05 -0800

> On Mon, Nov 16, 2009 at 3:19 AM, David Miller <davem@davemloft.net> wrote:
>> {,__}send_remote_softirq() doesn't work? :-)
>>
> NET_RPS_SOFTIRQ is intended to provide coalescing of IPIs.
> 
> send_remote_softirq could be used, but we would also need to get the
> napi structure on the remote cpu poll list so that would probably need
> to be protected by locks (something like __napi_schedule_oncpu could
> be defined).  Would this be better to do?

We talked about this several times in the past too.  Let me think some
more about this, I want to consider all of the issues a bit before
making any suggestions.

Thanks.
Jarek Poplawski Nov. 19, 2009, 8:08 a.m. UTC | #14
On 16-11-2009 17:43, Tom Herbert wrote:
> On Mon, Nov 16, 2009 at 3:19 AM, David Miller <davem@davemloft.net> wrote:
>> From: Tom Herbert <therbert@google.com>
>> Date: Tue, 10 Nov 2009 22:53:17 -0800
>>
>>> +             /* Schedule NAPI for backlog device */
>>> +             if (napi_schedule_prep(&queue->backlog)) {
>>> +                     if (cpu != smp_processor_id()) {
>>> +                             cpu_set(cpu,
>>> +                                 get_cpu_var(rps_remote_softirq_cpus));
>>> +                             __raise_softirq_irqoff(NET_RPS_SOFTIRQ);
>>> +                     } else
>>> +                             __napi_schedule(&queue->backlog);
>>> +             }
>>> +             goto enqueue;
>> {,__}send_remote_softirq() doesn't work? :-)
>>
> NET_RPS_SOFTIRQ is intended to provide coalescing of IPIs.

It seems calling net_rps_action() at the end of net_rx_action() should
do (mostly) the same, at least for napi drivers. And I'm not sure it's
worth adding a new softirq just for the non-napi ones.

Jarek P.
Jarek Poplawski Nov. 19, 2009, 9:57 a.m. UTC | #15
On 11-11-2009 07:53, Tom Herbert wrote:
> Third version of RPS.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
...
> @@ -2266,10 +2401,10 @@ void netif_nit_deliver(struct sk_buff *skb)
>  }
> 
>  /**
> - *	netif_receive_skb - process receive buffer from network
> + *	__netif_receive_skb - process receive buffer from network
>   *	@skb: buffer to process
>   *
> - *	netif_receive_skb() is the main receive data processing function.
> + *	__netif__napireceive_skb() is the main receive data processing function.

--------------->^^^^^^^^^^^^ ?

>   *	It always succeeds. The buffer may be dropped during processing
>   *	for congestion control or by the protocol layers.
>   *
> @@ -2280,7 +2415,8 @@ void netif_nit_deliver(struct sk_buff *skb)
>   *	NET_RX_SUCCESS: no congestion
>   *	NET_RX_DROP: packet was dropped
>   */
> -int netif_receive_skb(struct sk_buff *skb)
> +
> +int __netif_receive_skb(struct sk_buff *skb)
>  {
>  	struct packet_type *ptype, *pt_prev;
>  	struct net_device *orig_dev;
> @@ -2378,6 +2514,16 @@ out:
>  }
>  EXPORT_SYMBOL(netif_receive_skb);
> 
> +int netif_receive_skb(struct sk_buff *skb)
> +{
> +	int cpu = get_rps_cpu(skb->dev, skb);
> +
> +	if (cpu < 0)

The description reads: "This solution queues packets early on in the
receive path on the backlog queues of other CPUs.", so I'm not sure
it's intended. Did you test it like this (and it was visibly worse)?:

	if (cpu < 0 || cpu == smp_processor_id())

> +		return __netif_receive_skb(skb);
> +	else
> +		return enqueue_to_backlog(skb, cpu);
> +}
> +

Jarek P.
Andi Kleen Nov. 19, 2009, 10:08 a.m. UTC | #16
On Mon, Nov 16, 2009 at 09:02:32AM -0800, Tom Herbert wrote:

Sorry for the late answer.

> >> +     case __constant_htons(ETH_P_IPV6):
> >> +             if (!pskb_may_pull(skb, sizeof(*ip6)))
> >> +                     return -1;
> >> +
> >> +             ip6 = (struct ipv6hdr *) skb->data;
> >> +             ip_proto = ip6->nexthdr;
> >> +             addr1 = ip6->saddr.s6_addr32[3];
> >> +             addr2 = ip6->daddr.s6_addr32[3];
> >
> > Why only [3] ? Is this future proof?
> >
> No.  But it's same as inet6_ehashfn :-)

Perhaps it would be good to consolidate all these ipv6 hashes
into one place where they could at least be fixed easily.

> 
> >> +     for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
> >> +             struct softnet_data *queue = &per_cpu(softnet_data, cpu);
> >> +             __smp_call_function_single(cpu, &queue->csd, 0);
> >
> > How do you get around the standard deadlocks with IPI called from
> > irq disabled section?
> >
> 
> What are the standard deadlocks?  Looks like __send_remote_softirq
> will call __smp_call_function with irq's disabled...

The traditional deadlock (that was before the queue smp_call_function)
was

A                        B
                         grab lock
interrupts off
spin on lock                 
                         send IPI
                         wait for specific CPU

never answers because
interrupts are off
                         hangs forever


I think with the queued smp_call_function it's better because
the locks are only held for a much shorter time and that particular scenario
is gone, but I'm not sure the problem has fully gone away.

At least there are still plenty of WARN_ON( ... irqs_disabled()) in 
kernel/smp.c


> > It's a standard pet peeve of me, but it's quite unlikely you'll
> > get any useful entropy at this time of kernel startup.
> >
> > Normally it's always the same.
> >
> Would it make sense to just use skb_tx_hashrnd for the receive hash
> key also (renaming it to be more general)?

That has the same problem, although it's at least a bit later,
but I suspect it would still not be very random.

You could just drop it and always use a constant hash rnd?

-Andi
Tom Herbert Nov. 20, 2009, 6:41 a.m. UTC | #17
>> What are the standard deadlocks?  Looks like __send_remote_softirq
>> will call __smp_call_function with irq's disabled...
>
> The traditional deadlock (that was before the queue smp_call_function)
> was
>
> A                        B
>                         grab lock
> interrupts off
> spin on lock
>                         send IPI
>                         wait for specific CPU
>
> never answers because
> interrupts are off
>                         hangs forever
>
>
> I think with the queued smp_call_function it's better because
> the locks are only hold much shorter and that particular scenario
> is gone, but I'm not sure the problem has fully gone away.
>
> At least there are still plenty of WARN_ON( ... irqs_disabled()) in
> kernel/smp.c
>

So is send_remote_softirq also broken according to this?
Eric Dumazet Nov. 20, 2009, 6:49 a.m. UTC | #18
Tom Herbert wrote:
> 
> So is send_remote_softirq also broken according to this?

I found this function not usable as is, anyway.
I tried to use it for XPS but failed.

Before calling it, we must put the work on a remote queue,
but if the work has to be done by the current cpu, we cannot remove this
work to put it on our own queue. (We don't get a status back from send_remote_softirq().)


Tom Herbert Nov. 20, 2009, 5:08 p.m. UTC | #19
> The description reads: "This solution queues packets early on in the
> receive path on the backlog queues of other CPUs.", so I'm not sure
> it's intended.

That is precisely the intent.  Getting packets quickly distributed to
the target cpus maximizes parallelism and reduces latency.

> Did you test it like this (and it was visibly worse)?:
>
>        if (cpu < 0 || cpu == smp_processor_id())
>
>> +             return __netif_receive_skb(skb);
>> +     else
>> +             return enqueue_to_backlog(skb, cpu);
>> +}
>> +
This increases overall latency due to head of line blocking which will
outweigh the benefits of optimizing for this one case.

Tom
Jarek Poplawski Nov. 20, 2009, 7 p.m. UTC | #20
On Fri, Nov 20, 2009 at 09:08:10AM -0800, Tom Herbert wrote:
> > The description reads: "This solution queues packets early on in the
> > receive path on the backlog queues of other CPUs.", so I'm not sure
> > it's intended.
> 
> That is precisely the intent.  Getting packets quickly distributed to
> the target cpus maximizes parallelism and reduces latency.

Then precisely the intent is "the backlog queues of target CPUs".
"The backlog queues of other CPUs" may suggest that e.g. one cpu is
only doing distribution etc.

> 
> Did you test it like this (and it was visibly worse)?:
> >
> >        if (cpu < 0 || cpu == smp_processor_id())
> >
> >> +             return __netif_receive_skb(skb);
> >> +     else
> >> +             return enqueue_to_backlog(skb, cpu);
> >> +}
> >> +
> This increases overall latency due to head of line blocking which will
> outweigh the benefits of optimizing for this one case.

The way I asked should suggest I "suspected" it's on purpose, and was
curious about "digits", but thanks for confirming this.

Jarek P.
David Miller Nov. 20, 2009, 10:52 p.m. UTC | #21
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Thu, 19 Nov 2009 08:08:31 +0000

> On 16-11-2009 17:43, Tom Herbert wrote:
>> NET_RPS_SOFTIRQ is intended to provide coalescing of IPIs.
> 
> It seems calling net_rps_action() at the end of net_rx_action() should
> do (mostly) the same, at least for napi drivers. And I'm not sure it's
> worth to add a new softirq because of non-napis.

I agree.  This is how we handle all of these kinds of issues.

And with GRO, you're going to eat most of any arguable latency cost
this may have anyway.
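
A minimal sketch of that hookup (this is the suggestion being agreed on,
not what the patch currently does; net_rx_action() internals are elided):

static void net_rx_action(struct softirq_action *h)
{
	/* ... existing poll-list processing ... */

	/* Fire the IPIs accumulated in rps_remote_softirq_cpus this round. */
	net_rps_action(NULL);
}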

Patch

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index b78cf81..fa91194 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -345,6 +345,7 @@  enum
 	TIMER_SOFTIRQ,
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
+	NET_RPS_SOFTIRQ,
 	BLOCK_SOFTIRQ,
 	BLOCK_IOPOLL_SOFTIRQ,
 	TASKLET_SOFTIRQ,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8380009..c1b1bbb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -639,6 +639,18 @@  struct net_device_ops {
 };

 /*
+ * Structure for Receive Packet Steering.  Length of map and array of CPU ID's.
+ */
+struct rps_map {
+	int len;
+	u16 map[0];
+};
+
+/* Maximum size of RPS map (for allocation) */
+#define RPS_MAP_SIZE (sizeof(struct rps_map) + \
+    (num_possible_cpus() * sizeof(u16)))
+
+/*
  *	The DEVICE structure.
  *	Actually, this whole structure is a big mistake.  It mixes I/O
  *	data with strictly "high-level" data, and it has to know about
@@ -807,6 +819,9 @@  struct net_device
 	void			*ax25_ptr;	/* AX.25 specific data */
 	struct wireless_dev	*ieee80211_ptr;	/* IEEE 802.11 specific data,
 						   assign before registering */
+	void			*rps_maps;	/* Array of per-NAPI maps for
+						   receive packet steeing */
+	int			rps_num_maps;	/* Number of RPS maps */

 /*
  * Cache line mostly used on receive path (including eth_type_trans())
@@ -1217,6 +1232,9 @@  struct softnet_data
 	struct Qdisc		*output_queue;
 	struct sk_buff_head	input_pkt_queue;
 	struct list_head	poll_list;
+
+	struct call_single_data	csd;
+
 	struct sk_buff		*completion_queue;

 	struct napi_struct	backlog;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0c68fbd..95feac7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -396,6 +396,8 @@  struct sk_buff {

 	__u16			vlan_tci;

+	__u32			rxhash;
+
 	sk_buff_data_t		transport_header;
 	sk_buff_data_t		network_header;
 	sk_buff_data_t		mac_header;
diff --git a/net/core/dev.c b/net/core/dev.c
index 28b0b9e..735e7e3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1976,6 +1976,162 @@  int weight_p __read_mostly = 64;            /* old backlog weight */

 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };

+static u32 simple_hashrnd;
+
+/**
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving NAPI instance for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+	u32 addr1, addr2, ports;
+	struct ipv6hdr *ip6;
+	struct iphdr *ip;
+	u32 hash, ihl;
+	u8 ip_proto;
+	int cpu;
+	struct rps_map *map = NULL;
+
+	if (dev->rps_num_maps) {
+		/*
+		 * Locate the map corresponding to the NAPI queue that
+		 * the packet was received on.
+		 */
+		int index = skb_get_rx_queue(skb);
+		if (index < 0 || index >= dev->rps_num_maps)
+			index = 0;
+
+		map = (struct rps_map *)
+		    (dev->rps_maps + (RPS_MAP_SIZE * index));
+		if (!map->len)
+			map = NULL;
+	}
+
+	if (!map)
+		return -1;
+
+	hash = skb->rxhash;
+	if (hash)
+		goto got_hash; /* Skip hash computation on packet header */
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(*ip)))
+			return -1;
+
+		ip = (struct iphdr *) skb->data;
+		ip_proto = ip->protocol;
+		addr1 = ip->saddr;
+		addr2 = ip->daddr;
+		ihl = ip->ihl;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		if (!pskb_may_pull(skb, sizeof(*ip6)))
+			return -1;
+
+		ip6 = (struct ipv6hdr *) skb->data;
+		ip_proto = ip6->nexthdr;
+		addr1 = ip6->saddr.s6_addr32[3];
+		addr2 = ip6->daddr.s6_addr32[3];
+		ihl = (40 >> 2);
+		break;
+	default:
+		return -1;
+	}
+	ports = 0;
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		if (pskb_may_pull(skb, (ihl * 4) + 4))
+			ports = *((u32 *) (skb->data + (ihl * 4)));
+		break;
+
+	default:
+		break;
+	}
+
+	hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);
+
+got_hash:
+	cpu = map->map[((u64) hash * map->len) >> 32];
+
+	return cpu_online(cpu) ? cpu : -1;
+}
+
+static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+	struct softnet_data *queue = data;
+	__napi_schedule(&queue->backlog);
+}
+
+/**
+ * net_rps_action is called from NET_RPS_SOFTIRQ to do IPIs to schedule RX
+ * softirq on remote CPUs.  Called in a separate softirq to allow for
+ * coalescing.
+ */
+static void net_rps_action(struct softirq_action *h)
+{
+	int cpu;
+
+	local_irq_disable();
+
+	for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+		struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+		__smp_call_function_single(cpu, &queue->csd, 0);
+	}
+	cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
+
+	local_irq_enable();
+}
+
+/**
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+	struct softnet_data *queue;
+	unsigned long flags;
+
+	queue = &per_cpu(softnet_data, cpu);
+	spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
+
+	__get_cpu_var(netdev_rx_stat).total++;
+	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+		if (queue->input_pkt_queue.qlen) {
+enqueue:
+			__skb_queue_tail(&queue->input_pkt_queue, skb);
+			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+			    flags);
+			return NET_RX_SUCCESS;
+		}
+
+		/* Schedule NAPI for backlog device */
+		if (napi_schedule_prep(&queue->backlog)) {
+			if (cpu != smp_processor_id()) {
+				cpu_set(cpu,
+				    get_cpu_var(rps_remote_softirq_cpus));
+				__raise_softirq_irqoff(NET_RPS_SOFTIRQ);
+			} else
+				__napi_schedule(&queue->backlog);
+		}
+		goto enqueue;
+	}
+
+	__get_cpu_var(netdev_rx_stat).dropped++;
+	spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
+
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}

 /**
  *	netif_rx	-	post buffer to the network code
@@ -1994,8 +2150,7 @@  DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };

 int netif_rx(struct sk_buff *skb)
 {
-	struct softnet_data *queue;
-	unsigned long flags;
+	int cpu;

 	/* if netpoll wants it, pretend we never saw it */
 	if (netpoll_rx(skb))
@@ -2004,31 +2159,11 @@  int netif_rx(struct sk_buff *skb)
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);

-	/*
-	 * The code is rearranged so that the path is the most
-	 * short when CPU is congested, but is still operating.
-	 */
-	local_irq_save(flags);
-	queue = &__get_cpu_var(softnet_data);
+	cpu = get_rps_cpu(skb->dev, skb);
+	if (cpu < 0)
+		cpu = smp_processor_id();

-	__get_cpu_var(netdev_rx_stat).total++;
-	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (queue->input_pkt_queue.qlen) {
-enqueue:
-			__skb_queue_tail(&queue->input_pkt_queue, skb);
-			local_irq_restore(flags);
-			return NET_RX_SUCCESS;
-		}
-
-		napi_schedule(&queue->backlog);
-		goto enqueue;
-	}
-
-	__get_cpu_var(netdev_rx_stat).dropped++;
-	local_irq_restore(flags);
-
-	kfree_skb(skb);
-	return NET_RX_DROP;
+	return enqueue_to_backlog(skb, cpu);
 }
 EXPORT_SYMBOL(netif_rx);

@@ -2266,10 +2401,10 @@  void netif_nit_deliver(struct sk_buff *skb)
 }

 /**
- *	netif_receive_skb - process receive buffer from network
+ *	__netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
  *
- *	netif_receive_skb() is the main receive data processing function.
+ *	__netif__napireceive_skb() is the main receive data processing function.
  *	It always succeeds. The buffer may be dropped during processing
  *	for congestion control or by the protocol layers.
  *
@@ -2280,7 +2415,8 @@  void netif_nit_deliver(struct sk_buff *skb)
  *	NET_RX_SUCCESS: no congestion
  *	NET_RX_DROP: packet was dropped
  */
-int netif_receive_skb(struct sk_buff *skb)
+
+int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	struct net_device *orig_dev;
@@ -2378,6 +2514,16 @@  out:
 }
 EXPORT_SYMBOL(netif_receive_skb);

+int netif_receive_skb(struct sk_buff *skb)
+{
+	int cpu = get_rps_cpu(skb->dev, skb);
+
+	if (cpu < 0)
+		return __netif_receive_skb(skb);
+	else
+		return enqueue_to_backlog(skb, cpu);
+}
+
 /* Network device is going away, flush any packets still pending  */
 static void flush_backlog(void *arg)
 {
@@ -2421,7 +2567,7 @@  static int napi_gro_complete(struct sk_buff *skb)
 	}

 out:
-	return netif_receive_skb(skb);
+	return __netif_receive_skb(skb);
 }

 void napi_gro_flush(struct napi_struct *napi)
@@ -2554,7 +2700,7 @@  int napi_skb_finish(int ret, struct sk_buff *skb)

 	switch (ret) {
 	case GRO_NORMAL:
-		return netif_receive_skb(skb);
+		return __netif_receive_skb(skb);

 	case GRO_DROP:
 		err = NET_RX_DROP;
@@ -2625,7 +2771,7 @@  int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
 		skb->protocol = eth_type_trans(skb, napi->dev);

 		if (ret == GRO_NORMAL)
-			return netif_receive_skb(skb);
+			return __netif_receive_skb(skb);

 		skb_gro_pull(skb, -ETH_HLEN);
 		break;
@@ -2696,21 +2842,24 @@  static int process_backlog(struct napi_struct *napi, int quota)
 	int work = 0;
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
+	unsigned long flags;

 	napi->weight = weight_p;
 	do {
 		struct sk_buff *skb;

 		local_irq_disable();
+		spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
 		skb = __skb_dequeue(&queue->input_pkt_queue);
 		if (!skb) {
 			__napi_complete(napi);
-			local_irq_enable();
+			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+			    flags);
 			break;
 		}
-		local_irq_enable();
+		spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);

-		netif_receive_skb(skb);
+		__netif_receive_skb(skb);
 	} while (++work < quota && jiffies == start_time);

 	return work;
@@ -5205,6 +5354,8 @@  void free_netdev(struct net_device *dev)
 	/* Flush device addresses */
 	dev_addr_flush(dev);

+	kfree(dev->rps_maps);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);

@@ -5644,6 +5795,10 @@  static int __init net_dev_init(void)
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);

+		queue->csd.func = trigger_softirq;
+		queue->csd.info = queue;
+		queue->csd.flags = 0;
+
 		queue->backlog.poll = process_backlog;
 		queue->backlog.weight = weight_p;
 		queue->backlog.gro_list = NULL;
@@ -5669,7 +5824,9 @@  static int __init net_dev_init(void)

 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
+	open_softirq(NET_RPS_SOFTIRQ, net_rps_action);

+	get_random_bytes(&simple_hashrnd, 4);
 	hotcpu_notifier(dev_cpu_callback, 0);
 	dst_init();
 	dev_mcast_init();
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 753c420..ca250f6 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,6 +18,9 @@ 
 #include <linux/wireless.h>
 #include <net/wext.h>

+#include <linux/string.h>
+#include <linux/ctype.h>
+
 #include "net-sysfs.h"

 #ifdef CONFIG_SYSFS
@@ -249,6 +252,137 @@  static ssize_t store_tx_queue_len(struct device *dev,
 	return netdev_store(dev, attr, buf, len, change_tx_queue_len);
 }

+static char *
+get_token(const char **cp, size_t *len)
+{
+	const char *bp = *cp, *start;
+
+	while (isspace(*bp))
+		bp++;
+
+	start = bp;
+	while (!isspace(*bp) && *bp != '\0')
+		bp++;
+
+	if (start != bp)
+		*len = bp - start;
+	else
+		start = NULL;
+
+	*cp = bp;
+	return start;
+}
+
+static ssize_t store_rps_cpus(struct device *dev,
+    struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct net_device *net = to_net_dev(dev);
+	struct napi_struct *napi;
+	cpumask_t mask;
+	int err, cpu, index, i;
+	int cnt = 0;
+	char *token;
+	const char *cp = buf;
+	size_t tlen;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/*
+	 * Pre-check that tokens parse properly before we commit to making
+	 * any changes.
+	 */
+	while ((token = get_token(&cp, &tlen)))
+		err = bitmap_parse(token, tlen, cpumask_bits(&mask),
+		    nr_cpumask_bits);
+
+	if (err)
+		return err;
+
+	rtnl_lock();
+	if (dev_isalive(net)) {
+		if (!net->rps_maps) {
+			/*
+			 * Need to allocate the array of RPS maps, one map
+			 * for each NAPI instance on the device.
+			 */
+			list_for_each_entry(napi, &net->napi_list, dev_list)
+				cnt++;
+			net->rps_maps = kzalloc(RPS_MAP_SIZE * cnt, GFP_KERNEL);
+			if (!net->rps_maps) {
+				rtnl_unlock();
+				return -ENOMEM;
+			}
+			net->rps_num_maps = cnt;
+		}
+
+		cp = buf;
+		for (index = 0; index < net->rps_num_maps &&
+		   (token = get_token(&cp, &tlen)); index++) {
+			struct rps_map *map = (struct rps_map *)
+			    (net->rps_maps + (RPS_MAP_SIZE * index));
+			err = bitmap_parse(token, tlen, cpumask_bits(&mask),
+			    nr_cpumask_bits);
+			if (!err) {
+				cpus_and(mask, mask, cpu_online_map);
+				i = 0;
+				for_each_cpu_mask(cpu, mask)
+					map->map[i++] =  cpu;
+				map->len = i;
+			} else {
+				rtnl_unlock();
+				return err;
+			}
+		}
+
+		/*
+		 * Any per NAPI maps not being set are "zeroed" by setting
+		 * map length to zero.
+		 */
+		for (; index < net->rps_num_maps; index++) {
+			struct rps_map *map = (struct rps_map *)
+			    (net->rps_maps + (RPS_MAP_SIZE * index));
+			map->len = 0;
+		}
+	}
+	rtnl_unlock();
+
+	return len;
+}
+
+static ssize_t show_rps_cpus(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct net_device *net = to_net_dev(dev);
+	size_t len = 0;
+	cpumask_t mask;
+	int i, j;
+
+	read_lock(&dev_base_lock);
+	if (dev_isalive(net)) {
+		for (j = 0; j < net->rps_num_maps; j++) {
+			struct rps_map *map = (struct rps_map *)
+			    (net->rps_maps + (RPS_MAP_SIZE * j));
+			cpus_clear(mask);
+			for (i = 0; i < map->len; i++)
+				cpu_set(map->map[i], mask);
+
+			len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
+			if (PAGE_SIZE - len < 3) {
+				read_unlock(&dev_base_lock);
+				return -EINVAL;
+			}
+			if (j < net->rps_num_maps)
+				len += sprintf(buf + len, " ");
+		}
+	}
+
+	read_unlock(&dev_base_lock);
+
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
 static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
 			     const char *buf, size_t len)
 {
@@ -305,6 +439,7 @@  static struct device_attribute net_class_attributes[] = {
 	__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
 	__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
 	       store_tx_queue_len),
+	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus, store_rps_cpus),
 	{}
 };