diff mbox

ifb: add multi-queue support

Message ID 4AFA8911.7050204@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Changli Gao Nov. 11, 2009, 9:51 a.m. UTC
ifb: add multi-queue support

Add multi-queue support, and one kernel thread is created per queue.
It can be used to emulate a multi-queue NIC in software, and distribute work
among CPUs.
gentux linux # modprobe ifb numtxqs=2
gentux linux # ifconfig ifb0 up
gentux linux # pgrep ifb0
18508
18509
gentux linux # taskset -p 1 18508
pid 18508's current affinity mask: 3
pid 18508's new affinity mask: 1
gentux linux # taskset -p 2 18509
pid 18509's current affinity mask: 3
pid 18509's new affinity mask: 2
gentux linux # tc qdisc add dev br0 ingress
gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
action mirred egress redirect dev ifb0

This patch also introduces an ip link option "numtxqs" for specifying the
number of TX queues, so you can add a new ifb with the command:

ip link add numtxqs 4 type ifb

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
drivers/net/ifb.c | 414 ++++++++++++++++++++++++++++++++----------------
include/linux/if_link.h | 1
net/core/rtnetlink.c | 3
3 files changed, 283 insertions(+), 135 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Changli Gao Nov. 11, 2009, 9:56 a.m. UTC | #1
2009/11/11 Changli Gao <xiaosuo@gmail.com>:
> ifb: add multi-queue support
>
> Add multi-queue support, and one kernel thread is created for per queue.
> It can used to emulate multi-queue NIC in software, and distribute work
> among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
> action mirred egress redirect dev ifb0
>
> This patch also introduces a ip link option "numtxqs" for specifying the
> number of the TX queues. so you can add a new ifb with the command:
>
> ip link add numtxqs 4 type ifb
>

The corresponding patch for iproute2 is attached too. And you need to
update the Linux header files as well.
Eric Dumazet Nov. 11, 2009, 10:30 a.m. UTC | #2
Changli Gao a écrit :
> ifb: add multi-queue support
> 
> Add multi-queue support, and one kernel thread is created for per queue.
> It can used to emulate multi-queue NIC in software, and distribute work
> among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
> action mirred egress redirect dev ifb0
> 
> This patch also introduces a ip link option "numtxqs" for specifying the
> number of the TX queues. so you can add a new ifb with the command:
> 
> ip link add numtxqs 4 type ifb
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
> drivers/net/ifb.c | 414 ++++++++++++++++++++++++++++++++----------------
> include/linux/if_link.h | 1
> net/core/rtnetlink.c | 3
> 3 files changed, 283 insertions(+), 135 deletions(-)
> 
> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
> index 69c2566..ac04e85 100644
> --- a/drivers/net/ifb.c
> +++ b/drivers/net/ifb.c
> @@ -33,161 +33,157 @@
>  #include <linux/etherdevice.h>
>  #include <linux/init.h>
>  #include <linux/moduleparam.h>
> +#include <linux/wait.h>
> +#include <linux/sched.h>
> +#include <linux/kthread.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <net/ip.h>
>  #include <net/pkt_sched.h>
>  #include <net/net_namespace.h>
>  
> -#define TX_TIMEOUT  (2*HZ)
> -
>  #define TX_Q_LIMIT    32
> +
> +struct ifb_private_q {
> +	struct net_device	*dev;
> +	struct sk_buff_head	rq;
> +	struct sk_buff_head	tq;
> +	wait_queue_head_t	wq;
> +	struct task_struct	*task;
> +	unsigned long		rx_packets;
> +	unsigned long		rx_bytes;
> +	unsigned long		rx_dropped;
> +} ____cacheline_aligned_in_smp;
> +

Could you split this struct in two parts, one used by ifb_xmit(),
one used by the ifb_thread() ?
This would reduce number of cache line ping pongs...


struct ifb_private_q {

	struct net_device	*dev;
	struct task_struct	*task;

/* used by ifb_xmit() */

	struct sk_buff_head	rq;
	unsigned long		rx_packets;
	unsigned long		rx_bytes;
	unsigned long		rx_dropped;
	wait_queue_head_t	wq;

/* used by ifb_thread() */

	struct sk_buff_head	tq ____cacheline_aligned_in_smp;
} ____cacheline_aligned_in_smp;


Or even better, tq could be local to ifb_xmit()

ifb_xmit() would purge it itself, and could use __skb_dequeue() and
other lockless primitives.

The whole :

while ((skb = skb_dequeue(&pq->rq)) != NULL)
	skb_queue_tail(&pq->tq, skb);

could be optimized a lot IMHO, its almost a skb_queue_splice() thing...

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Nov. 11, 2009, 10:57 a.m. UTC | #3
2009/11/11 Eric Dumazet <eric.dumazet@gmail.com>:
>
> Or even better, tq could be local to ifb_xmit()
>
> ifb_xmit() would purge it itself, and could use __skb_dequeue() and
> other lockless primitives.
>
> The whole :
>
> while ((skb = skb_dequeue(&pq->rq)) != NULL)
>        skb_queue_tail(&pq->tq, skb);
>
> could be optimized a lot IMHO, its almost a skb_queue_splice() thing...
>

Thanks, it is a good idea. Any others?
Patrick McHardy Nov. 11, 2009, 3:59 p.m. UTC | #4
Changli Gao wrote:
> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
> index 69c2566..ac04e85 100644
> --- a/drivers/net/ifb.c
> +++ b/drivers/net/ifb.c
> ...
> +/* Number of ifb devices to be set up by this module. */
>  static int numifbs = 2;
> +module_param(numifbs, int, 0444);
> +MODULE_PARM_DESC(numifbs, "Number of ifb devices");
>  
> -static void ri_tasklet(unsigned long dev);
> -static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
> -static int ifb_open(struct net_device *dev);
> -static int ifb_close(struct net_device *dev);
> +/* Number of TX queues per ifb */
> +static int numtxqs = 1;
> +module_param(numtxqs, int, 0444);
> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");

unsigned?

> +		while ((skb = skb_dequeue(&pq->tq)) != NULL) {
> +			u32 from = G_TC_FROM(skb->tc_verd);
> +	
> +			skb->tc_verd = 0;
> +			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
> +			txq->tx_packets++;
> +			txq->tx_bytes +=skb->len;
> +
> +			rcu_read_lock();
> +			skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
> +			if (!skb->dev) {
> +				rcu_read_unlock();
> +				dev_kfree_skb(skb);
> +				txq->tx_dropped++;
> +				break;
> +			}
> +			rcu_read_unlock();
> +			skb->iif = dev->ifindex;

What protects the device from disappearing here and below during
dev_queue_xmit() and netif_rx_ni()?

> +	
> +			if (from & AT_EGRESS) {
> +				dev_queue_xmit(skb);
> +			} else if (from & AT_INGRESS) {
> +				skb_pull(skb, skb->dev->hard_header_len);
> +				netif_rx_ni(skb);
> +			} else
> +				BUG();
>  		}

> +static u32 simple_tx_hashrnd;
> +
> +static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
> +{
> +	u32 addr1, addr2;
> +	u32 hash, ihl;
> +	union {
> +		u16 in16[2];
> +		u32 in32;
> +	} ports;
> +	u8 ip_proto;
> +
> +	if ((hash = skb_rx_queue_recorded(skb))) {
> +		while (hash >= dev->real_num_tx_queues)
> +			hash -= dev->real_num_tx_queues;
> +		return hash;
> +	}
> +
> +	switch (skb->protocol) {
> +	case __constant_htons(ETH_P_IP):
> +		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
> +			ip_proto = ip_hdr(skb)->protocol;
> +		else
> +			ip_proto = 0;

So fragments will get reordered?

> +		addr1 = ip_hdr(skb)->saddr;
> +		addr2 = ip_hdr(skb)->daddr;
> +		ihl = ip_hdr(skb)->ihl << 2;

ip_hdrlen()?

> +		break;
> +	case __constant_htons(ETH_P_IPV6):
> +		ip_proto = ipv6_hdr(skb)->nexthdr;
> +		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
> +		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
> +		ihl = 10;

Where does 10 come from?

> +		break;
> +	default:
> +		return 0;

Perhaps hash on skb->protocol here.

> +	}
> +	if (addr1 > addr2)
> +		swap(addr1, addr2);
> +
> +	switch (ip_proto) {
> +	case IPPROTO_TCP:
> +	case IPPROTO_UDP:
> +	case IPPROTO_DCCP:
> +	case IPPROTO_ESP:
> +	case IPPROTO_AH:
> +	case IPPROTO_SCTP:
> +	case IPPROTO_UDPLITE:
> +		ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
> +		if (ports.in16[0] > ports.in16[1])
> +			swap(ports.in16[0], ports.in16[1]);
> +		break;
> +
> +	default:
> +		ports.in32 = 0;
> +		break;
> +	}
> +
> +	hash = jhash_3words(addr1, addr2, ports.in32,
> +			    simple_tx_hashrnd ^ ip_proto);
> +
> +	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
> +}
> +
> +static int ifb_init(struct net_device *dev)
> +{
> +	struct ifb_private *dp = netdev_priv(dev);
> +	struct ifb_private_q *pq = dp->pq;
> +	int i;
> +
> +	pq = kmalloc(sizeof(*pq) * dev->real_num_tx_queues, GFP_KERNEL);

kcalloc()?

> +	if (pq == NULL)
> +		return -ENOMEM;
> +	dp->pq = pq;
> +
> +	for (i = 0; i < dev->real_num_tx_queues; i++) {
> +		pq[i].dev = dev;
> +		skb_queue_head_init(&pq[i].rq);
> +		skb_queue_head_init(&pq[i].tq);
> +		init_waitqueue_head(&pq[i].wq);
> +		pq[i].rx_packets = 0;
> +		pq[i].rx_bytes = 0;
> +		pq[i].rx_dropped = 0;
> +	}
> +
> +	return 0;
> +}

> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
> +			     unsigned int *num_tx_queues,
> +			     unsigned int *real_num_tx_queues)
> +{
> +	if (tb[IFLA_NTXQ]) {
> +		*num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);

We currently use unsigned ints for the queue number, so please
use an u32 for the attribute as well.

> +		*real_num_tx_queues = *num_tx_queues;
> +	} else {
> +		*num_tx_queues = numtxqs;
> +		*real_num_tx_queues = numtxqs;
> +	}
> +
> +	return 0;
> +}
> +
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Nov. 12, 2009, 3:12 a.m. UTC | #5
On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@trash.net> wrote:
> Changli Gao wrote:
>> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
>> index 69c2566..ac04e85 100644
>> --- a/drivers/net/ifb.c
>> +++ b/drivers/net/ifb.c
>> ...
>> +/* Number of ifb devices to be set up by this module. */
>>  static int numifbs = 2;
>> +module_param(numifbs, int, 0444);
>> +MODULE_PARM_DESC(numifbs, "Number of ifb devices");
>>
>> -static void ri_tasklet(unsigned long dev);
>> -static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
>> -static int ifb_open(struct net_device *dev);
>> -static int ifb_close(struct net_device *dev);
>> +/* Number of TX queues per ifb */
>> +static int numtxqs = 1;
>> +module_param(numtxqs, int, 0444);
>> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>
> unsigned?

Yea, unsigned is better, and I need to check whether its value is
smaller than 1 somewhere. The same will be done with IFLA_NTXQ.

>
>> +             while ((skb = skb_dequeue(&pq->tq)) != NULL) {
>> +                     u32 from = G_TC_FROM(skb->tc_verd);
>> +
>> +                     skb->tc_verd = 0;
>> +                     skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
>> +                     txq->tx_packets++;
>> +                     txq->tx_bytes +=skb->len;
>> +
>> +                     rcu_read_lock();
>> +                     skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
>> +                     if (!skb->dev) {
>> +                             rcu_read_unlock();
>> +                             dev_kfree_skb(skb);
>> +                             txq->tx_dropped++;
>> +                             break;
>> +                     }
>> +                     rcu_read_unlock();
>> +                     skb->iif = dev->ifindex;
>
> What protects the device from disappearing here and below during
> dev_queue_xmit() and netif_rx_ni()?

For dev_queue_xmit(), dev is held by skb->_dst, so there is no
problem. But for netif_rx_ni(), I don't know how to prevent the device
from disappearing, and it seems that all the NIC drivers have this problem.
Maybe there was an assumption about the execution context of
netif_rx() before. Now softirq can't be executed by softirqd, so the
packet receiving path may be interleaved. I don't know how to prevent
it from happening.


>> +
>> +static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
>> +{
>> +     u32 addr1, addr2;
>> +     u32 hash, ihl;
>> +     union {
>> +             u16 in16[2];
>> +             u32 in32;
>> +     } ports;
>> +     u8 ip_proto;
>> +
>> +     if ((hash = skb_rx_queue_recorded(skb))) {
>> +             while (hash >= dev->real_num_tx_queues)
>> +                     hash -= dev->real_num_tx_queues;
>> +             return hash;
>> +     }
>> +
>> +     switch (skb->protocol) {
>> +     case __constant_htons(ETH_P_IP):
>> +             if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
>> +                     ip_proto = ip_hdr(skb)->protocol;
>> +             else
>> +                     ip_proto = 0;
>
> So fragments will get reordered?

Yea. It is OK, as the number of fragments isn't large. If we decide to
avoid this, we have to reassemble them as netfilter does, and that isn't
efficient.

>
>> +             addr1 = ip_hdr(skb)->saddr;
>> +             addr2 = ip_hdr(skb)->daddr;
>> +             ihl = ip_hdr(skb)->ihl << 2;
>
> ip_hdrlen()?

Yea, I'll use ip_hdrlen() instead.

>
>> +             break;
>> +     case __constant_htons(ETH_P_IPV6):
>> +             ip_proto = ipv6_hdr(skb)->nexthdr;
>> +             addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
>> +             addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
>> +             ihl = 10;
>
> Where does 10 come from?

It should be 40. After reviewing IPv6, I found that I need to loop
until finding the right protocol value.

>
>> +             break;
>> +     default:
>> +             return 0;
>
> Perhaps hash on skb->protocol here.

use return skb->protocol % dev->real_num_tx_queues; instead.

>
>> +     }
>> +     if (addr1 > addr2)
>> +             swap(addr1, addr2);
>> +
>> +     switch (ip_proto) {
>> +     case IPPROTO_TCP:
>> +     case IPPROTO_UDP:
>> +     case IPPROTO_DCCP:
>> +     case IPPROTO_ESP:
>> +     case IPPROTO_AH:
>> +     case IPPROTO_SCTP:
>> +     case IPPROTO_UDPLITE:
>> +             ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
>> +             if (ports.in16[0] > ports.in16[1])
>> +                     swap(ports.in16[0], ports.in16[1]);
>> +             break;
>> +
>> +     default:
>> +             ports.in32 = 0;
>> +             break;
>> +     }
>> +
>> +     hash = jhash_3words(addr1, addr2, ports.in32,
>> +                         simple_tx_hashrnd ^ ip_proto);
>> +
>> +     return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
>> +}
>> +
>> +static int ifb_init(struct net_device *dev)
>> +{
>> +     struct ifb_private *dp = netdev_priv(dev);
>> +     struct ifb_private_q *pq = dp->pq;
>> +     int i;
>> +
>> +     pq = kmalloc(sizeof(*pq) * dev->real_num_tx_queues, GFP_KERNEL);
>
> kcalloc()?

OK.

>> +     if (pq == NULL)
>> +             return -ENOMEM;
>> +     dp->pq = pq;
>> +
>> +     for (i = 0; i < dev->real_num_tx_queues; i++) {
>> +             pq[i].dev = dev;
>> +             skb_queue_head_init(&pq[i].rq);
>> +             skb_queue_head_init(&pq[i].tq);
>> +             init_waitqueue_head(&pq[i].wq);
>> +             pq[i].rx_packets = 0;
>> +             pq[i].rx_bytes = 0;
>> +             pq[i].rx_dropped = 0;
>> +     }
>> +
>> +     return 0;
>> +}
>
>> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>> +                          unsigned int *num_tx_queues,
>> +                          unsigned int *real_num_tx_queues)
>> +{
>> +     if (tb[IFLA_NTXQ]) {
>> +             *num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
>
> We currently use unsigned ints for the queue number, so please
> use an u32 for the attribute as well.
>
>> +             *real_num_tx_queues = *num_tx_queues;
>> +     } else {
>> +             *num_tx_queues = numtxqs;
>> +             *real_num_tx_queues = numtxqs;
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>
        u16                     (*ndo_select_queue)(struct net_device *dev,
                                                    struct sk_buff *skb);
use u16 as the return value so ..., and I think u16 is big enough. If
you insist on this, I'll use u32 instead.
Jarek Poplawski Nov. 12, 2009, 8:52 a.m. UTC | #6
On 12-11-2009 04:12, Changli Gao wrote:
> On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@trash.net> wrote:
...
>> What protects the device from disappearing here and below during
>> dev_queue_xmit() and netif_rx_ni()?
> 
> For dev_queue_xmit(), dev is holded by skb->_dst, so there is no
> problem. But for netif_rx_ni(), I don't know how to prevent the device
> disappearing, and it seems that all the NIC drivers have this problem.
> Maybe there was the assumption about the execution context of
> netif_rx() before. Now softirq can't be executed by softirqd, so the
> packet receiving path maybe interleaved. I don't know how to prevent
> it happening.

Btw, maybe I miss something, but ifb with act_mirred is for many users
on the fast path. I'm not sure you proved enough why moving it to the
process context and additional multiqueue functionality (which seems
"pretty cool", but probably mostly for testing purposes) don't harm
its main use?

Regards,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Nov. 12, 2009, 9:32 a.m. UTC | #7
On Thu, Nov 12, 2009 at 4:52 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> On 12-11-2009 04:12, Changli Gao wrote:
>> On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@trash.net> wrote:
> ...
>>> What protects the device from disappearing here and below during
>>> dev_queue_xmit() and netif_rx_ni()?
>>
>> For dev_queue_xmit(), dev is holded by skb->_dst, so there is no
>> problem.

Oh, I'm wrong, and not all the skbs using dev_queue_xmit() have a valid
dst, such as AF_PACKET. As skb->dev doesn't own dev, I think it is
AF_PACKET's fault. Too bad!

>> But for netif_rx_ni(), I don't know how to prevent the device
>> disappearing, and it seems that all the NIC drivers have this problem.
>> Maybe there was the assumption about the execution context of
>> netif_rx() before. Now softirq can't be executed by softirqd, so the
>> packet receiving path maybe interleaved. I don't know how to prevent
>> it happening.
>
> Btw, maybe I miss something, but ifb with act_mirred is for many users
> on the fast path. I'm not sure you proved enough why moving it to the
> process context and additional multiqueue functionality (which seems
> "pretty cool", but probably mostly for testing purposes) don't harm
> its main use?

Because a process is more flexible than other activities, and you can
specify its priority and bind it to some CPU, and so on. If you have an
SMP system and a NIC which doesn't support MQ, when you use it as a
firewall you'll find nearly all the work is on one CPU. As throughput
increases, that CPU will be saturated and no more throughput can be gained,
though the other CPUs are still idle. With IFB-MQ, you can distribute
the packets among all the CPUs, and achieve good throughput.

I know you worry about the latency. From my experiments, it is just OK.
Not much extra latency is added.
Patrick McHardy Nov. 12, 2009, 3:10 p.m. UTC | #8
Changli Gao wrote:
> On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@trash.net> wrote:
>>> +static int numtxqs = 1;
>>> +module_param(numtxqs, int, 0444);
>>> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>> unsigned?
> 
> Yea, unsigned is better, and I need to check whether its value is
> smaller than 1 somewhere. The same will be done with IFLA_NTXQ.

Good point.

>>> +                     rcu_read_lock();
>>> +                     skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
>>> +                     if (!skb->dev) {
>>> +                             rcu_read_unlock();
>>> +                             dev_kfree_skb(skb);
>>> +                             txq->tx_dropped++;
>>> +                             break;
>>> +                     }
>>> +                     rcu_read_unlock();
>>> +                     skb->iif = dev->ifindex;
>> What protects the device from disappearing here and below during
>> dev_queue_xmit() and netif_rx_ni()?
> 
> For dev_queue_xmit(), dev is holded by skb->_dst, so there is no
> problem. But for netif_rx_ni(), I don't know how to prevent the device
> disappearing, and it seems that all the NIC drivers have this problem.
> Maybe there was the assumption about the execution context of
> netif_rx() before. Now softirq can't be executed by softirqd, so the
> packet receiving path maybe interleaved. I don't know how to prevent
> it happening.

You can either take a reference or move the rcu_read_unlock()
after the skb submission.

>>> +             break;
>>> +     case __constant_htons(ETH_P_IPV6):
>>> +             ip_proto = ipv6_hdr(skb)->nexthdr;
>>> +             addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
>>> +             addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
>>> +             ihl = 10;
>> Where does 10 come from?
> 
> It should be 40, after reviewing IPV6, I found that I need to loop
> until finding the right protocol value.

There is a helper for this (ipv6_skip_exthdr).

>>> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>>> +                          unsigned int *num_tx_queues,
>>> +                          unsigned int *real_num_tx_queues)
>>> +{
>>> +     if (tb[IFLA_NTXQ]) {
>>> +             *num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
>> We currently use unsigned ints for the queue number, so please
>> use an u32 for the attribute as well.
>>
>>> +             *real_num_tx_queues = *num_tx_queues;
>>> +     } else {
>>> +             *num_tx_queues = numtxqs;
>>> +             *real_num_tx_queues = numtxqs;
>>> +     }
>>> +
>>> +     return 0;
>>> +}
>>> +
>         u16                     (*ndo_select_queue)(struct net_device *dev,
>                                                     struct sk_buff *skb);
> use u16 as the return value so ..., and I think u16 is big enough. If
> you insist on this, I'll use u32 instead.

Well, get_tx_queues() uses unsigned int, as does struct net_device.
I agree that it probably won't ever be needed, but there's no harm
in using the correct type, the attribute encoding doesn't even need
more room.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Nov. 13, 2009, 1:28 a.m. UTC | #9
On Thu, Nov 12, 2009 at 11:10 PM, Patrick McHardy <kaber@trash.net> wrote:
> Changli Gao wrote:
>>>> +                     rcu_read_lock();
>>>> +                     skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
>>>> +                     if (!skb->dev) {
>>>> +                             rcu_read_unlock();
>>>> +                             dev_kfree_skb(skb);
>>>> +                             txq->tx_dropped++;
>>>> +                             break;
>>>> +                     }
>>>> +                     rcu_read_unlock();
>>>> +                     skb->iif = dev->ifindex;
>>> What protects the device from disappearing here and below during
>>> dev_queue_xmit() and netif_rx_ni()?
>>
>> For dev_queue_xmit(), dev is holded by skb->_dst, so there is no
>> problem. But for netif_rx_ni(), I don't know how to prevent the device
>> disappearing, and it seems that all the NIC drivers have this problem.
>> Maybe there was the assumption about the execution context of
>> netif_rx() before. Now softirq can't be executed by softirqd, so the
>> packet receiving path maybe interleaved. I don't know how to prevent
>> it happening.
>
> You can either take a reference of move the rcu_read_unlock()
> after the skb submission.

good idea!

>
>>>> +             break;
>>>> +     case __constant_htons(ETH_P_IPV6):
>>>> +             ip_proto = ipv6_hdr(skb)->nexthdr;
>>>> +             addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
>>>> +             addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
>>>> +             ihl = 10;
>>> Where does 10 come from?
>>
>> It should be 40, after reviewing IPV6, I found that I need to loop
>> until finding the right protocol value.
>
> There is a helper for this (ipv6_skip_exthdr).
>

It is a little late. Thanks all the same, and I'll use it.

>>>> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>>>> +                          unsigned int *num_tx_queues,
>>>> +                          unsigned int *real_num_tx_queues)
>>>> +{
>>>> +     if (tb[IFLA_NTXQ]) {
>>>> +             *num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
>>> We currently use unsigned ints for the queue number, so please
>>> use an u32 for the attribute as well.
>>>
>>>> +             *real_num_tx_queues = *num_tx_queues;
>>>> +     } else {
>>>> +             *num_tx_queues = numtxqs;
>>>> +             *real_num_tx_queues = numtxqs;
>>>> +     }
>>>> +
>>>> +     return 0;
>>>> +}
>>>> +
>>         u16                     (*ndo_select_queue)(struct net_device *dev,
>>                                                     struct sk_buff *skb);
>> use u16 as the return value so ..., and I think u16 is big enough. If
>> you insist on this, I'll use u32 instead.
>
> Well, get_tx_queues() uses unsigned int, as does struct net_device.
> I agree that it probably won't ever be needed, but there's no harm
> in using the correct type, the attribute encoding doesn't even need
> more room.
>

So, in the last patch, I use it instead. :)
diff mbox

Patch

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 69c2566..ac04e85 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -33,161 +33,157 @@ 
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
 #include <net/pkt_sched.h>
 #include <net/net_namespace.h>
 
-#define TX_TIMEOUT  (2*HZ)
-
 #define TX_Q_LIMIT    32
+
+struct ifb_private_q {
+	struct net_device	*dev;
+	struct sk_buff_head	rq;
+	struct sk_buff_head	tq;
+	wait_queue_head_t	wq;
+	struct task_struct	*task;
+	unsigned long		rx_packets;
+	unsigned long		rx_bytes;
+	unsigned long		rx_dropped;
+} ____cacheline_aligned_in_smp;
+
 struct ifb_private {
-	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
-	/* mostly debug stats leave in for now */
-	unsigned long   st_task_enter; /* tasklet entered */
-	unsigned long   st_txq_refl_try; /* transmit queue refill attempt */
-	unsigned long   st_rxq_enter; /* receive queue entered */
-	unsigned long   st_rx2tx_tran; /* receive to trasmit transfers */
-	unsigned long   st_rxq_notenter; /*receiveQ not entered, resched */
-	unsigned long   st_rx_frm_egr; /* received from egress path */
-	unsigned long   st_rx_frm_ing; /* received from ingress path */
-	unsigned long   st_rxq_check;
-	unsigned long   st_rxq_rsch;
-	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
+	struct ifb_private_q	*pq;
 };
 
+/* Number of ifb devices to be set up by this module. */
 static int numifbs = 2;
+module_param(numifbs, int, 0444);
+MODULE_PARM_DESC(numifbs, "Number of ifb devices");
 
-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
+/* Number of TX queues per ifb */
+static int numtxqs = 1;
+module_param(numtxqs, int, 0444);
+MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
 
-static void ri_tasklet(unsigned long dev)
+static int ifb_thread(void *priv)
 {
-
-	struct net_device *_dev = (struct net_device *)dev;
-	struct ifb_private *dp = netdev_priv(_dev);
-	struct net_device_stats *stats = &_dev->stats;
-	struct netdev_queue *txq;
+	struct ifb_private_q *pq = priv;
+	struct net_device *dev = pq->dev;
+	int num = pq - ((struct ifb_private *)netdev_priv(dev))->pq;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 	struct sk_buff *skb;
-
-	txq = netdev_get_tx_queue(_dev, 0);
-	dp->st_task_enter++;
-	if ((skb = skb_peek(&dp->tq)) == NULL) {
-		dp->st_txq_refl_try++;
-		if (__netif_tx_trylock(txq)) {
-			dp->st_rxq_enter++;
-			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
-				skb_queue_tail(&dp->tq, skb);
-				dp->st_rx2tx_tran++;
-			}
-			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			dp->st_rxq_notenter++;
-			goto resched;
+	DEFINE_WAIT(wait);
+
+	while (1) {
+		/* move skb from rq to tq */
+		while (1) {
+			prepare_to_wait(&pq->wq, &wait, TASK_UNINTERRUPTIBLE);
+			__netif_tx_lock_bh(txq);
+			while ((skb = skb_dequeue(&pq->rq)) != NULL)
+				skb_queue_tail(&pq->tq, skb);
+			if (netif_queue_stopped(dev))
+				netif_wake_queue(dev);
+			__netif_tx_unlock_bh(txq);
+			if (kthread_should_stop() || !skb_queue_empty(&pq->tq))
+				break;
+			schedule();
 		}
-	}
-
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
-		u32 from = G_TC_FROM(skb->tc_verd);
-
-		skb->tc_verd = 0;
-		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
-		stats->tx_packets++;
-		stats->tx_bytes +=skb->len;
-
-		rcu_read_lock();
-		skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
-		if (!skb->dev) {
-			rcu_read_unlock();
-			dev_kfree_skb(skb);
-			stats->tx_dropped++;
+		finish_wait(&pq->wq, &wait);
+		if (kthread_should_stop())
 			break;
+		if (need_resched())
+			schedule();
+
+		/* transfer packets */
+		while ((skb = skb_dequeue(&pq->tq)) != NULL) {
+			u32 from = G_TC_FROM(skb->tc_verd);
+	
+			skb->tc_verd = 0;
+			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+			txq->tx_packets++;
+			txq->tx_bytes +=skb->len;
+
+			rcu_read_lock();
+			skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
+			if (!skb->dev) {
+				rcu_read_unlock();
+				dev_kfree_skb(skb);
+				txq->tx_dropped++;
+				break;
+			}
+			rcu_read_unlock();
+			skb->iif = dev->ifindex;
+	
+			if (from & AT_EGRESS) {
+				dev_queue_xmit(skb);
+			} else if (from & AT_INGRESS) {
+				skb_pull(skb, skb->dev->hard_header_len);
+				netif_rx_ni(skb);
+			} else
+				BUG();
 		}
-		rcu_read_unlock();
-		skb->iif = _dev->ifindex;
-
-		if (from & AT_EGRESS) {
-			dp->st_rx_frm_egr++;
-			dev_queue_xmit(skb);
-		} else if (from & AT_INGRESS) {
-			dp->st_rx_frm_ing++;
-			skb_pull(skb, skb->dev->hard_header_len);
-			netif_rx(skb);
-		} else
-			BUG();
-	}
-
-	if (__netif_tx_trylock(txq)) {
-		dp->st_rxq_check++;
-		if ((skb = skb_peek(&dp->rq)) == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			dp->st_rxq_rsch++;
-			__netif_tx_unlock(txq);
-			goto resched;
-		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
 	}
 
+	return 0;
 }
 
-static const struct net_device_ops ifb_netdev_ops = {
-	.ndo_open	= ifb_open,
-	.ndo_stop	= ifb_close,
-	.ndo_start_xmit	= ifb_xmit,
-	.ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
+struct net_device_stats* ifb_get_stats(struct net_device *dev)
 {
-	/* Initialize the device structure. */
-	dev->destructor = free_netdev;
-	dev->netdev_ops = &ifb_netdev_ops;
+	struct net_device_stats *stats = &dev->stats;
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	struct netdev_queue *txq;
+	int i;
+	unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
+	unsigned long tx_packets = 0, tx_bytes = 0, tx_dropped = 0;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		rx_packets += pq[i].rx_packets;
+		rx_bytes += pq[i].rx_bytes;
+		rx_dropped += pq[i].rx_dropped;
+		txq = netdev_get_tx_queue(dev, i);
+		tx_packets += txq->tx_packets;
+		tx_bytes += txq->tx_bytes;
+		tx_dropped += txq->tx_dropped;
+	}
 
-	/* Fill in device structure with ethernet-generic values. */
-	ether_setup(dev);
-	dev->tx_queue_len = TX_Q_LIMIT;
+	stats->rx_packets = rx_packets;
+	stats->rx_bytes = rx_bytes;
+	stats->rx_dropped = rx_dropped;
+	stats->tx_packets = tx_packets;
+	stats->tx_bytes = tx_bytes;
+	stats->tx_dropped = tx_dropped;
 
-	dev->flags |= IFF_NOARP;
-	dev->flags &= ~IFF_MULTICAST;
-	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
-	random_ether_addr(dev->dev_addr);
+	return stats;
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
 	u32 from = G_TC_FROM(skb->tc_verd);
+	int num = skb_get_queue_mapping(skb);
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq + num;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 
-	stats->rx_packets++;
-	stats->rx_bytes+=skb->len;
+	pq->rx_packets++;
+	pq->rx_bytes += skb->len;
 
 	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
 		dev_kfree_skb(skb);
-		stats->rx_dropped++;
+		pq->rx_dropped++;
 		return NETDEV_TX_OK;
 	}
 
-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len) {
+	txq->trans_start = jiffies;
+	skb_queue_tail(&pq->rq, skb);
+	if (skb_queue_len(&pq->rq) >= dev->tx_queue_len)
 		netif_stop_queue(dev);
-	}
-
-	dev->trans_start = jiffies;
-	skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
+	if (skb_queue_len(&pq->rq) == 1)
+		wake_up(&pq->wq);
 
 	return NETDEV_TX_OK;
 }
@@ -195,26 +191,162 @@  static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 static int ifb_close(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
 
-	tasklet_kill(&dp->ifb_tasklet);
 	netif_stop_queue(dev);
-	skb_queue_purge(&dp->rq);
-	skb_queue_purge(&dp->tq);
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		kthread_stop(pq[i].task);
+		skb_queue_purge(&pq[i].tq);
+		skb_queue_purge(&pq[i].rq);
+	}
+
 	return 0;
 }
 
 static int ifb_open(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
-
-	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
-	skb_queue_head_init(&dp->rq);
-	skb_queue_head_init(&dp->tq);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
+	
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].task = kthread_run(ifb_thread, &pq[i], "%s/%d", dev->name,
+					 i);
+		if (IS_ERR(pq[i].task)) {
+			int err = PTR_ERR(pq[i].task);
+			while (--i >= 0)
+				kthread_stop(pq[i].task);
+			return err;
+		}
+	}
 	netif_start_queue(dev);
 
 	return 0;
 }
 
+static u32 simple_tx_hashrnd;
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	u32 addr1, addr2;
+	u32 hash, ihl;
+	union {
+		u16 in16[2];
+		u32 in32;
+	} ports;
+	u8 ip_proto;
+
+	if (skb_rx_queue_recorded(skb)) {
+		hash = skb_get_rx_queue(skb);
+		while (hash >= dev->real_num_tx_queues)
+			hash -= dev->real_num_tx_queues;
+		return hash;
+	}
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+			ip_proto = ip_hdr(skb)->protocol;
+		else
+			ip_proto = 0;
+		addr1 = ip_hdr(skb)->saddr;
+		addr2 = ip_hdr(skb)->daddr;
+		ihl = ip_hdr(skb)->ihl << 2;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		ip_proto = ipv6_hdr(skb)->nexthdr;
+		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+		/* transport header starts 40 bytes in; ihl is in bytes here */
+		ihl = 40;
+		break;
+	default:
+		return 0;
+	}
+	if (addr1 > addr2)
+		swap(addr1, addr2);
+
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
+		if (ports.in16[0] > ports.in16[1])
+			swap(ports.in16[0], ports.in16[1]);
+		break;
+
+	default:
+		ports.in32 = 0;
+		break;
+	}
+
+	hash = jhash_3words(addr1, addr2, ports.in32,
+			    simple_tx_hashrnd ^ ip_proto);
+
+	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+static int ifb_init(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq;
+	int i;
+
+	pq = kmalloc(sizeof(*pq) * dev->real_num_tx_queues, GFP_KERNEL);
+	if (pq == NULL)
+		return -ENOMEM;
+	dp->pq = pq;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].dev = dev;
+		skb_queue_head_init(&pq[i].rq);
+		skb_queue_head_init(&pq[i].tq);
+		init_waitqueue_head(&pq[i].wq);
+		pq[i].rx_packets = 0;
+		pq[i].rx_bytes = 0;
+		pq[i].rx_dropped = 0;
+	}
+
+	return 0;
+}
+
+static void ifb_uninit(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+
+	kfree(dp->pq);
+}
+
+static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_init		= ifb_init,
+	.ndo_uninit		= ifb_uninit,
+	.ndo_open		= ifb_open,
+	.ndo_stop		= ifb_close,
+	.ndo_start_xmit		= ifb_xmit,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_select_queue	= ifb_select_queue,
+	.ndo_get_stats		= ifb_get_stats,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+	/* Initialize the device structure. */
+	dev->destructor = free_netdev;
+	dev->netdev_ops = &ifb_netdev_ops;
+
+	/* Fill in device structure with ethernet-generic values. */
+	ether_setup(dev);
+	dev->tx_queue_len = TX_Q_LIMIT;
+
+	dev->flags |= IFF_NOARP;
+	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	random_ether_addr(dev->dev_addr);
+}
+
 static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {
@@ -226,24 +358,36 @@  static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
+			     unsigned int *num_tx_queues,
+			     unsigned int *real_num_tx_queues)
+{
+	if (tb[IFLA_NTXQ]) {
+		*num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
+		if (*num_tx_queues == 0)
+			return -EINVAL;
+		*real_num_tx_queues = *num_tx_queues;
+	} else {
+		*num_tx_queues = numtxqs;
+		*real_num_tx_queues = numtxqs;
+	}
+
+	return 0;
+}
+
 static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.kind		= "ifb",
-	.priv_size	= sizeof(struct ifb_private),
 	.setup		= ifb_setup,
 	.validate	= ifb_validate,
+	.get_tx_queues	= ifb_get_tx_queues,
+	.priv_size	= sizeof(struct ifb_private),
 };
 
-/* Number of ifb devices to be set up by this module. */
-module_param(numifbs, int, 0);
-MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-
 static int __init ifb_init_one(int index)
 {
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
-				 "ifb%d", ifb_setup);
+	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
+				  ifb_setup, numtxqs);
 
 	if (!dev_ifb)
 		return -ENOMEM;
@@ -268,9 +412,9 @@  static int __init ifb_init_module(void)
 {
 	int i, err;
 
+	get_random_bytes(&simple_tx_hashrnd, sizeof(simple_tx_hashrnd));
 	rtnl_lock();
 	err = __rtnl_link_register(&ifb_link_ops);
-
 	for (i = 0; i < numifbs && !err; i++)
 		err = ifb_init_one(i);
 	if (err)
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1d3b242..99da411 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -78,6 +78,7 @@  enum {
 #define IFLA_LINKINFO IFLA_LINKINFO
 	IFLA_NET_NS_PID,
 	IFLA_IFALIAS,
+	IFLA_NTXQ,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33148a5..7b94fd7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -597,6 +597,7 @@  static inline size_t if_nlmsg_size(const struct net_device *dev)
 	       + nla_total_size(4) /* IFLA_MASTER */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
+	       + nla_total_size(2) /* IFLA_NTXQ */
 	       + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
 }
 
@@ -627,6 +628,7 @@  static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		   netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
 	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
 	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	NLA_PUT_U16(skb, IFLA_NTXQ, dev->real_num_tx_queues);
 
 	if (dev->ifindex != dev->iflink)
 		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -725,6 +727,7 @@  const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	[IFLA_NTXQ]		= { .type = NLA_U16 },
 };
 EXPORT_SYMBOL(ifla_policy);