Message ID | 1371620452-49349-2-git-send-email-jasowang@redhat.com |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote: > Currently, we use kcalloc to allocate rx/tx queues for a net device which could > be easily lead to a high order memory allocation request when initializing a > multiqueue net device. We can simply avoid this by switching to use flex array > which always allocate at order zero. > > Signed-off-by: Jason Wang <jasowang@redhat.com> > --- > include/linux/netdevice.h | 13 ++++++---- > net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------ > net/core/net-sysfs.c | 15 +++++++---- > 3 files changed, 58 insertions(+), 27 deletions(-) > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 09b4188..c0b5d04 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -32,6 +32,7 @@ > #include <linux/atomic.h> > #include <asm/cache.h> > #include <asm/byteorder.h> > +#include <linux/flex_array.h> > > #include <linux/percpu.h> > #include <linux/rculist.h> > @@ -1230,7 +1231,7 @@ struct net_device { > > > #ifdef CONFIG_RPS > - struct netdev_rx_queue *_rx; > + struct flex_array *_rx; > > /* Number of RX queues allocated at register_netdev() time */ > unsigned int num_rx_queues; > @@ -1250,7 +1251,7 @@ struct net_device { > /* > * Cache lines mostly used on transmit path > */ > - struct netdev_queue *_tx ____cacheline_aligned_in_smp; > + struct flex_array *_tx ____cacheline_aligned_in_smp; > Using flex_array and adding overhead in this super critical part of network stack, only to avoid order-1 allocations done in GFP_KERNEL context is simply insane. We can revisit this in 2050 if we ever need order-4 allocations or so, and still use 4K pages. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/19/2013 02:31 PM, Eric Dumazet wrote: > On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote: >> Currently, we use kcalloc to allocate rx/tx queues for a net device which could >> be easily lead to a high order memory allocation request when initializing a >> multiqueue net device. We can simply avoid this by switching to use flex array >> which always allocate at order zero. >> >> Signed-off-by: Jason Wang <jasowang@redhat.com> >> --- >> include/linux/netdevice.h | 13 ++++++---- >> net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------ >> net/core/net-sysfs.c | 15 +++++++---- >> 3 files changed, 58 insertions(+), 27 deletions(-) >> >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >> index 09b4188..c0b5d04 100644 >> --- a/include/linux/netdevice.h >> +++ b/include/linux/netdevice.h >> @@ -32,6 +32,7 @@ >> #include <linux/atomic.h> >> #include <asm/cache.h> >> #include <asm/byteorder.h> >> +#include <linux/flex_array.h> >> >> #include <linux/percpu.h> >> #include <linux/rculist.h> >> @@ -1230,7 +1231,7 @@ struct net_device { >> >> >> #ifdef CONFIG_RPS >> - struct netdev_rx_queue *_rx; >> + struct flex_array *_rx; >> >> /* Number of RX queues allocated at register_netdev() time */ >> unsigned int num_rx_queues; >> @@ -1250,7 +1251,7 @@ struct net_device { >> /* >> * Cache lines mostly used on transmit path >> */ >> - struct netdev_queue *_tx ____cacheline_aligned_in_smp; >> + struct flex_array *_tx ____cacheline_aligned_in_smp; >> > Using flex_array and adding overhead in this super critical part of > network stack, only to avoid order-1 allocations done in GFP_KERNEL > context is simply insane. Yes, and I also miss the fact of GFP_KERNEL allocation. > We can revisit this in 2050 if we ever need order-4 allocations or so, > and still use 4K pages. > > Will drop this patch, thanks. 
-- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jun 18, 2013 at 11:31:58PM -0700, Eric Dumazet wrote: > On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote: > > Currently, we use kcalloc to allocate rx/tx queues for a net device which could > > be easily lead to a high order memory allocation request when initializing a > > multiqueue net device. We can simply avoid this by switching to use flex array > > which always allocate at order zero. > > > > Signed-off-by: Jason Wang <jasowang@redhat.com> > > --- > > include/linux/netdevice.h | 13 ++++++---- > > net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------ > > net/core/net-sysfs.c | 15 +++++++---- > > 3 files changed, 58 insertions(+), 27 deletions(-) > > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > > index 09b4188..c0b5d04 100644 > > --- a/include/linux/netdevice.h > > +++ b/include/linux/netdevice.h > > @@ -32,6 +32,7 @@ > > #include <linux/atomic.h> > > #include <asm/cache.h> > > #include <asm/byteorder.h> > > +#include <linux/flex_array.h> > > > > #include <linux/percpu.h> > > #include <linux/rculist.h> > > @@ -1230,7 +1231,7 @@ struct net_device { > > > > > > #ifdef CONFIG_RPS > > - struct netdev_rx_queue *_rx; > > + struct flex_array *_rx; > > > > /* Number of RX queues allocated at register_netdev() time */ > > unsigned int num_rx_queues; > > @@ -1250,7 +1251,7 @@ struct net_device { > > /* > > * Cache lines mostly used on transmit path > > */ > > - struct netdev_queue *_tx ____cacheline_aligned_in_smp; > > + struct flex_array *_tx ____cacheline_aligned_in_smp; > > > > Using flex_array and adding overhead in this super critical part of > network stack, only to avoid order-1 allocations done in GFP_KERNEL > context is simply insane. > > We can revisit this in 2050 if we ever need order-4 allocations or so, > and still use 4K pages. > > Well KVM supports up to 160 VCPUs on x86. 
Creating a queue per CPU is very reasonable, and assuming cache line size of 64 bytes, netdev_queue seems to be 320 bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation. I agree most people don't have such systems yet, but they do exist. We can cut the size of netdev_queue, moving out kobj - which does not seem to be used on data path to a separate structure. It's 64 byte in size so exactly 256 bytes. That will get us an order-3 allocation, and there's some padding there so we won't immediately increase it the moment we add some fields. Comments on this idea? Instead of always using a flex array, we could have + struct netdev_queue *_tx; /* Used with small # of queues */ +#ifdef CONFIG_NETDEV_HUGE_NUMBER_OR_QUEUES + struct flex_array *_tx_large; /* Used with large # of queues */ +#endif And fix wrappers to use _tx if not NULL, otherwise _tx_large. If configured in, it's an extra branch on data path but probably less costly than the extra indirection.
On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote: > Well KVM supports up to 160 VCPUs on x86. > > Creating a queue per CPU is very reasonable, and > assuming cache line size of 64 bytes, netdev_queue seems to be 320 > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation. > I agree most people don't have such systems yet, but > they do exist. Even so, it will just work, like a fork() is likely to work, even if a process needs order-1 allocation for kernel stack. Some drivers still use order-10 allocations with kmalloc(), and nobody complained yet. We had complains with mlx4 driver lately only bcause kmalloc() now gives a warning if allocations above MAX_ORDER are attempted. Having a single pointer means that we can : - Attempts a regular kmalloc() call, it will work most of the time. - fallback to vmalloc() _if_ kmalloc() failed. Frankly, if you want one tx queue per cpu, I would rather use NETIF_F_LLTX, like some other virtual devices. This way, you can have real per cpu memory, with proper NUMA affinity. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 19, 2013 at 02:56:03AM -0700, Eric Dumazet wrote: > On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote: > > > Well KVM supports up to 160 VCPUs on x86. > > > > Creating a queue per CPU is very reasonable, and > > assuming cache line size of 64 bytes, netdev_queue seems to be 320 > > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation. > > I agree most people don't have such systems yet, but > > they do exist. > > Even so, it will just work, like a fork() is likely to work, even if a > process needs order-1 allocation for kernel stack. > Some drivers still use order-10 allocations with kmalloc(), and nobody > complained yet. > > We had complains with mlx4 driver lately only bcause kmalloc() now gives > a warning if allocations above MAX_ORDER are attempted. > > Having a single pointer means that we can : > > - Attempts a regular kmalloc() call, it will work most of the time. > - fallback to vmalloc() _if_ kmalloc() failed. Most drivers create devices at boot time, when this is more likely to work. What makes tun (and macvlan) a bit special is that the device is created from userspace. Virt setups create/destroy them all the time. > > Frankly, if you want one tx queue per cpu, I would rather use > NETIF_F_LLTX, like some other virtual devices. > > This way, you can have real per cpu memory, with proper NUMA affinity. > Hmm good point, worth looking at. Thanks,
On Wed, Jun 19, 2013 at 02:56:03AM -0700, Eric Dumazet wrote: > On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote: > > > Well KVM supports up to 160 VCPUs on x86. > > > > Creating a queue per CPU is very reasonable, and > > assuming cache line size of 64 bytes, netdev_queue seems to be 320 > > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation. > > I agree most people don't have such systems yet, but > > they do exist. > > Even so, it will just work, like a fork() is likely to work, even if a > process needs order-1 allocation for kernel stack. > > Some drivers still use order-10 allocations with kmalloc(), and nobody > complained yet. > > We had complains with mlx4 driver lately only bcause kmalloc() now gives > a warning if allocations above MAX_ORDER are attempted. > > Having a single pointer means that we can : > > - Attempts a regular kmalloc() call, it will work most of the time. > - fallback to vmalloc() _if_ kmalloc() failed. That's a good trick too - vmalloc memory is a bit slower on x86 since it's not using a huge page, but that's only when we have lots of CPUs/queues... Short term - how about switching to vmalloc if > 32 queues? > Frankly, if you want one tx queue per cpu, I would rather use > NETIF_F_LLTX, like some other virtual devices. > > This way, you can have real per cpu memory, with proper NUMA affinity. > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/19/2013 05:56 PM, Eric Dumazet wrote: > On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote: > >> Well KVM supports up to 160 VCPUs on x86. >> >> Creating a queue per CPU is very reasonable, and >> assuming cache line size of 64 bytes, netdev_queue seems to be 320 >> bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation. >> I agree most people don't have such systems yet, but >> they do exist. > Even so, it will just work, like a fork() is likely to work, even if a > process needs order-1 allocation for kernel stack. > > Some drivers still use order-10 allocations with kmalloc(), and nobody > complained yet. > > We had complains with mlx4 driver lately only bcause kmalloc() now gives > a warning if allocations above MAX_ORDER are attempted. > > Having a single pointer means that we can : > > - Attempts a regular kmalloc() call, it will work most of the time. > - fallback to vmalloc() _if_ kmalloc() failed. > > Frankly, if you want one tx queue per cpu, I would rather use > NETIF_F_LLTX, like some other virtual devices. A drawback of NETIF_F_LLTX is that we may contend on qdisc lock especially when we have a huge number of tx queues. > > This way, you can have real per cpu memory, with proper NUMA affinity. > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2013-06-20 at 13:14 +0800, Jason Wang wrote: > A drawback of NETIF_F_LLTX is that we may contend on qdisc lock > especially when we have a huge number of tx queues. For your information, the loopback driver is LLTX, and there is no qdisc on it unless you specifically add one. Once you add a qdisc on a device, you hit the typical contention on a spinlock. But it's hard to design a parallel qdisc. So far nobody did that. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 09b4188..c0b5d04 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -32,6 +32,7 @@ #include <linux/atomic.h> #include <asm/cache.h> #include <asm/byteorder.h> +#include <linux/flex_array.h> #include <linux/percpu.h> #include <linux/rculist.h> @@ -1230,7 +1231,7 @@ struct net_device { #ifdef CONFIG_RPS - struct netdev_rx_queue *_rx; + struct flex_array *_rx; /* Number of RX queues allocated at register_netdev() time */ unsigned int num_rx_queues; @@ -1250,7 +1251,7 @@ struct net_device { /* * Cache lines mostly used on transmit path */ - struct netdev_queue *_tx ____cacheline_aligned_in_smp; + struct flex_array *_tx ____cacheline_aligned_in_smp; /* Number of TX queues allocated at alloc_netdev_mq() time */ unsigned int num_tx_queues; @@ -1434,7 +1435,7 @@ static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) { - return &dev->_tx[index]; + return flex_array_get(dev->_tx, index); } static inline void netdev_for_each_tx_queue(struct net_device *dev, @@ -1445,8 +1446,10 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, { unsigned int i; - for (i = 0; i < dev->num_tx_queues; i++) - f(dev, &dev->_tx[i], arg); + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = flex_array_get(dev->_tx, i); + f(dev, txq, arg); + } } extern struct netdev_queue *netdev_pick_tx(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index fa007db..3a4ecb1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -130,6 +130,7 @@ #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> +#include <linux/flex_array.h> #include "net-sysfs.h" @@ -2902,7 +2903,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (rxq_index == skb_get_rx_queue(skb)) goto out; - rxqueue = dev->_rx + rxq_index; + rxqueue = flex_array_get(dev->_rx, rxq_index); flow_table = 
rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; @@ -2950,9 +2951,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, dev->name, index, dev->real_num_rx_queues); goto done; } - rxqueue = dev->_rx + index; + rxqueue = flex_array_get(dev->_rx, index); } else - rxqueue = dev->_rx; + rxqueue = flex_array_get(dev->_rx, 0); map = rcu_dereference(rxqueue->rps_map); if (map) { @@ -3038,7 +3039,7 @@ done: bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { - struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct netdev_rx_queue *rxqueue = flex_array_get(dev->_rx, rxq_index); struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; @@ -5223,18 +5224,31 @@ EXPORT_SYMBOL(netif_stacked_transfer_operstate); static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; - struct netdev_rx_queue *rx; + struct flex_array *rx; + int err; BUG_ON(count < 1); - rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) + rx = flex_array_alloc(sizeof(struct netdev_rx_queue), count, + GFP_KERNEL | __GFP_ZERO); + if (!rx) { + pr_err("netdev: Unable to allocate flex array for rx queues\n"); return -ENOMEM; + } + + err = flex_array_prealloc(rx, 0, count, GFP_KERNEL | __GFP_ZERO); + if (err) { + pr_err("netdev, Unable to prealloc %u rx qeueus\n", count); + flex_array_free(rx); + return err; + } dev->_rx = rx; - for (i = 0; i < count; i++) - rx[i].dev = dev; + for (i = 0; i < count; i++) { + struct netdev_rx_queue *rxq = flex_array_get(rx, i); + rxq->dev = dev; + } return 0; } #endif @@ -5256,13 +5270,24 @@ static void netdev_init_one_queue(struct net_device *dev, static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; - struct netdev_queue *tx; + struct flex_array *tx; + int err; BUG_ON(count < 1); - tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); - if 
(!tx) + tx = flex_array_alloc(sizeof(struct netdev_queue), count, + GFP_KERNEL | __GFP_ZERO); + if (!tx) { + pr_err("netdev: Unable to allocate flex array for tx queues\n"); return -ENOMEM; + } + + err = flex_array_prealloc(tx, 0, count, GFP_KERNEL | __GFP_ZERO); + if (err) { + pr_err("netdev, Unable to prealloc %u tx qeueus\n", count); + flex_array_free(tx); + return err; + } dev->_tx = tx; @@ -5811,9 +5836,9 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); - kfree(dev->_tx); + flex_array_free(dev->_tx); #ifdef CONFIG_RPS - kfree(dev->_rx); + flex_array_free(dev->_rx); #endif free_p: @@ -5836,9 +5861,9 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); - kfree(dev->_tx); + flex_array_free(dev->_tx); #ifdef CONFIG_RPS - kfree(dev->_rx); + flex_array_free(dev->_rx); #endif kfree(rcu_dereference_protected(dev->ingress_queue, 1)); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 981fed3..4328c3a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -22,6 +22,7 @@ #include <linux/export.h> #include <linux/jiffies.h> #include <linux/pm_runtime.h> +#include <linux/flex_array.h> #include "net-sysfs.h" @@ -717,7 +718,7 @@ static struct kobj_type rx_queue_ktype = { static int rx_queue_add_kobject(struct net_device *net, int index) { - struct netdev_rx_queue *queue = net->_rx + index; + struct netdev_rx_queue *queue = flex_array_get(net->_rx, index); struct kobject *kobj = &queue->kobj; int error = 0; @@ -751,8 +752,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } } - while (--i >= new_num) - kobject_put(&net->_rx[i].kobj); + while (--i >= new_num) { + struct netdev_rx_queue *rxq = flex_array_get(net->_rx, i); + kobject_put(&rxq->kobj); + } return error; #else @@ -939,7 +942,7 @@ static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) int i; for (i = 0; i < dev->num_tx_queues; i++) - if (queue == &dev->_tx[i]) + if (queue == (struct netdev_queue 
*)flex_array_get(dev->_tx, i)) break; BUG_ON(i >= dev->num_tx_queues); @@ -1051,7 +1054,7 @@ static struct kobj_type netdev_queue_ktype = { static int netdev_queue_add_kobject(struct net_device *net, int index) { - struct netdev_queue *queue = net->_tx + index; + struct netdev_queue *queue = flex_array_get(net->_tx, index); struct kobject *kobj = &queue->kobj; int error = 0; @@ -1093,7 +1096,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - struct netdev_queue *queue = net->_tx + i; + struct netdev_queue *queue = flex_array_get(net->_tx, i); #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group);
Currently, we use kcalloc to allocate rx/tx queues for a net device, which could easily lead to a high-order memory allocation request when initializing a multiqueue net device. We can simply avoid this by switching to a flex array, which always allocates at order zero. Signed-off-by: Jason Wang <jasowang@redhat.com> --- include/linux/netdevice.h | 13 ++++++---- net/core/dev.c | 57 ++++++++++++++++++++++++++++++++------------ net/core/net-sysfs.c | 15 +++++++---- 3 files changed, 58 insertions(+), 27 deletions(-)