| Message ID | 1268318738.2986.518.camel@edumazet-laptop |
|---|---|
| State | Superseded, archived |
| Delegated to | David Miller |
Eric, thanks for the great comments!

> - When a netdevice is freed, I believe you leak rps_maps if they were
> previously allocated. I found following patch necessary to fix it.
>
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index a07d6ec..ef82b9d 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -596,7 +596,10 @@ static struct attribute *rx_queue_default_attrs[] = {
>  static void rx_queue_release(struct kobject *kobj)
>  {
>  	struct netdev_rx_queue *queue = to_rx_queue(kobj);
> +	struct rps_map *map = queue->rps_map;
>
> +	if (map)
> +		call_rcu(&map->rcu, rps_map_release);
>  	kfree(queue);
>  }

Yes.

> - memory leak in alloc_netdev_mq() if dev_addr_init() fails
> -> must use goto free_rx;
>
> - Maybe use max(RPS_MAP_SIZE(x), L1_CACHE_BYTES) for allocations to
> avoid false sharing for small maps. (Or else, other part of cache line
> might be given to a user that might dirty it at high rate)

Okay.

> - "struct netdev_rx_queue" being only read in fast path, I am not sure
> an alignment is necessary, apart from the false sharing that can be
> separately addressed.

If we get rid of the double indirection like you describe below, then
this can be allocated in the same way dev->_tx is (kcalloc). Also, this
might be a good structure for writing stats, etc.

> - A double indirection to get a struct netdev_rx_queue pointer is
> expensive, not for heavy load benchmarks, but for latencies in seldom
> used devices. I understand you added this for kobject deferred release
> and kfreeing, but this seems a high price to pay in our fast path. I
> used another solution.

Yes, this is much better. The kobject documentation has dire warnings
about placing more than one kobject in a data structure; I think I took
that too literally!

> - WARN(1, "Recieved packet on %s for queue %u, "
> -> Received

Yes, i before e except after c...

> - Use of cpumask_t for variable on stack is discouraged, we shall use
> cpumask_var_t if possible. Easy for show/store commands. (see my patch)

Yes.

> Yet, adding a memory allocation in net_rx_action() seems overkill, so we
> might use two static masks per cpu, and perform a flip ? This would help
> machines with 4096 cpus : To Be Done ?

That is an interesting idea. I'll try it.

> - RTNL to guard rps map exchange ???
> This sounds like BKL (Big Kernel Lock) syndrome to me :
> Trying to avoid it if possible is better, because it won't exist anymore
> in ten years with 0.99 probability.

Yes.

> For ease of discussion, I cooked following patch on top of yours :

I will apply it.

> Thanks !
>
>  include/linux/netdevice.h |    8 ++-
>  net/core/dev.c            |   64 ++++++++++-------------------
>  net/core/net-sysfs.c      |   78 ++++++++++++++++++++++--------------
>  3 files changed, 77 insertions(+), 73 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 468da0a..3f4a986 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -544,9 +544,11 @@ struct rps_map {
>
>  /* This structure contains an instance of an RX queue. */
>  struct netdev_rx_queue {
> -	struct kobject kobj;
>  	struct rps_map *rps_map;
> -} ____cacheline_aligned_in_smp;
> +	struct kobject kobj;
> +	struct netdev_rx_queue *first;
> +	atomic_t count;
> +};
>
>  /*
>   * This structure defines the management hooks for network devices.
> @@ -897,7 +899,7 @@ struct net_device {
>
>  	struct kset *queues_kset;
>
> -	struct netdev_rx_queue **_rx;
> +	struct netdev_rx_queue *_rx;
>
>  	/* Number of RX queues allocated at alloc_netdev_mq() time */
>  	unsigned int num_rx_queues;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 939b1a2..402f23a 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2195,15 +2195,15 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
>  		u16 index = skb_get_rx_queue(skb);
>  		if (unlikely(index >= dev->num_rx_queues)) {
>  			if (net_ratelimit()) {
> -				WARN(1, "Recieved packet on %s for queue %u, "
> +				WARN(1, "Received packet on %s for queue %u, "
>  					"but number of RX queues is %u\n",
>  					dev->name, index, dev->num_rx_queues);
>  			}
>  			goto done;
>  		}
> -		rxqueue = dev->_rx[index];
> +		rxqueue = dev->_rx + index;
>  	} else
> -		rxqueue = dev->_rx[0];
> +		rxqueue = dev->_rx;
>
>  	if (!rxqueue->rps_map)
>  		goto done;
> @@ -5236,20 +5236,17 @@ int register_netdevice(struct net_device *dev)
>  		 * alloc_netdev_mq
>  		 */
>
> -		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue *),
> -				   GFP_KERNEL);
> +		dev->_rx = kzalloc(max_t(unsigned,
> +					 sizeof(struct netdev_rx_queue),
> +					 L1_CACHE_BYTES),
> +				   GFP_KERNEL);
>  		if (!dev->_rx) {
>  			ret = -ENOMEM;
>  			goto out;
>  		}
>
> -		dev->_rx[0] = kzalloc(sizeof(struct netdev_rx_queue),
> -				      GFP_KERNEL);
> -		if (!dev->_rx[0]) {
> -			kfree(dev->_rx);
> -			ret = -ENOMEM;
> -			goto out;
> -		}
> +		dev->_rx[0].first = dev->_rx;
> +		atomic_set(&dev->_rx[0].count, 1);
>  		dev->num_rx_queues = 1;
>  	}
>
> @@ -5610,7 +5607,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  		void (*setup)(struct net_device *), unsigned int queue_count)
>  {
>  	struct netdev_queue *tx;
> -	struct netdev_rx_queue **rx;
> +	struct netdev_rx_queue *rx;
>  	struct net_device *dev;
>  	size_t alloc_size;
>  	struct net_device *p;
> @@ -5635,37 +5632,30 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>
>  	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
>  	if (!tx) {
> -		printk(KERN_ERR "alloc_netdev: Unable to allocate "
> -		       "tx qdiscs.\n");
> +		pr_err("alloc_netdev: Unable to allocate tx qdiscs.\n");
>  		goto free_p;
>  	}
>
> -	/*
> -	 * Allocate RX queue structures, this includes an array of pointers
> -	 * and netdev_queue_rx stuctures (individually allocated since
> -	 * each has a kobject).
> -	 */
> -	rx = kzalloc(queue_count *
> -		     sizeof(struct netdev_rx_queue *), GFP_KERNEL);
> +	rx = kzalloc(max_t(unsigned,
> +			   queue_count * sizeof(struct netdev_rx_queue),
> +			   L1_CACHE_BYTES),
> +		     GFP_KERNEL);
>  	if (!rx) {
> -		printk(KERN_ERR "alloc_netdev: Unable to allocate "
> -		       "rx queues array.\n");
> +		pr_err("alloc_netdev: Unable to allocate rx queues.\n");
>  		goto free_tx;
>  	}
> -	for (i = 0; i < queue_count; i++) {
> -		rx[i] = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
> -		if (!rx[i]) {
> -			printk(KERN_ERR "alloc_netdev: Unable to allocate "
> -			       "rx queues.\n");
> -			goto free_rx;
> -		}
> -	}
> +	atomic_set(&rx->count, queue_count);
> +	/*
> +	 * Use counter is located in first element of this array
> +	 */
> +	for (i = 0; i < queue_count; i++)
> +		rx[i].first = rx;
>
>  	dev = PTR_ALIGN(p, NETDEV_ALIGN);
>  	dev->padded = (char *)dev - (char *)p;
>
>  	if (dev_addr_init(dev))
> -		goto free_tx;
> +		goto free_rx;
>
>  	dev_unicast_init(dev);
>
> @@ -5693,8 +5683,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	return dev;
>
>  free_rx:
> -	for (i = 0; i < queue_count; i++)
> -		kfree(rx[i]);
>  	kfree(rx);
>  free_tx:
>  	kfree(tx);
> @@ -5720,12 +5708,6 @@ void free_netdev(struct net_device *dev)
>
>  	kfree(dev->_tx);
>
> -	/*
> -	 * Free RX queue pointer array. Actual netdev_rx_queue objects are
> -	 * freed by kobject release.
> -	 */
> -	kfree(dev->_rx);
> -
>  	/* Flush device addresses */
>  	dev_addr_flush(dev);
>
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index a07d6ec..de75258 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -516,24 +516,26 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue,
>  	size_t len = 0;
>  	struct rps_map *map;
>  	int i;
> -	cpumask_t mask;
> +	cpumask_var_t mask;
>
> -	cpus_clear(mask);
> +	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> +		return -ENOMEM;
>
>  	rcu_read_lock();
>  	map = rcu_dereference(queue->rps_map);
>  	if (map) {
>  		for (i = 0; i < map->len; i++)
> -			cpu_set(map->cpus[i], mask);
> +			cpumask_set_cpu(map->cpus[i], mask);
>
> -		len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
> +		len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
>  		if (PAGE_SIZE - len < 3) {
>  			rcu_read_unlock();
> +			free_cpumask_var(mask);
>  			return -EINVAL;
>  		}
>  	}
>  	rcu_read_unlock();
> -
> +	free_cpumask_var(mask);
>  	len += sprintf(buf + len, "\n");
>  	return len;
>  }
> @@ -550,38 +552,48 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  		      const char *buf, size_t len)
>  {
>  	struct rps_map *old_map, *map;
> -	cpumask_t mask;
> -	int err, cpu, i, weight;
> +	cpumask_var_t mask;
> +	int err, cpu, i;
> +	static DEFINE_SPINLOCK(rps_map_lock);
>
>  	if (!capable(CAP_NET_ADMIN))
>  		return -EPERM;
>
> -	err = bitmap_parse(buf, len, cpumask_bits(&mask), nr_cpumask_bits);
> -	if (err)
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
> +	if (err) {
> +		free_cpumask_var(mask);
>  		return err;
> +	}
> +	map = kzalloc(max_t(unsigned,
> +			    RPS_MAP_SIZE(cpumask_weight(mask)),
> +			    L1_CACHE_BYTES),
> +		      GFP_KERNEL);
> +	if (!map) {
> +		free_cpumask_var(mask);
> +		return -ENOMEM;
> +	}
> +	i = 0;
> +	for_each_cpu_and(cpu, mask, cpu_online_mask)
> +		map->cpus[i++] = cpu;
> +	if (i)
> +		map->len = i;
> +	else {
> +		kfree(map);
> +		map = NULL;
> +	}
>
> -	weight = cpumask_weight(&mask);
> -	if (weight) {
> -		map = kzalloc(RPS_MAP_SIZE(weight), GFP_KERNEL);
> -		if (!map)
> -			return -ENOMEM;
> -
> -		cpus_and(mask, mask, cpu_online_map);
> -
> -		i = 0;
> -		for_each_cpu_mask(cpu, mask)
> -			map->cpus[i++] = cpu;
> -		map->len = i;
> -	} else
> -		map = NULL;
> -
> -	rtnl_lock();
> +	spin_lock(&rps_map_lock);
>  	old_map = queue->rps_map;
>  	rcu_assign_pointer(queue->rps_map, map);
> -	rtnl_unlock();
> +	spin_unlock(&rps_map_lock);
>
>  	if (old_map)
>  		call_rcu(&old_map->rcu, rps_map_release);
>
> +	free_cpumask_var(mask);
>  	return len;
>  }
>
> @@ -596,8 +608,16 @@ static struct attribute *rx_queue_default_attrs[] = {
>  static void rx_queue_release(struct kobject *kobj)
>  {
>  	struct netdev_rx_queue *queue = to_rx_queue(kobj);
> +	struct rps_map *map = queue->rps_map;
> +	struct netdev_rx_queue *first = queue->first;
>
> -	kfree(queue);
> +	if (map)
> +		call_rcu(&map->rcu, rps_map_release);
> +	/*
> +	 * Free the array containing us only if all elems were released
> +	 */
> +	if (atomic_dec_and_test(&first->count))
> +		kfree(first);
>  }
>
>  static struct kobj_type rx_queue_ktype = {
> @@ -609,7 +629,7 @@ static struct kobj_type rx_queue_ktype = {
>  static int rx_queue_add_kobject(struct net_device *net, int index)
>  {
>  	int error = 0;
> -	struct netdev_rx_queue *queue = net->_rx[index];
> +	struct netdev_rx_queue *queue = net->_rx + index;
>  	struct kobject *kobj = &queue->kobj;
>
>  	kobj->kset = net->queues_kset;
> @@ -642,7 +662,7 @@ static int rx_queue_register_kobjects(struct net_device *net)
>
>  	if (error)
>  		while (--i >= 0)
> -			kobject_put(&net->_rx[i]->kobj);
> +			kobject_put(&net->_rx[i].kobj);
>
>  	return error;
>  }
> @@ -652,7 +672,7 @@ static void rx_queue_remove_kobjects(struct net_device *net)
>  {
>  	int i;
>
>  	for (i = 0; i < net->num_rx_queues; i++)
> -		kobject_put(&net->_rx[i]->kobj);
> +		kobject_put(&net->_rx[i].kobj);
>  	kset_unregister(net->queues_kset);
>  }
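The "two static masks per cpu, and perform a flip" idea above was left as To Be Done. A minimal sketch of what it might look like; the names (rps_mask_pair, rps_next_mask) are hypothetical and appear in neither patch, so this is only an illustration of the idea, not the eventual implementation:

#include <linux/cpumask.h>
#include <linux/percpu.h>

/* Two statically allocated masks per CPU; flip between them. */
struct rps_mask_pair {
	struct cpumask mask[2];
	int active;		/* index of the mask currently in use */
};

static DEFINE_PER_CPU(struct rps_mask_pair, rps_masks);

/*
 * Runs in softirq context (net_rx_action()), so the CPU cannot change
 * under us and no locking is needed: flip to the other static mask,
 * clear it, and hand it back for this round.  The previous round's
 * mask stays intact, which is why two masks are kept instead of one.
 */
static struct cpumask *rps_next_mask(void)
{
	struct rps_mask_pair *p = &__get_cpu_var(rps_masks);

	p->active ^= 1;
	cpumask_clear(&p->mask[p->active]);
	return &p->mask[p->active];
}

This avoids both a cpumask on the softirq stack (512 bytes with NR_CPUS=4096) and a GFP_KERNEL allocation in the fast path.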
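The first/count scheme in rx_queue_release() is the subtle part of the patch: the queues live in one contiguous array, each element points back at element 0, and whichever kobject is released last frees the whole array. A standalone userspace sketch of the same release logic (plain C11; rx_alloc/rx_release are illustrative names only, and the kernel version uses atomic_dec_and_test() rather than atomic_fetch_sub()):

#include <stdatomic.h>
#include <stdlib.h>

struct rx_queue {
	struct rx_queue *first;	/* element 0 of the containing array */
	atomic_int count;	/* use counter, meaningful in element 0 only */
};

static struct rx_queue *rx_alloc(unsigned int n)
{
	struct rx_queue *rx = calloc(n, sizeof(*rx));
	unsigned int i;

	if (!rx)
		return NULL;
	atomic_init(&rx->count, n);	/* counter lives in rx[0] */
	for (i = 0; i < n; i++)
		rx[i].first = rx;
	return rx;
}

/* Mirrors rx_queue_release(): drop one reference, free on the last. */
static void rx_release(struct rx_queue *queue)
{
	struct rx_queue *first = queue->first;

	if (atomic_fetch_sub(&first->count, 1) == 1)
		free(first);	/* this was the last outstanding element */
}

int main(void)
{
	struct rx_queue *rx = rx_alloc(4);
	unsigned int i;

	if (!rx)
		return 1;
	for (i = 0; i < 4; i++)
		rx_release(&rx[i]);	/* array is freed on the 4th call */
	return 0;
}

This is also why free_netdev() no longer kfree()s dev->_rx: ownership of the array transfers to the per-queue kobjects, and the last rx_queue_release() reclaims it.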