Message ID | 156026784005.26748.1807371376992707392.stgit@alrua-x1 |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | xdp: Allow lookup into devmaps before redirect | expand |
On Tue, Jun 11, 2019 at 9:42 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote: > > From: Toke Høiland-Jørgensen <toke@redhat.com> > > The socket map uses a linked list instead of a bitmap to keep track of > which entries to flush. Do the same for devmap and cpumap, as this means we > don't have to care about the map index when enqueueing things into the > map (and so we can cache the map lookup). > > Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> > --- > kernel/bpf/cpumap.c | 97 ++++++++++++++++++++++++--------------------------- > kernel/bpf/devmap.c | 93 ++++++++++++++++++++++--------------------------- > net/core/filter.c | 2 - > 3 files changed, 87 insertions(+), 105 deletions(-) > > diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c > index b31a71909307..02dc3503ef4d 100644 > --- a/kernel/bpf/cpumap.c > +++ b/kernel/bpf/cpumap.c > @@ -38,8 +38,13 @@ > */ > > #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ > +struct bpf_cpu_map_entry; > +struct bpf_cpu_map; > + > struct xdp_bulk_queue { > void *q[CPU_MAP_BULK_SIZE]; > + struct list_head flush_node; > + struct bpf_cpu_map_entry *obj; > unsigned int count; > }; > > @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { > /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ > struct xdp_bulk_queue __percpu *bulkq; > > + struct bpf_cpu_map *cmap; > + > /* Queue with potential multi-producers, and single-consumer kthread */ > struct ptr_ring *queue; > struct task_struct *kthread; > @@ -65,23 +72,17 @@ struct bpf_cpu_map { > struct bpf_map map; > /* Below members specific for map type */ > struct bpf_cpu_map_entry **cpu_map; > - unsigned long __percpu *flush_needed; > + struct list_head __percpu *flush_list; > }; > > -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, > - struct xdp_bulk_queue *bq, bool in_napi_ctx); > - > -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) > -{ > - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); > -} > +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); > > static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) > { > struct bpf_cpu_map *cmap; > int err = -ENOMEM; > + int ret, cpu; > u64 cost; > - int ret; > > if (!capable(CAP_SYS_ADMIN)) > return ERR_PTR(-EPERM); > @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) > > /* make sure page count doesn't overflow */ > cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); > - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); > + cost += sizeof(struct list_head) * num_possible_cpus(); > > /* Notice returns -EPERM on if map size is larger than memlock limit */ > ret = bpf_map_charge_init(&cmap->map.memory, cost); > @@ -115,11 +116,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) > } > > /* A per cpu bitfield with a bit per possible CPU in map */ Comment is wrong now. > - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), > - __alignof__(unsigned long)); > - if (!cmap->flush_needed) > + cmap->flush_list = alloc_percpu(struct list_head); > + if (!cmap->flush_list) > goto free_charge; > > + for_each_possible_cpu(cpu) > + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); > + > /* Alloc array for possible remote "destination" CPUs */ > cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * > sizeof(struct bpf_cpu_map_entry *), > @@ -129,7 +132,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) > > return &cmap->map; > free_percpu: > - free_percpu(cmap->flush_needed); > + free_percpu(cmap->flush_list); > free_charge: > bpf_map_charge_finish(&cmap->map.memory); > free_cmap: > @@ -331,7 +334,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, > { > gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; > struct bpf_cpu_map_entry *rcpu; > - int numa, err; > + struct xdp_bulk_queue *bq; > + int numa, err, i; > > /* Have map->numa_node, but choose node of redirect target CPU */ > numa = cpu_to_node(cpu); > @@ -346,6 +350,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, > if (!rcpu->bulkq) > goto free_rcu; > > + for_each_possible_cpu(i) { > + bq = per_cpu_ptr(rcpu->bulkq, i); > + bq->obj = rcpu; > + } > + > /* Alloc queue */ > rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); > if (!rcpu->queue) > @@ -402,7 +411,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) > struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); > > /* No concurrent bq_enqueue can run at this point */ > - bq_flush_to_queue(rcpu, bq, false); > + bq_flush_to_queue(bq, false); > } > free_percpu(rcpu->bulkq); > /* Cannot kthread_stop() here, last put free rcpu resources */ > @@ -485,6 +494,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, > rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); > if (!rcpu) > return -ENOMEM; > + rcpu->cmap = cmap; > } > rcu_read_lock(); > __cpu_map_entry_replace(cmap, key_cpu, rcpu); > @@ -516,9 +526,9 @@ static void cpu_map_free(struct bpf_map *map) > * from the program we can assume no new bits will be set. > */ > for_each_online_cpu(cpu) { > - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); > + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); > > - while (!bitmap_empty(bitmap, cmap->map.max_entries)) > + while (!list_empty(flush_list)) > cond_resched(); > } > > @@ -535,7 +545,7 @@ static void cpu_map_free(struct bpf_map *map) > /* bq flush and cleanup happens after RCU graze-period */ > __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ > } > - free_percpu(cmap->flush_needed); > + free_percpu(cmap->flush_list); > bpf_map_area_free(cmap->cpu_map); > kfree(cmap); > } > @@ -587,9 +597,9 @@ const struct bpf_map_ops cpu_map_ops = { > .map_check_btf = map_check_no_btf, > }; > > -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, > - struct xdp_bulk_queue *bq, bool in_napi_ctx) > +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) > { > + struct bpf_cpu_map_entry *rcpu = bq->obj; > unsigned int processed = 0, drops = 0; > const int to_cpu = rcpu->cpu; > struct ptr_ring *q; > @@ -618,6 +628,9 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, > bq->count = 0; > spin_unlock(&q->producer_lock); > > + __list_del(bq->flush_node.prev, bq->flush_node.next); > + bq->flush_node.prev = NULL; > + > /* Feedback loop via tracepoints */ > trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); > return 0; > @@ -628,10 +641,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, > */ > static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) > { > + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); > struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); > > if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) > - bq_flush_to_queue(rcpu, bq, true); > + bq_flush_to_queue(bq, true); > > /* Notice, xdp_buff/page MUST be queued here, long enough for > * driver to code invoking us to finished, due to driver > @@ -643,6 +657,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) > * operation, when completing napi->poll call. > */ > bq->q[bq->count++] = xdpf; > + > + if (!bq->flush_node.prev) > + list_add(&bq->flush_node, flush_list); > + > return 0; > } > > @@ -662,41 +680,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, > return 0; > } > > -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) > -{ > - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); > - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); > - > - __set_bit(bit, bitmap); > -} > - > void __cpu_map_flush(struct bpf_map *map) > { > struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); > - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); > - u32 bit; > - > - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() > - * and __cpu_map_flush() happen on same CPU. Thus, the percpu > - * bitmap indicate which percpu bulkq have packets. > - */ > - for_each_set_bit(bit, bitmap, map->max_entries) { > - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); > - struct xdp_bulk_queue *bq; > - > - /* This is possible if entry is removed by user space > - * between xdp redirect and flush op. > - */ > - if (unlikely(!rcpu)) > - continue; > - > - __clear_bit(bit, bitmap); > + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); > + struct xdp_bulk_queue *bq, *tmp; > > - /* Flush all frames in bulkq to real queue */ > - bq = this_cpu_ptr(rcpu->bulkq); > - bq_flush_to_queue(rcpu, bq, true); > + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { > + bq_flush_to_queue(bq, true); > > /* If already running, costs spin_lock_irqsave + smb_mb */ > - wake_up_process(rcpu->kthread); > + wake_up_process(bq->obj->kthread); > } > } > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > index 5ae7cce5ef16..c945518225f3 100644 > --- a/kernel/bpf/devmap.c > +++ b/kernel/bpf/devmap.c > @@ -56,9 +56,13 @@ > (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) > > #define DEV_MAP_BULK_SIZE 16 > +struct bpf_dtab_netdev; > + > struct xdp_bulk_queue { > struct xdp_frame *q[DEV_MAP_BULK_SIZE]; > + struct list_head flush_node; > struct net_device *dev_rx; > + struct bpf_dtab_netdev *obj; > unsigned int count; > }; > > @@ -73,23 +77,19 @@ struct bpf_dtab_netdev { > struct bpf_dtab { > struct bpf_map map; > struct bpf_dtab_netdev **netdev_map; > - unsigned long __percpu *flush_needed; > + struct list_head __percpu *flush_list; > struct list_head list; > }; > > static DEFINE_SPINLOCK(dev_map_lock); > static LIST_HEAD(dev_map_list); > > -static u64 dev_map_bitmap_size(const union bpf_attr *attr) > -{ > - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); > -} > - > static struct bpf_map *dev_map_alloc(union bpf_attr *attr) > { > struct bpf_dtab *dtab; > int err = -EINVAL; > u64 cost; > + int cpu; > > if (!capable(CAP_NET_ADMIN)) > return ERR_PTR(-EPERM); > @@ -107,7 +107,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) > > /* make sure page count doesn't overflow */ > cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); > - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); > + cost += sizeof(struct list_head) * num_possible_cpus(); > > /* if map size is larger than memlock limit, reject it */ > err = bpf_map_charge_init(&dtab->map.memory, cost); > @@ -116,28 +116,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) > > err = -ENOMEM; > > - /* A per cpu bitfield with a bit per possible net device */ > - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), > - __alignof__(unsigned long), > - GFP_KERNEL | __GFP_NOWARN); > - if (!dtab->flush_needed) > + dtab->flush_list = alloc_percpu(struct list_head); Is it ok to lose __GFP_NOWARN bit, which was previously provided? > + if (!dtab->flush_list) > goto free_charge; > > + for_each_possible_cpu(cpu) > + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); > + > dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * > sizeof(struct bpf_dtab_netdev *), > dtab->map.numa_node); > if (!dtab->netdev_map) > - goto free_charge; > + goto free_percpu; > > spin_lock(&dev_map_lock); > list_add_tail_rcu(&dtab->list, &dev_map_list); > spin_unlock(&dev_map_lock); > > return &dtab->map; > + > +free_percpu: > + free_percpu(dtab->flush_list); > free_charge: > bpf_map_charge_finish(&dtab->map.memory); > free_dtab: > - free_percpu(dtab->flush_needed); > kfree(dtab); > return ERR_PTR(err); > } > @@ -171,9 +173,9 @@ static void dev_map_free(struct bpf_map *map) > * from the program we can assume no new bits will be set. > */ > for_each_online_cpu(cpu) { > - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); > + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); > > - while (!bitmap_empty(bitmap, dtab->map.max_entries)) > + while (!list_empty(flush_list)) > cond_resched(); > } > > @@ -188,7 +190,7 @@ static void dev_map_free(struct bpf_map *map) > kfree(dev); > } > > - free_percpu(dtab->flush_needed); > + free_percpu(dtab->flush_list); > bpf_map_area_free(dtab->netdev_map); > kfree(dtab); > } > @@ -210,18 +212,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) > return 0; > } > > -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) > -{ > - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); > - > - __set_bit(bit, bitmap); > -} > - > -static int bq_xmit_all(struct bpf_dtab_netdev *obj, > - struct xdp_bulk_queue *bq, u32 flags, > +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, > bool in_napi_ctx) > { > + extra line? > + struct bpf_dtab_netdev *obj = bq->obj; > struct net_device *dev = obj->dev; > int sent = 0, drops = 0, err = 0; > int i; > @@ -248,6 +243,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, > trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, > sent, drops, bq->dev_rx, dev, err); > bq->dev_rx = NULL; > + __list_del(bq->flush_node.prev, bq->flush_node.next); > + bq->flush_node.prev = NULL; > return 0; > error: > /* If ndo_xdp_xmit fails with an errno, no frames have been > @@ -276,24 +273,11 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, > void __dev_map_flush(struct bpf_map *map) > { > struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); > - u32 bit; > + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); > + struct xdp_bulk_queue *bq, *tmp; > > - for_each_set_bit(bit, bitmap, map->max_entries) { > - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); > - struct xdp_bulk_queue *bq; > - > - /* This is possible if the dev entry is removed by user space > - * between xdp redirect and flush op. > - */ > - if (unlikely(!dev)) > - continue; > - > - __clear_bit(bit, bitmap); > - > - bq = this_cpu_ptr(dev->bulkq); > - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); > - } > + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) > + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); > } > > /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or > @@ -319,10 +303,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, > struct net_device *dev_rx) > > { > + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); > struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); > > if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) > - bq_xmit_all(obj, bq, 0, true); > + bq_xmit_all(bq, 0, true); > > /* Ingress dev_rx will be the same for all xdp_frame's in > * bulk_queue, because bq stored per-CPU and must be flushed > @@ -332,6 +317,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, > bq->dev_rx = dev_rx; > > bq->q[bq->count++] = xdpf; > + > + if (!bq->flush_node.prev) > + list_add(&bq->flush_node, flush_list); > + > return 0; > } > > @@ -382,16 +371,11 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) > { > if (dev->dev->netdev_ops->ndo_xdp_xmit) { > struct xdp_bulk_queue *bq; > - unsigned long *bitmap; > - > int cpu; > > for_each_online_cpu(cpu) { > - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); > - __clear_bit(dev->bit, bitmap); > - > bq = per_cpu_ptr(dev->bulkq, cpu); > - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); > + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); > } > } > } > @@ -439,6 +423,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, > struct bpf_dtab_netdev *dev, *old_dev; > u32 i = *(u32 *)key; > u32 ifindex = *(u32 *)value; can you please fix reverse Christmas tree order, while you are here? > + struct xdp_bulk_queue *bq; > + int cpu; > > if (unlikely(map_flags > BPF_EXIST)) > return -EINVAL; > @@ -461,6 +447,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, > return -ENOMEM; > } > > + for_each_possible_cpu(cpu) { > + bq = per_cpu_ptr(dev->bulkq, cpu); > + bq->obj = dev; > + } > + > dev->dev = dev_get_by_index(net, ifindex); > if (!dev->dev) { > free_percpu(dev->bulkq); > diff --git a/net/core/filter.c b/net/core/filter.c > index 55bfc941d17a..7a996887c500 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -3526,7 +3526,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, > err = dev_map_enqueue(dst, xdp, dev_rx); > if (unlikely(err)) > return err; > - __dev_map_insert_ctx(map, index); > break; > } > case BPF_MAP_TYPE_CPUMAP: { > @@ -3535,7 +3534,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, > err = cpu_map_enqueue(rcpu, xdp, dev_rx); > if (unlikely(err)) > return err; > - __cpu_map_insert_ctx(map, index); > break; > } > case BPF_MAP_TYPE_XSKMAP: { >
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: > On Tue, Jun 11, 2019 at 9:42 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote: >> >> From: Toke Høiland-Jørgensen <toke@redhat.com> >> >> The socket map uses a linked list instead of a bitmap to keep track of >> which entries to flush. Do the same for devmap and cpumap, as this means we >> don't have to care about the map index when enqueueing things into the >> map (and so we can cache the map lookup). >> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> >> --- >> kernel/bpf/cpumap.c | 97 ++++++++++++++++++++++++--------------------------- >> kernel/bpf/devmap.c | 93 ++++++++++++++++++++++--------------------------- >> net/core/filter.c | 2 - >> 3 files changed, 87 insertions(+), 105 deletions(-) >> >> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c >> index b31a71909307..02dc3503ef4d 100644 >> --- a/kernel/bpf/cpumap.c >> +++ b/kernel/bpf/cpumap.c >> @@ -38,8 +38,13 @@ >> */ >> >> #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ >> +struct bpf_cpu_map_entry; >> +struct bpf_cpu_map; >> + >> struct xdp_bulk_queue { >> void *q[CPU_MAP_BULK_SIZE]; >> + struct list_head flush_node; >> + struct bpf_cpu_map_entry *obj; >> unsigned int count; >> }; >> >> @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { >> /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ >> struct xdp_bulk_queue __percpu *bulkq; >> >> + struct bpf_cpu_map *cmap; >> + >> /* Queue with potential multi-producers, and single-consumer kthread */ >> struct ptr_ring *queue; >> struct task_struct *kthread; >> @@ -65,23 +72,17 @@ struct bpf_cpu_map { >> struct bpf_map map; >> /* Below members specific for map type */ >> struct bpf_cpu_map_entry **cpu_map; >> - unsigned long __percpu *flush_needed; >> + struct list_head __percpu *flush_list; >> }; >> >> -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, >> - struct xdp_bulk_queue *bq, bool in_napi_ctx); >> - >> -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) >> -{ >> - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); >> -} >> +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); >> >> static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) >> { >> struct bpf_cpu_map *cmap; >> int err = -ENOMEM; >> + int ret, cpu; >> u64 cost; >> - int ret; >> >> if (!capable(CAP_SYS_ADMIN)) >> return ERR_PTR(-EPERM); >> @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) >> >> /* make sure page count doesn't overflow */ >> cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); >> - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); >> + cost += sizeof(struct list_head) * num_possible_cpus(); >> >> /* Notice returns -EPERM on if map size is larger than memlock limit */ >> ret = bpf_map_charge_init(&cmap->map.memory, cost); >> @@ -115,11 +116,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) >> } >> >> /* A per cpu bitfield with a bit per possible CPU in map */ > > Comment is wrong now. Good catch! And not the only comment that is wrong; will fix them all. >> - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), >> - __alignof__(unsigned long)); >> - if (!cmap->flush_needed) >> + cmap->flush_list = alloc_percpu(struct list_head); >> + if (!cmap->flush_list) >> goto free_charge; >> >> + for_each_possible_cpu(cpu) >> + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); >> + >> /* Alloc array for possible remote "destination" CPUs */ >> cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * >> sizeof(struct bpf_cpu_map_entry *), >> @@ -129,7 +132,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) >> >> return &cmap->map; >> free_percpu: >> - free_percpu(cmap->flush_needed); >> + free_percpu(cmap->flush_list); >> free_charge: >> bpf_map_charge_finish(&cmap->map.memory); >> free_cmap: >> @@ -331,7 +334,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, >> { >> gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; >> struct bpf_cpu_map_entry *rcpu; >> - int numa, err; >> + struct xdp_bulk_queue *bq; >> + int numa, err, i; >> >> /* Have map->numa_node, but choose node of redirect target CPU */ >> numa = cpu_to_node(cpu); >> @@ -346,6 +350,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, >> if (!rcpu->bulkq) >> goto free_rcu; >> >> + for_each_possible_cpu(i) { >> + bq = per_cpu_ptr(rcpu->bulkq, i); >> + bq->obj = rcpu; >> + } >> + >> /* Alloc queue */ >> rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); >> if (!rcpu->queue) >> @@ -402,7 +411,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) >> struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); >> >> /* No concurrent bq_enqueue can run at this point */ >> - bq_flush_to_queue(rcpu, bq, false); >> + bq_flush_to_queue(bq, false); >> } >> free_percpu(rcpu->bulkq); >> /* Cannot kthread_stop() here, last put free rcpu resources */ >> @@ -485,6 +494,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, >> rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); >> if (!rcpu) >> return -ENOMEM; >> + rcpu->cmap = cmap; >> } >> rcu_read_lock(); >> __cpu_map_entry_replace(cmap, key_cpu, rcpu); >> @@ -516,9 +526,9 @@ static void cpu_map_free(struct bpf_map *map) >> * from the program we can assume no new bits will be set. >> */ >> for_each_online_cpu(cpu) { >> - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); >> + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); >> >> - while (!bitmap_empty(bitmap, cmap->map.max_entries)) >> + while (!list_empty(flush_list)) >> cond_resched(); >> } >> >> @@ -535,7 +545,7 @@ static void cpu_map_free(struct bpf_map *map) >> /* bq flush and cleanup happens after RCU graze-period */ >> __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ >> } >> - free_percpu(cmap->flush_needed); >> + free_percpu(cmap->flush_list); >> bpf_map_area_free(cmap->cpu_map); >> kfree(cmap); >> } >> @@ -587,9 +597,9 @@ const struct bpf_map_ops cpu_map_ops = { >> .map_check_btf = map_check_no_btf, >> }; >> >> -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, >> - struct xdp_bulk_queue *bq, bool in_napi_ctx) >> +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) >> { >> + struct bpf_cpu_map_entry *rcpu = bq->obj; >> unsigned int processed = 0, drops = 0; >> const int to_cpu = rcpu->cpu; >> struct ptr_ring *q; >> @@ -618,6 +628,9 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, >> bq->count = 0; >> spin_unlock(&q->producer_lock); >> >> + __list_del(bq->flush_node.prev, bq->flush_node.next); >> + bq->flush_node.prev = NULL; >> + >> /* Feedback loop via tracepoints */ >> trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); >> return 0; >> @@ -628,10 +641,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, >> */ >> static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) >> { >> + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); >> struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); >> >> if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) >> - bq_flush_to_queue(rcpu, bq, true); >> + bq_flush_to_queue(bq, true); >> >> /* Notice, xdp_buff/page MUST be queued here, long enough for >> * driver to code invoking us to finished, due to driver >> @@ -643,6 +657,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) >> * operation, when completing napi->poll call. >> */ >> bq->q[bq->count++] = xdpf; >> + >> + if (!bq->flush_node.prev) >> + list_add(&bq->flush_node, flush_list); >> + >> return 0; >> } >> >> @@ -662,41 +680,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, >> return 0; >> } >> >> -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) >> -{ >> - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); >> - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); >> - >> - __set_bit(bit, bitmap); >> -} >> - >> void __cpu_map_flush(struct bpf_map *map) >> { >> struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); >> - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); >> - u32 bit; >> - >> - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() >> - * and __cpu_map_flush() happen on same CPU. Thus, the percpu >> - * bitmap indicate which percpu bulkq have packets. >> - */ >> - for_each_set_bit(bit, bitmap, map->max_entries) { >> - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); >> - struct xdp_bulk_queue *bq; >> - >> - /* This is possible if entry is removed by user space >> - * between xdp redirect and flush op. >> - */ >> - if (unlikely(!rcpu)) >> - continue; >> - >> - __clear_bit(bit, bitmap); >> + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); >> + struct xdp_bulk_queue *bq, *tmp; >> >> - /* Flush all frames in bulkq to real queue */ >> - bq = this_cpu_ptr(rcpu->bulkq); >> - bq_flush_to_queue(rcpu, bq, true); >> + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { >> + bq_flush_to_queue(bq, true); >> >> /* If already running, costs spin_lock_irqsave + smb_mb */ >> - wake_up_process(rcpu->kthread); >> + wake_up_process(bq->obj->kthread); >> } >> } >> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c >> index 5ae7cce5ef16..c945518225f3 100644 >> --- a/kernel/bpf/devmap.c >> +++ b/kernel/bpf/devmap.c >> @@ -56,9 +56,13 @@ >> (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) >> >> #define DEV_MAP_BULK_SIZE 16 >> +struct bpf_dtab_netdev; >> + >> struct xdp_bulk_queue { >> struct xdp_frame *q[DEV_MAP_BULK_SIZE]; >> + struct list_head flush_node; >> struct net_device *dev_rx; >> + struct bpf_dtab_netdev *obj; >> unsigned int count; >> }; >> >> @@ -73,23 +77,19 @@ struct bpf_dtab_netdev { >> struct bpf_dtab { >> struct bpf_map map; >> struct bpf_dtab_netdev **netdev_map; >> - unsigned long __percpu *flush_needed; >> + struct list_head __percpu *flush_list; >> struct list_head list; >> }; >> >> static DEFINE_SPINLOCK(dev_map_lock); >> static LIST_HEAD(dev_map_list); >> >> -static u64 dev_map_bitmap_size(const union bpf_attr *attr) >> -{ >> - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); >> -} >> - >> static struct bpf_map *dev_map_alloc(union bpf_attr *attr) >> { >> struct bpf_dtab *dtab; >> int err = -EINVAL; >> u64 cost; >> + int cpu; >> >> if (!capable(CAP_NET_ADMIN)) >> return ERR_PTR(-EPERM); >> @@ -107,7 +107,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) >> >> /* make sure page count doesn't overflow */ >> cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); >> - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); >> + cost += sizeof(struct list_head) * num_possible_cpus(); >> >> /* if map size is larger than memlock limit, reject it */ >> err = bpf_map_charge_init(&dtab->map.memory, cost); >> @@ -116,28 +116,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) >> >> err = -ENOMEM; >> >> - /* A per cpu bitfield with a bit per possible net device */ >> - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), >> - __alignof__(unsigned long), >> - GFP_KERNEL | __GFP_NOWARN); >> - if (!dtab->flush_needed) >> + dtab->flush_list = alloc_percpu(struct list_head); > > Is it ok to lose __GFP_NOWARN bit, which was previously provided? Yeah, I believe so. This is a pretty standard allocation failure, so no real reason to mask the warning. The devmap is the only place that sets the NOWARN bit anyway (neither cpumap nor xskmap do), so I think it's probably just a coincidental setting here... >> + if (!dtab->flush_list) >> goto free_charge; >> >> + for_each_possible_cpu(cpu) >> + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); >> + >> dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * >> sizeof(struct bpf_dtab_netdev *), >> dtab->map.numa_node); >> if (!dtab->netdev_map) >> - goto free_charge; >> + goto free_percpu; >> >> spin_lock(&dev_map_lock); >> list_add_tail_rcu(&dtab->list, &dev_map_list); >> spin_unlock(&dev_map_lock); >> >> return &dtab->map; >> + >> +free_percpu: >> + free_percpu(dtab->flush_list); >> free_charge: >> bpf_map_charge_finish(&dtab->map.memory); >> free_dtab: >> - free_percpu(dtab->flush_needed); >> kfree(dtab); >> return ERR_PTR(err); >> } >> @@ -171,9 +173,9 @@ static void dev_map_free(struct bpf_map *map) >> * from the program we can assume no new bits will be set. >> */ >> for_each_online_cpu(cpu) { >> - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); >> + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); >> >> - while (!bitmap_empty(bitmap, dtab->map.max_entries)) >> + while (!list_empty(flush_list)) >> cond_resched(); >> } >> >> @@ -188,7 +190,7 @@ static void dev_map_free(struct bpf_map *map) >> kfree(dev); >> } >> >> - free_percpu(dtab->flush_needed); >> + free_percpu(dtab->flush_list); >> bpf_map_area_free(dtab->netdev_map); >> kfree(dtab); >> } >> @@ -210,18 +212,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) >> return 0; >> } >> >> -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) >> -{ >> - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); >> - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); >> - >> - __set_bit(bit, bitmap); >> -} >> - >> -static int bq_xmit_all(struct bpf_dtab_netdev *obj, >> - struct xdp_bulk_queue *bq, u32 flags, >> +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, >> bool in_napi_ctx) >> { >> + > > extra line? Right, will remove. >> + struct bpf_dtab_netdev *obj = bq->obj; >> struct net_device *dev = obj->dev; >> int sent = 0, drops = 0, err = 0; >> int i; >> @@ -248,6 +243,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, >> trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, >> sent, drops, bq->dev_rx, dev, err); >> bq->dev_rx = NULL; >> + __list_del(bq->flush_node.prev, bq->flush_node.next); >> + bq->flush_node.prev = NULL; >> return 0; >> error: >> /* If ndo_xdp_xmit fails with an errno, no frames have been >> @@ -276,24 +273,11 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, >> void __dev_map_flush(struct bpf_map *map) >> { >> struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); >> - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); >> - u32 bit; >> + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); >> + struct xdp_bulk_queue *bq, *tmp; >> >> - for_each_set_bit(bit, bitmap, map->max_entries) { >> - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); >> - struct xdp_bulk_queue *bq; >> - >> - /* This is possible if the dev entry is removed by user space >> - * between xdp redirect and flush op. >> - */ >> - if (unlikely(!dev)) >> - continue; >> - >> - __clear_bit(bit, bitmap); >> - >> - bq = this_cpu_ptr(dev->bulkq); >> - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); >> - } >> + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) >> + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); >> } >> >> /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or >> @@ -319,10 +303,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, >> struct net_device *dev_rx) >> >> { >> + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); >> struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); >> >> if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) >> - bq_xmit_all(obj, bq, 0, true); >> + bq_xmit_all(bq, 0, true); >> >> /* Ingress dev_rx will be the same for all xdp_frame's in >> * bulk_queue, because bq stored per-CPU and must be flushed >> @@ -332,6 +317,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, >> bq->dev_rx = dev_rx; >> >> bq->q[bq->count++] = xdpf; >> + >> + if (!bq->flush_node.prev) >> + list_add(&bq->flush_node, flush_list); >> + >> return 0; >> } >> >> @@ -382,16 +371,11 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) >> { >> if (dev->dev->netdev_ops->ndo_xdp_xmit) { >> struct xdp_bulk_queue *bq; >> - unsigned long *bitmap; >> - >> int cpu; >> >> for_each_online_cpu(cpu) { >> - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); >> - __clear_bit(dev->bit, bitmap); >> - >> bq = per_cpu_ptr(dev->bulkq, cpu); >> - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); >> + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); >> } >> } >> } >> @@ -439,6 +423,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, >> struct bpf_dtab_netdev *dev, *old_dev; >> u32 i = *(u32 *)key; >> u32 ifindex = *(u32 *)value; > > can you please fix reverse Christmas tree order, while you are here? Sure! >> + struct xdp_bulk_queue *bq; >> + int cpu; >> >> if (unlikely(map_flags > BPF_EXIST)) >> return -EINVAL; >> @@ -461,6 +447,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, >> return -ENOMEM; >> } >> >> + for_each_possible_cpu(cpu) { >> + bq = per_cpu_ptr(dev->bulkq, cpu); >> + bq->obj = dev; >> + } >> + >> dev->dev = dev_get_by_index(net, ifindex); >> if (!dev->dev) { >> free_percpu(dev->bulkq); >> diff --git a/net/core/filter.c b/net/core/filter.c >> index 55bfc941d17a..7a996887c500 100644 >> --- a/net/core/filter.c >> +++ b/net/core/filter.c >> @@ -3526,7 +3526,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, >> err = dev_map_enqueue(dst, xdp, dev_rx); >> if (unlikely(err)) >> return err; >> - __dev_map_insert_ctx(map, index); >> break; >> } >> case BPF_MAP_TYPE_CPUMAP: { >> @@ -3535,7 +3534,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, >> err = cpu_map_enqueue(rcpu, xdp, dev_rx); >> if (unlikely(err)) >> return err; >> - __cpu_map_insert_ctx(map, index); >> break; >> } >> case BPF_MAP_TYPE_XSKMAP: { >> -Toke
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b31a71909307..02dc3503ef4d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -38,8 +38,13 @@ */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct bpf_cpu_map_entry; +struct bpf_cpu_map; + struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; + struct list_head flush_node; + struct bpf_cpu_map_entry *obj; unsigned int count; }; @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; + struct bpf_cpu_map *cmap; + /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -65,23 +72,17 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; + int ret, cpu; u64 cost; - int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -115,11 +116,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) } /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) + cmap->flush_list = alloc_percpu(struct list_head); + if (!cmap->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); + /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), @@ -129,7 +132,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -331,7 +334,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; - int numa, err; + struct xdp_bulk_queue *bq; + int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -346,6 +350,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->bulkq) goto free_rcu; + for_each_possible_cpu(i) { + bq = per_cpu_ptr(rcpu->bulkq, i); + bq->obj = rcpu; + } + /* Alloc queue */ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) @@ -402,7 +411,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq, false); + bq_flush_to_queue(bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -485,6 +494,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); if (!rcpu) return -ENOMEM; + rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -516,9 +526,9 @@ static void cpu_map_free(struct bpf_map *map) * from the program we can assume no new bits will be set. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - while (!bitmap_empty(bitmap, cmap->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -535,7 +545,7 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU graze-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -587,9 +597,9 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) { + struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; @@ -618,6 +628,9 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, bq->count = 0; spin_unlock(&q->producer_lock); + __list_del(bq->flush_node.prev, bq->flush_node.next); + bq->flush_node.prev = NULL; + /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; @@ -628,10 +641,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq, true); + bq_flush_to_queue(bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -643,6 +657,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -662,41 +680,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - - __set_bit(bit, bitmap); -} - void __cpu_map_flush(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - u32 bit; - - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() - * and __cpu_map_flush() happen on same CPU. Thus, the percpu - * bitmap indicate which percpu bulkq have packets. - */ - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if entry is removed by user space - * between xdp redirect and flush op. - */ - if (unlikely(!rcpu)) - continue; - - __clear_bit(bit, bitmap); + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct xdp_bulk_queue *bq, *tmp; - /* Flush all frames in bulkq to real queue */ - bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq, true); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + bq_flush_to_queue(bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ - wake_up_process(rcpu->kthread); + wake_up_process(bq->obj->kthread); } } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 5ae7cce5ef16..c945518225f3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -56,9 +56,13 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +struct bpf_dtab_netdev; + struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct list_head flush_node; struct net_device *dev_rx; + struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -73,23 +77,19 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static u64 dev_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); -} - static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; int err = -EINVAL; u64 cost; + int cpu; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -107,7 +107,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); @@ -116,28 +116,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) err = -ENOMEM; - /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), - __alignof__(unsigned long), - GFP_KERNEL | __GFP_NOWARN); - if (!dtab->flush_needed) + dtab->flush_list = alloc_percpu(struct list_head); + if (!dtab->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + goto free_percpu; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; + +free_percpu: + free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); free_dtab: - free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -171,9 +173,9 @@ static void dev_map_free(struct bpf_map *map) * from the program we can assume no new bits will be set. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - while (!bitmap_empty(bitmap, dtab->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -188,7 +190,7 @@ static void dev_map_free(struct bpf_map *map) kfree(dev); } - free_percpu(dtab->flush_needed); + free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -210,18 +212,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - - __set_bit(bit, bitmap); -} - -static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags, +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { + + struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; int i; @@ -248,6 +243,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; + __list_del(bq->flush_node.prev, bq->flush_node.next); + bq->flush_node.prev = NULL; return 0; error: /* If ndo_xdp_xmit fails with an errno, no frames have been @@ -276,24 +273,11 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, void __dev_map_flush(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - u32 bit; + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct xdp_bulk_queue *bq, *tmp; - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if the dev entry is removed by user space - * between xdp redirect and flush op. - */ - if (unlikely(!dev)) - continue; - - __clear_bit(bit, bitmap); - - bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); - } + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -319,10 +303,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0, true); + bq_xmit_all(bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -332,6 +317,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -382,16 +371,11 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; - unsigned long *bitmap; - int cpu; for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); - __clear_bit(dev->bit, bitmap); - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); } } } @@ -439,6 +423,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_dtab_netdev *dev, *old_dev; u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; + struct xdp_bulk_queue *bq; + int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -461,6 +447,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; } + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { free_percpu(dev->bulkq); diff --git a/net/core/filter.c b/net/core/filter.c index 55bfc941d17a..7a996887c500 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3526,7 +3526,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = dev_map_enqueue(dst, xdp, dev_rx); if (unlikely(err)) return err; - __dev_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_CPUMAP: { @@ -3535,7 +3534,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (unlikely(err)) return err; - __cpu_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_XSKMAP: {