
[bpf-next,v2,4/6] xdp: Add devmap_hash map type for looking up devices by hashed index

Message ID: 156240283595.10171.8867063497268976931.stgit@alrua-x1
State: Changes Requested
Delegated to: BPF Maintainers
Series: xdp: Add devmap_hash map type

Commit Message

Toke Høiland-Jørgensen July 6, 2019, 8:47 a.m. UTC
From: Toke Høiland-Jørgensen <toke@redhat.com>

A common pattern when using xdp_redirect_map() is to create a device map
where the lookup key is simply ifindex. Because device maps are arrays,
this leaves holes in the map, and the map has to be sized to fit the
largest ifindex, regardless of how many devices are actually needed in
the map.

This patch adds a second type of device map where the key is looked up
using a hashmap, instead of being used as an array index. This allows maps
to be densely packed, so they can be smaller.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 include/linux/bpf.h        |    7 ++
 include/linux/bpf_types.h  |    1 
 include/trace/events/xdp.h |    3 -
 kernel/bpf/devmap.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c      |    2 
 net/core/filter.c          |    9 ++
 6 files changed, 211 insertions(+), 3 deletions(-)
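
For context, here is a minimal sketch of how a BPF program could use the new
map type once this lands. The bpf_map_def definition style, section names and
the "tx_port"/"xdp_redirect_hash" identifiers follow samples/bpf conventions
and are illustrative assumptions, not part of this patch:

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Keys can be densely packed; no need to size the map to the largest ifindex */
struct bpf_map_def SEC("maps") tx_port = {
	.type = BPF_MAP_TYPE_DEVMAP_HASH,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),	/* ifindex of the target device */
	.max_entries = 8,
};

SEC("xdp")
int xdp_redirect_hash(struct xdp_md *ctx)
{
	__u32 key = 0;	/* any key userspace populated, e.g. packed 0..N-1 */

	/* Redirect out whatever device userspace stored under 'key' */
	return bpf_redirect_map(&tx_port, key, 0);
}

char _license[] SEC("license") = "GPL";

The BPF side is identical to the existing array-based devmap; only the key
space changes, which is what allows the map itself to stay small.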

Comments

Y Song July 6, 2019, 7:14 p.m. UTC | #1
On Sat, Jul 6, 2019 at 1:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> From: Toke Høiland-Jørgensen <toke@redhat.com>
>
> A common pattern when using xdp_redirect_map() is to create a device map
> where the lookup key is simply ifindex. Because device maps are arrays,
> this leaves holes in the map, and the map has to be sized to fit the
> largest ifindex, regardless of how many devices are actually needed in
> the map.
>
> This patch adds a second type of device map where the key is looked up
> using a hashmap, instead of being used as an array index. This allows maps
> to be densely packed, so they can be smaller.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---
>  include/linux/bpf.h        |    7 ++
>  include/linux/bpf_types.h  |    1
>  include/trace/events/xdp.h |    3 -
>  kernel/bpf/devmap.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/verifier.c      |    2
>  net/core/filter.c          |    9 ++
>  6 files changed, 211 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index bfdb54dd2ad1..f9a506147c8a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -713,6 +713,7 @@ struct xdp_buff;
>  struct sk_buff;
>
>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
>  void __dev_map_flush(struct bpf_map *map);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>                     struct net_device *dev_rx);
> @@ -799,6 +800,12 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
>         return NULL;
>  }
>
> +static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
> +                                                            u32 key)
> +{
> +       return NULL;
> +}
> +
>  static inline void __dev_map_flush(struct bpf_map *map)
>  {
>  }
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index eec5aeeeaf92..36a9c2325176 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
>  #ifdef CONFIG_NET
>  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
> +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
>  #if defined(CONFIG_BPF_STREAM_PARSER)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index 68899fdc985b..8c8420230a10 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
>  #endif /* __DEVMAP_OBJ_TYPE */
>
>  #define devmap_ifindex(fwd, map)                               \
> -       ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?               \
> +       ((map->map_type == BPF_MAP_TYPE_DEVMAP ||               \
> +         map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ?          \
>           ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
>
>  #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)               \
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index a2fe16362129..341af02f049d 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -37,6 +37,12 @@
>   * notifier hook walks the map we know that new dev references can not be
>   * added by the user because core infrastructure ensures dev_get_by_index()
>   * calls will fail at this point.
> + *
> + * The devmap_hash type is a map type which interprets keys as ifindexes and
> + * indexes these using a hashmap. This allows maps that use ifindex as key to be
> + * densely packed instead of having holes in the lookup array for unused
> + * ifindexes. The setup and packet enqueue/send code is shared between the two
> + * types of devmap; only the lookup and insertion is different.
>   */
>  #include <linux/bpf.h>
>  #include <net/xdp.h>
> @@ -59,6 +65,7 @@ struct xdp_bulk_queue {
>
>  struct bpf_dtab_netdev {
>         struct net_device *dev; /* must be first member, due to tracepoint */
> +       struct hlist_node index_hlist;
>         struct bpf_dtab *dtab;
>         unsigned int idx; /* keep track of map index for tracepoint */
>         struct xdp_bulk_queue __percpu *bulkq;
> @@ -70,11 +77,29 @@ struct bpf_dtab {
>         struct bpf_dtab_netdev **netdev_map;
>         struct list_head __percpu *flush_list;
>         struct list_head list;
> +
> +       /* these are only used for DEVMAP_HASH type maps */
> +       unsigned int items;
> +       struct hlist_head *dev_index_head;
> +       spinlock_t index_lock;
>  };
>
>  static DEFINE_SPINLOCK(dev_map_lock);
>  static LIST_HEAD(dev_map_list);
>
> +static struct hlist_head *dev_map_create_hash(void)
> +{
> +       int i;
> +       struct hlist_head *hash;
> +
> +       hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
> +       if (hash != NULL)
> +               for (i = 0; i < NETDEV_HASHENTRIES; i++)
> +                       INIT_HLIST_HEAD(&hash[i]);
> +
> +       return hash;
> +}
> +
>  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>                             bool check_memlock)
>  {
> @@ -98,6 +123,9 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>         cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>         cost += sizeof(struct list_head) * num_possible_cpus();
>
> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
> +               cost += sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
> +
>         /* if map size is larger than memlock limit, reject it */
>         err = bpf_map_charge_init(&dtab->map.memory, cost);
>         if (err)
> @@ -116,8 +144,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>         if (!dtab->netdev_map)
>                 goto free_percpu;
>
> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
> +               dtab->dev_index_head = dev_map_create_hash();
> +               if (!dtab->dev_index_head)
> +                       goto free_map_area;
> +
> +               spin_lock_init(&dtab->index_lock);
> +       }
> +
>         return 0;
>
> +free_map_area:
> +       bpf_map_area_free(dtab->netdev_map);
>  free_percpu:
>         free_percpu(dtab->flush_list);
>  free_charge:
> @@ -199,6 +237,7 @@ static void dev_map_free(struct bpf_map *map)
>
>         free_percpu(dtab->flush_list);
>         bpf_map_area_free(dtab->netdev_map);
> +       kfree(dtab->dev_index_head);
>         kfree(dtab);
>  }
>
> @@ -219,6 +258,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>         return 0;
>  }
>
> +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
> +                                                   int idx)
> +{
> +       return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
> +}
> +
> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
> +{
> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +       struct hlist_head *head = dev_map_index_hash(dtab, key);
> +       struct bpf_dtab_netdev *dev;
> +
> +       hlist_for_each_entry_rcu(dev, head, index_hlist)
> +               if (dev->idx == key)
> +                       return dev;
> +
> +       return NULL;
> +}
> +
> +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
> +                                   void *next_key)
> +{
> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +       u32 idx, *next = next_key;
> +       struct bpf_dtab_netdev *dev, *next_dev;
> +       struct hlist_head *head;
> +       int i = 0;
> +
> +       if (!key)
> +               goto find_first;
> +
> +       idx = *(u32 *)key;
> +
> +       dev = __dev_map_hash_lookup_elem(map, idx);
> +       if (!dev)
> +               goto find_first;
> +
> +       next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
> +                                   struct bpf_dtab_netdev, index_hlist);

Just want to get a better understanding. Why do you want
hlist_entry_safe instead of hlist_entry?
Also, maybe rcu_dereference instead of rcu_dereference_raw?
dev_map_hash_get_next_key() is called in syscall.c within rcu_read_lock region.

> +
> +       if (next_dev) {
> +               *next = next_dev->idx;
> +               return 0;
> +       }
> +
> +       i = idx & (NETDEV_HASHENTRIES - 1);
> +       i++;
> +
> + find_first:
> +       for (; i < NETDEV_HASHENTRIES; i++) {
> +               head = dev_map_index_hash(dtab, i);
> +
> +               next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
> +                                           struct bpf_dtab_netdev,
> +                                           index_hlist);

ditto. The same question as the above.

> +               if (next_dev) {
> +                       *next = next_dev->idx;
> +                       return 0;
> +               }
> +       }
> +
> +       return -ENOENT;
> +}
> +
>  static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
>                        bool in_napi_ctx)
>  {
> @@ -374,6 +477,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
>         return dev ? &dev->ifindex : NULL;
>  }
>
> +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
> +{
> +       struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
> +                                                               *(u32 *)key);
> +       struct net_device *dev = obj ? obj->dev : NULL;

When could obj->dev be NULL?

> +
> +       return dev ? &dev->ifindex : NULL;
> +}
> +
>  static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>  {
>         if (dev->dev->netdev_ops->ndo_xdp_xmit) {
> @@ -423,6 +535,27 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
>         return 0;
>  }
>
> +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
> +{
> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +       struct bpf_dtab_netdev *old_dev;
> +       int k = *(u32 *)key;
> +
> +       old_dev = __dev_map_hash_lookup_elem(map, k);
> +       if (!old_dev)
> +               return 0;
> +
> +       spin_lock(&dtab->index_lock);
> +       if (!hlist_unhashed(&old_dev->index_hlist)) {
> +               dtab->items--;
> +               hlist_del_init_rcu(&old_dev->index_hlist);
> +       }
> +       spin_unlock(&dtab->index_lock);
> +
> +       call_rcu(&old_dev->rcu, __dev_map_entry_free);
> +       return 0;
> +}
> +
>  static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
>                                                     struct bpf_dtab *dtab,
>                                                     u32 ifindex,
> @@ -503,6 +636,55 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>                                      map, key, value, map_flags);
>  }
>
> +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
> +                                    void *key, void *value, u64 map_flags)
> +{
> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +       struct bpf_dtab_netdev *dev, *old_dev;
> +       u32 ifindex = *(u32 *)value;
> +       u32 idx = *(u32 *)key;
> +
> +       if (unlikely(map_flags > BPF_EXIST || !ifindex))
> +               return -EINVAL;
> +
> +       old_dev = __dev_map_hash_lookup_elem(map, idx);
> +       if (old_dev && (map_flags & BPF_NOEXIST))
> +               return -EEXIST;
> +
> +       dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
> +       if (IS_ERR(dev))
> +               return PTR_ERR(dev);
> +
> +       spin_lock(&dtab->index_lock);
> +
> +       if (old_dev) {
> +               hlist_del_rcu(&old_dev->index_hlist);
> +       } else {
> +               if (dtab->items >= dtab->map.max_entries) {
> +                       spin_unlock(&dtab->index_lock);
> +                       call_rcu(&dev->rcu, __dev_map_entry_free);
> +                       return -ENOSPC;
> +               }
> +               dtab->items++;
> +       }
> +
> +       hlist_add_head_rcu(&dev->index_hlist,
> +                          dev_map_index_hash(dtab, idx));
> +       spin_unlock(&dtab->index_lock);
> +
> +       if (old_dev)
> +               call_rcu(&old_dev->rcu, __dev_map_entry_free);
> +
> +       return 0;
> +}
> +
> +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
> +                                  u64 map_flags)
> +{
> +       return __dev_map_hash_update_elem(current->nsproxy->net_ns,
> +                                        map, key, value, map_flags);
> +}
> +
>  const struct bpf_map_ops dev_map_ops = {
>         .map_alloc = dev_map_alloc,
>         .map_free = dev_map_free,
> @@ -513,6 +695,16 @@ const struct bpf_map_ops dev_map_ops = {
>         .map_check_btf = map_check_no_btf,
>  };
>
> +const struct bpf_map_ops dev_map_hash_ops = {
> +       .map_alloc = dev_map_alloc,
> +       .map_free = dev_map_free,
> +       .map_get_next_key = dev_map_hash_get_next_key,
> +       .map_lookup_elem = dev_map_hash_lookup_elem,
> +       .map_update_elem = dev_map_hash_update_elem,
> +       .map_delete_elem = dev_map_hash_delete_elem,
> +       .map_check_btf = map_check_no_btf,
> +};
> +
>  static int dev_map_notification(struct notifier_block *notifier,
>                                 ulong event, void *ptr)
>  {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index a2e763703c30..231b9e22827c 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3460,6 +3460,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>                         goto error;
>                 break;
>         case BPF_MAP_TYPE_DEVMAP:
> +       case BPF_MAP_TYPE_DEVMAP_HASH:
>                 if (func_id != BPF_FUNC_redirect_map &&
>                     func_id != BPF_FUNC_map_lookup_elem)
>                         goto error;
> @@ -3542,6 +3543,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>                 break;
>         case BPF_FUNC_redirect_map:
>                 if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
> +                   map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
>                     map->map_type != BPF_MAP_TYPE_CPUMAP &&
>                     map->map_type != BPF_MAP_TYPE_XSKMAP)
>                         goto error;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 089aaea0ccc6..276961c4e0d4 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3517,7 +3517,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>         int err;
>
>         switch (map->map_type) {
> -       case BPF_MAP_TYPE_DEVMAP: {
> +       case BPF_MAP_TYPE_DEVMAP:
> +       case BPF_MAP_TYPE_DEVMAP_HASH: {
>                 struct bpf_dtab_netdev *dst = fwd;
>
>                 err = dev_map_enqueue(dst, xdp, dev_rx);
> @@ -3554,6 +3555,7 @@ void xdp_do_flush_map(void)
>         if (map) {
>                 switch (map->map_type) {
>                 case BPF_MAP_TYPE_DEVMAP:
> +               case BPF_MAP_TYPE_DEVMAP_HASH:
>                         __dev_map_flush(map);
>                         break;
>                 case BPF_MAP_TYPE_CPUMAP:
> @@ -3574,6 +3576,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
>         switch (map->map_type) {
>         case BPF_MAP_TYPE_DEVMAP:
>                 return __dev_map_lookup_elem(map, index);
> +       case BPF_MAP_TYPE_DEVMAP_HASH:
> +               return __dev_map_hash_lookup_elem(map, index);
>         case BPF_MAP_TYPE_CPUMAP:
>                 return __cpu_map_lookup_elem(map, index);
>         case BPF_MAP_TYPE_XSKMAP:
> @@ -3655,7 +3659,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
>         ri->tgt_value = NULL;
>         WRITE_ONCE(ri->map, NULL);
>
> -       if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
> +       if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
> +           map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
>                 struct bpf_dtab_netdev *dst = fwd;
>
>                 err = dev_map_generic_redirect(dst, skb, xdp_prog);
>
Toke Høiland-Jørgensen July 8, 2019, 9:55 a.m. UTC | #2
Y Song <ys114321@gmail.com> writes:

> On Sat, Jul 6, 2019 at 1:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> From: Toke Høiland-Jørgensen <toke@redhat.com>
>>
>> A common pattern when using xdp_redirect_map() is to create a device map
>> where the lookup key is simply ifindex. Because device maps are arrays,
>> this leaves holes in the map, and the map has to be sized to fit the
>> largest ifindex, regardless of how many devices are actually needed in
>> the map.
>>
>> This patch adds a second type of device map where the key is looked up
>> using a hashmap, instead of being used as an array index. This allows maps
>> to be densely packed, so they can be smaller.
>>
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> ---
>>  include/linux/bpf.h        |    7 ++
>>  include/linux/bpf_types.h  |    1
>>  include/trace/events/xdp.h |    3 -
>>  kernel/bpf/devmap.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
>>  kernel/bpf/verifier.c      |    2
>>  net/core/filter.c          |    9 ++
>>  6 files changed, 211 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index bfdb54dd2ad1..f9a506147c8a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -713,6 +713,7 @@ struct xdp_buff;
>>  struct sk_buff;
>>
>>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
>> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
>>  void __dev_map_flush(struct bpf_map *map);
>>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>>                     struct net_device *dev_rx);
>> @@ -799,6 +800,12 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
>>         return NULL;
>>  }
>>
>> +static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
>> +                                                            u32 key)
>> +{
>> +       return NULL;
>> +}
>> +
>>  static inline void __dev_map_flush(struct bpf_map *map)
>>  {
>>  }
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index eec5aeeeaf92..36a9c2325176 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
>>  #ifdef CONFIG_NET
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
>> +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
>>  #if defined(CONFIG_BPF_STREAM_PARSER)
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
>> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
>> index 68899fdc985b..8c8420230a10 100644
>> --- a/include/trace/events/xdp.h
>> +++ b/include/trace/events/xdp.h
>> @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
>>  #endif /* __DEVMAP_OBJ_TYPE */
>>
>>  #define devmap_ifindex(fwd, map)                               \
>> -       ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?               \
>> +       ((map->map_type == BPF_MAP_TYPE_DEVMAP ||               \
>> +         map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ?          \
>>           ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
>>
>>  #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)               \
>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> index a2fe16362129..341af02f049d 100644
>> --- a/kernel/bpf/devmap.c
>> +++ b/kernel/bpf/devmap.c
>> @@ -37,6 +37,12 @@
>>   * notifier hook walks the map we know that new dev references can not be
>>   * added by the user because core infrastructure ensures dev_get_by_index()
>>   * calls will fail at this point.
>> + *
>> + * The devmap_hash type is a map type which interprets keys as ifindexes and
>> + * indexes these using a hashmap. This allows maps that use ifindex as key to be
>> + * densely packed instead of having holes in the lookup array for unused
>> + * ifindexes. The setup and packet enqueue/send code is shared between the two
>> + * types of devmap; only the lookup and insertion is different.
>>   */
>>  #include <linux/bpf.h>
>>  #include <net/xdp.h>
>> @@ -59,6 +65,7 @@ struct xdp_bulk_queue {
>>
>>  struct bpf_dtab_netdev {
>>         struct net_device *dev; /* must be first member, due to tracepoint */
>> +       struct hlist_node index_hlist;
>>         struct bpf_dtab *dtab;
>>         unsigned int idx; /* keep track of map index for tracepoint */
>>         struct xdp_bulk_queue __percpu *bulkq;
>> @@ -70,11 +77,29 @@ struct bpf_dtab {
>>         struct bpf_dtab_netdev **netdev_map;
>>         struct list_head __percpu *flush_list;
>>         struct list_head list;
>> +
>> +       /* these are only used for DEVMAP_HASH type maps */
>> +       unsigned int items;
>> +       struct hlist_head *dev_index_head;
>> +       spinlock_t index_lock;
>>  };
>>
>>  static DEFINE_SPINLOCK(dev_map_lock);
>>  static LIST_HEAD(dev_map_list);
>>
>> +static struct hlist_head *dev_map_create_hash(void)
>> +{
>> +       int i;
>> +       struct hlist_head *hash;
>> +
>> +       hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
>> +       if (hash != NULL)
>> +               for (i = 0; i < NETDEV_HASHENTRIES; i++)
>> +                       INIT_HLIST_HEAD(&hash[i]);
>> +
>> +       return hash;
>> +}
>> +
>>  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>>                             bool check_memlock)
>>  {
>> @@ -98,6 +123,9 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>>         cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>>         cost += sizeof(struct list_head) * num_possible_cpus();
>>
>> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
>> +               cost += sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
>> +
>>         /* if map size is larger than memlock limit, reject it */
>>         err = bpf_map_charge_init(&dtab->map.memory, cost);
>>         if (err)
>> @@ -116,8 +144,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>>         if (!dtab->netdev_map)
>>                 goto free_percpu;
>>
>> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
>> +               dtab->dev_index_head = dev_map_create_hash();
>> +               if (!dtab->dev_index_head)
>> +                       goto free_map_area;
>> +
>> +               spin_lock_init(&dtab->index_lock);
>> +       }
>> +
>>         return 0;
>>
>> +free_map_area:
>> +       bpf_map_area_free(dtab->netdev_map);
>>  free_percpu:
>>         free_percpu(dtab->flush_list);
>>  free_charge:
>> @@ -199,6 +237,7 @@ static void dev_map_free(struct bpf_map *map)
>>
>>         free_percpu(dtab->flush_list);
>>         bpf_map_area_free(dtab->netdev_map);
>> +       kfree(dtab->dev_index_head);
>>         kfree(dtab);
>>  }
>>
>> @@ -219,6 +258,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>>         return 0;
>>  }
>>
>> +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
>> +                                                   int idx)
>> +{
>> +       return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
>> +}
>> +
>> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
>> +{
>> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> +       struct hlist_head *head = dev_map_index_hash(dtab, key);
>> +       struct bpf_dtab_netdev *dev;
>> +
>> +       hlist_for_each_entry_rcu(dev, head, index_hlist)
>> +               if (dev->idx == key)
>> +                       return dev;
>> +
>> +       return NULL;
>> +}
>> +
>> +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
>> +                                   void *next_key)
>> +{
>> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> +       u32 idx, *next = next_key;
>> +       struct bpf_dtab_netdev *dev, *next_dev;
>> +       struct hlist_head *head;
>> +       int i = 0;
>> +
>> +       if (!key)
>> +               goto find_first;
>> +
>> +       idx = *(u32 *)key;
>> +
>> +       dev = __dev_map_hash_lookup_elem(map, idx);
>> +       if (!dev)
>> +               goto find_first;
>> +
>> +       next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
>> +                                   struct bpf_dtab_netdev, index_hlist);
>
> Just want to get a better understanding. Why do you want
> hlist_entry_safe instead of hlist_entry?

Erm, because the entry might not exist? The _safe variant just checks
the list pointer before casting to the containing struct.
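
For reference, the _safe variant in include/linux/list.h is roughly:

#define hlist_entry_safe(ptr, type, member) \
	({ typeof(ptr) ____ptr = (ptr); \
	   ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
	})

i.e. it only does the container_of() cast when the node pointer is non-NULL,
which is what makes it usable on a possibly empty chain.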

> Also, maybe rcu_dereference instead of rcu_dereference_raw?

Well, the _raw() variant comes from the equivalent function in
hashtab.c, where I originally nicked most of this function, so I think it
makes more sense to keep them the same.
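
Concretely, the reviewer's suggestion would amount to something like the
following (illustrative only, not what the patch does); the practical
difference is that rcu_dereference() complains under CONFIG_PROVE_RCU when
called outside an RCU read-side critical section, while the _raw variant
skips that check:

	next_dev = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&dev->index_hlist)),
				    struct bpf_dtab_netdev, index_hlist);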

> dev_map_hash_get_next_key() is called in syscall.c within rcu_read_lock region.
>
>> +
>> +       if (next_dev) {
>> +               *next = next_dev->idx;
>> +               return 0;
>> +       }
>> +
>> +       i = idx & (NETDEV_HASHENTRIES - 1);
>> +       i++;
>> +
>> + find_first:
>> +       for (; i < NETDEV_HASHENTRIES; i++) {
>> +               head = dev_map_index_hash(dtab, i);
>> +
>> +               next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
>> +                                           struct bpf_dtab_netdev,
>> +                                           index_hlist);
>
> ditto. The same question as the above.
>
>> +               if (next_dev) {
>> +                       *next = next_dev->idx;
>> +                       return 0;
>> +               }
>> +       }
>> +
>> +       return -ENOENT;
>> +}
>> +
>>  static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
>>                        bool in_napi_ctx)
>>  {
>> @@ -374,6 +477,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
>>         return dev ? &dev->ifindex : NULL;
>>  }
>>
>> +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
>> +{
>> +       struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
>> +                                                               *(u32 *)key);
>> +       struct net_device *dev = obj ? obj->dev : NULL;
>
> When could obj->dev be NULL?

Never. This could just as well be written as

return obj ? &obj->dev->ifindex : NULL;

but writing it this way keeps it consistent with the existing
dev_map_lookup_elem() function.

>> +
>> +       return dev ? &dev->ifindex : NULL;
>> +}
>> +
>>  static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>>  {
>>         if (dev->dev->netdev_ops->ndo_xdp_xmit) {
>> @@ -423,6 +535,27 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
>>         return 0;
>>  }
>>
>> +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
>> +{
>> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> +       struct bpf_dtab_netdev *old_dev;
>> +       int k = *(u32 *)key;
>> +
>> +       old_dev = __dev_map_hash_lookup_elem(map, k);
>> +       if (!old_dev)
>> +               return 0;
>> +
>> +       spin_lock(&dtab->index_lock);
>> +       if (!hlist_unhashed(&old_dev->index_hlist)) {
>> +               dtab->items--;
>> +               hlist_del_init_rcu(&old_dev->index_hlist);
>> +       }
>> +       spin_unlock(&dtab->index_lock);
>> +
>> +       call_rcu(&old_dev->rcu, __dev_map_entry_free);
>> +       return 0;
>> +}
>> +
>>  static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
>>                                                     struct bpf_dtab *dtab,
>>                                                     u32 ifindex,
>> @@ -503,6 +636,55 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>>                                      map, key, value, map_flags);
>>  }
>>
>> +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
>> +                                    void *key, void *value, u64 map_flags)
>> +{
>> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> +       struct bpf_dtab_netdev *dev, *old_dev;
>> +       u32 ifindex = *(u32 *)value;
>> +       u32 idx = *(u32 *)key;
>> +
>> +       if (unlikely(map_flags > BPF_EXIST || !ifindex))
>> +               return -EINVAL;
>> +
>> +       old_dev = __dev_map_hash_lookup_elem(map, idx);
>> +       if (old_dev && (map_flags & BPF_NOEXIST))
>> +               return -EEXIST;
>> +
>> +       dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
>> +       if (IS_ERR(dev))
>> +               return PTR_ERR(dev);
>> +
>> +       spin_lock(&dtab->index_lock);
>> +
>> +       if (old_dev) {
>> +               hlist_del_rcu(&old_dev->index_hlist);
>> +       } else {
>> +               if (dtab->items >= dtab->map.max_entries) {
>> +                       spin_unlock(&dtab->index_lock);
>> +                       call_rcu(&dev->rcu, __dev_map_entry_free);
>> +                       return -ENOSPC;
>> +               }
>> +               dtab->items++;
>> +       }
>> +
>> +       hlist_add_head_rcu(&dev->index_hlist,
>> +                          dev_map_index_hash(dtab, idx));
>> +       spin_unlock(&dtab->index_lock);
>> +
>> +       if (old_dev)
>> +               call_rcu(&old_dev->rcu, __dev_map_entry_free);
>> +
>> +       return 0;
>> +}
>> +
>> +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
>> +                                  u64 map_flags)
>> +{
>> +       return __dev_map_hash_update_elem(current->nsproxy->net_ns,
>> +                                        map, key, value, map_flags);
>> +}
>> +
>>  const struct bpf_map_ops dev_map_ops = {
>>         .map_alloc = dev_map_alloc,
>>         .map_free = dev_map_free,
>> @@ -513,6 +695,16 @@ const struct bpf_map_ops dev_map_ops = {
>>         .map_check_btf = map_check_no_btf,
>>  };
>>
>> +const struct bpf_map_ops dev_map_hash_ops = {
>> +       .map_alloc = dev_map_alloc,
>> +       .map_free = dev_map_free,
>> +       .map_get_next_key = dev_map_hash_get_next_key,
>> +       .map_lookup_elem = dev_map_hash_lookup_elem,
>> +       .map_update_elem = dev_map_hash_update_elem,
>> +       .map_delete_elem = dev_map_hash_delete_elem,
>> +       .map_check_btf = map_check_no_btf,
>> +};
>> +
>>  static int dev_map_notification(struct notifier_block *notifier,
>>                                 ulong event, void *ptr)
>>  {
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index a2e763703c30..231b9e22827c 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -3460,6 +3460,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>>                         goto error;
>>                 break;
>>         case BPF_MAP_TYPE_DEVMAP:
>> +       case BPF_MAP_TYPE_DEVMAP_HASH:
>>                 if (func_id != BPF_FUNC_redirect_map &&
>>                     func_id != BPF_FUNC_map_lookup_elem)
>>                         goto error;
>> @@ -3542,6 +3543,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>>                 break;
>>         case BPF_FUNC_redirect_map:
>>                 if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
>> +                   map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
>>                     map->map_type != BPF_MAP_TYPE_CPUMAP &&
>>                     map->map_type != BPF_MAP_TYPE_XSKMAP)
>>                         goto error;
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 089aaea0ccc6..276961c4e0d4 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -3517,7 +3517,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>>         int err;
>>
>>         switch (map->map_type) {
>> -       case BPF_MAP_TYPE_DEVMAP: {
>> +       case BPF_MAP_TYPE_DEVMAP:
>> +       case BPF_MAP_TYPE_DEVMAP_HASH: {
>>                 struct bpf_dtab_netdev *dst = fwd;
>>
>>                 err = dev_map_enqueue(dst, xdp, dev_rx);
>> @@ -3554,6 +3555,7 @@ void xdp_do_flush_map(void)
>>         if (map) {
>>                 switch (map->map_type) {
>>                 case BPF_MAP_TYPE_DEVMAP:
>> +               case BPF_MAP_TYPE_DEVMAP_HASH:
>>                         __dev_map_flush(map);
>>                         break;
>>                 case BPF_MAP_TYPE_CPUMAP:
>> @@ -3574,6 +3576,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
>>         switch (map->map_type) {
>>         case BPF_MAP_TYPE_DEVMAP:
>>                 return __dev_map_lookup_elem(map, index);
>> +       case BPF_MAP_TYPE_DEVMAP_HASH:
>> +               return __dev_map_hash_lookup_elem(map, index);
>>         case BPF_MAP_TYPE_CPUMAP:
>>                 return __cpu_map_lookup_elem(map, index);
>>         case BPF_MAP_TYPE_XSKMAP:
>> @@ -3655,7 +3659,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
>>         ri->tgt_value = NULL;
>>         WRITE_ONCE(ri->map, NULL);
>>
>> -       if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
>> +       if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
>> +           map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
>>                 struct bpf_dtab_netdev *dst = fwd;
>>
>>                 err = dev_map_generic_redirect(dst, skb, xdp_prog);
>>
Y Song July 8, 2019, 3:01 p.m. UTC | #3
On Mon, Jul 8, 2019 at 2:55 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Y Song <ys114321@gmail.com> writes:
>
> > On Sat, Jul 6, 2019 at 1:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> From: Toke Høiland-Jørgensen <toke@redhat.com>
> >>
> >> A common pattern when using xdp_redirect_map() is to create a device map
> >> where the lookup key is simply ifindex. Because device maps are arrays,
> >> this leaves holes in the map, and the map has to be sized to fit the
> >> largest ifindex, regardless of how many devices are actually needed in
> >> the map.
> >>
> >> This patch adds a second type of device map where the key is looked up
> >> using a hashmap, instead of being used as an array index. This allows maps
> >> to be densely packed, so they can be smaller.
> >>
> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> >> ---
> >>  include/linux/bpf.h        |    7 ++
> >>  include/linux/bpf_types.h  |    1
> >>  include/trace/events/xdp.h |    3 -
> >>  kernel/bpf/devmap.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
> >>  kernel/bpf/verifier.c      |    2
> >>  net/core/filter.c          |    9 ++
> >>  6 files changed, 211 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index bfdb54dd2ad1..f9a506147c8a 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -713,6 +713,7 @@ struct xdp_buff;
> >>  struct sk_buff;
> >>
> >>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> >> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
> >>  void __dev_map_flush(struct bpf_map *map);
> >>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> >>                     struct net_device *dev_rx);
> >> @@ -799,6 +800,12 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
> >>         return NULL;
> >>  }
> >>
> >> +static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
> >> +                                                            u32 key)
> >> +{
> >> +       return NULL;
> >> +}
> >> +
> >>  static inline void __dev_map_flush(struct bpf_map *map)
> >>  {
> >>  }
> >> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> >> index eec5aeeeaf92..36a9c2325176 100644
> >> --- a/include/linux/bpf_types.h
> >> +++ b/include/linux/bpf_types.h
> >> @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
> >>  #ifdef CONFIG_NET
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
> >> +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
> >>  #if defined(CONFIG_BPF_STREAM_PARSER)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
> >> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> >> index 68899fdc985b..8c8420230a10 100644
> >> --- a/include/trace/events/xdp.h
> >> +++ b/include/trace/events/xdp.h
> >> @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
> >>  #endif /* __DEVMAP_OBJ_TYPE */
> >>
> >>  #define devmap_ifindex(fwd, map)                               \
> >> -       ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?               \
> >> +       ((map->map_type == BPF_MAP_TYPE_DEVMAP ||               \
> >> +         map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ?          \
> >>           ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
> >>
> >>  #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)               \
> >> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> >> index a2fe16362129..341af02f049d 100644
> >> --- a/kernel/bpf/devmap.c
> >> +++ b/kernel/bpf/devmap.c
> >> @@ -37,6 +37,12 @@
> >>   * notifier hook walks the map we know that new dev references can not be
> >>   * added by the user because core infrastructure ensures dev_get_by_index()
> >>   * calls will fail at this point.
> >> + *
> >> + * The devmap_hash type is a map type which interprets keys as ifindexes and
> >> + * indexes these using a hashmap. This allows maps that use ifindex as key to be
> >> + * densely packed instead of having holes in the lookup array for unused
> >> + * ifindexes. The setup and packet enqueue/send code is shared between the two
> >> + * types of devmap; only the lookup and insertion is different.
> >>   */
> >>  #include <linux/bpf.h>
> >>  #include <net/xdp.h>
> >> @@ -59,6 +65,7 @@ struct xdp_bulk_queue {
> >>
> >>  struct bpf_dtab_netdev {
> >>         struct net_device *dev; /* must be first member, due to tracepoint */
> >> +       struct hlist_node index_hlist;
> >>         struct bpf_dtab *dtab;
> >>         unsigned int idx; /* keep track of map index for tracepoint */
> >>         struct xdp_bulk_queue __percpu *bulkq;
> >> @@ -70,11 +77,29 @@ struct bpf_dtab {
> >>         struct bpf_dtab_netdev **netdev_map;
> >>         struct list_head __percpu *flush_list;
> >>         struct list_head list;
> >> +
> >> +       /* these are only used for DEVMAP_HASH type maps */
> >> +       unsigned int items;
> >> +       struct hlist_head *dev_index_head;
> >> +       spinlock_t index_lock;
> >>  };
> >>
> >>  static DEFINE_SPINLOCK(dev_map_lock);
> >>  static LIST_HEAD(dev_map_list);
> >>
> >> +static struct hlist_head *dev_map_create_hash(void)
> >> +{
> >> +       int i;
> >> +       struct hlist_head *hash;
> >> +
> >> +       hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
> >> +       if (hash != NULL)
> >> +               for (i = 0; i < NETDEV_HASHENTRIES; i++)
> >> +                       INIT_HLIST_HEAD(&hash[i]);
> >> +
> >> +       return hash;
> >> +}
> >> +
> >>  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>                             bool check_memlock)
> >>  {
> >> @@ -98,6 +123,9 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>         cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
> >>         cost += sizeof(struct list_head) * num_possible_cpus();
> >>
> >> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
> >> +               cost += sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
> >> +
> >>         /* if map size is larger than memlock limit, reject it */
> >>         err = bpf_map_charge_init(&dtab->map.memory, cost);
> >>         if (err)
> >> @@ -116,8 +144,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>         if (!dtab->netdev_map)
> >>                 goto free_percpu;
> >>
> >> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
> >> +               dtab->dev_index_head = dev_map_create_hash();
> >> +               if (!dtab->dev_index_head)
> >> +                       goto free_map_area;
> >> +
> >> +               spin_lock_init(&dtab->index_lock);
> >> +       }
> >> +
> >>         return 0;
> >>
> >> +free_map_area:
> >> +       bpf_map_area_free(dtab->netdev_map);
> >>  free_percpu:
> >>         free_percpu(dtab->flush_list);
> >>  free_charge:
> >> @@ -199,6 +237,7 @@ static void dev_map_free(struct bpf_map *map)
> >>
> >>         free_percpu(dtab->flush_list);
> >>         bpf_map_area_free(dtab->netdev_map);
> >> +       kfree(dtab->dev_index_head);
> >>         kfree(dtab);
> >>  }
> >>
> >> @@ -219,6 +258,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> >>         return 0;
> >>  }
> >>
> >> +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
> >> +                                                   int idx)
> >> +{
> >> +       return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
> >> +}
> >> +
> >> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
> >> +{
> >> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> >> +       struct hlist_head *head = dev_map_index_hash(dtab, key);
> >> +       struct bpf_dtab_netdev *dev;
> >> +
> >> +       hlist_for_each_entry_rcu(dev, head, index_hlist)
> >> +               if (dev->idx == key)
> >> +                       return dev;
> >> +
> >> +       return NULL;
> >> +}
> >> +
> >> +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
> >> +                                   void *next_key)
> >> +{
> >> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> >> +       u32 idx, *next = next_key;
> >> +       struct bpf_dtab_netdev *dev, *next_dev;
> >> +       struct hlist_head *head;
> >> +       int i = 0;
> >> +
> >> +       if (!key)
> >> +               goto find_first;
> >> +
> >> +       idx = *(u32 *)key;
> >> +
> >> +       dev = __dev_map_hash_lookup_elem(map, idx);
> >> +       if (!dev)
> >> +               goto find_first;
> >> +
> >> +       next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
> >> +                                   struct bpf_dtab_netdev, index_hlist);
> >
> > Just want to get a better understanding. Why do you want
> > hlist_entry_safe instead of hlist_entry?
>
> Erm, because the entry might not exist? The _safe variant just checks
> the list pointer before casting to the containing struct.

hlist_entry_safe is certainly correct. I just wanted to better understand when
the entry might not exist. Looking at the hashtab.c implementation, it makes
the same possible-null-entry assumption, so we should be OK here.

>
> > Also, maybe rcu_dereference instead of rcu_dereference_raw?
>
> Well, the _raw() variant comes from the equivalent function in
> hashtab.c, where I originally nicked most of this function, so I think it
> makes more sense to keep them the same.

Okay.

>
> > dev_map_hash_get_next_key() is called in syscall.c within rcu_read_lock region.
> >
> >> +
> >> +       if (next_dev) {
> >> +               *next = next_dev->idx;
> >> +               return 0;
> >> +       }
> >> +
> >> +       i = idx & (NETDEV_HASHENTRIES - 1);
> >> +       i++;
> >> +
> >> + find_first:
> >> +       for (; i < NETDEV_HASHENTRIES; i++) {
> >> +               head = dev_map_index_hash(dtab, i);
> >> +
> >> +               next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
> >> +                                           struct bpf_dtab_netdev,
> >> +                                           index_hlist);
> >
> > ditto. The same question as the above.
> >
> >> +               if (next_dev) {
> >> +                       *next = next_dev->idx;
> >> +                       return 0;
> >> +               }
> >> +       }
> >> +
> >> +       return -ENOENT;
> >> +}
> >> +
> >>  static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
> >>                        bool in_napi_ctx)
> >>  {
> >> @@ -374,6 +477,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
> >>         return dev ? &dev->ifindex : NULL;
> >>  }
> >>
> >> +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
> >> +{
> >> +       struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
> >> +                                                               *(u32 *)key);
> >> +       struct net_device *dev = obj ? obj->dev : NULL;
> >
> > When could obj->dev be NULL?
>
> Never. This could just as well be written as
>
> return obj ? &obj->dev->ifindex : NULL;
>
> but writing it this way keeps it consistent with the existing
> dev_map_lookup_elem() function.

Okay then, keeping it this way to stay consistent with the existing code base makes sense.

Thanks for all the explanation!
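
To make the get_next_key semantics above concrete, a user-space walk of a
DEVMAP_HASH map could look like the following minimal sketch (using libbpf's
low-level bpf_map_get_next_key()/bpf_map_lookup_elem() wrappers; the
dump_devmap_hash() name is made up and error handling is omitted):

#include <bpf/bpf.h>
#include <linux/types.h>
#include <stdio.h>

static void dump_devmap_hash(int map_fd)
{
	__u32 cur, next, ifindex;
	__u32 *key = NULL;	/* NULL key: kernel returns the first key */

	while (bpf_map_get_next_key(map_fd, key, &next) == 0) {
		if (bpf_map_lookup_elem(map_fd, &next, &ifindex) == 0)
			printf("key %u -> ifindex %u\n", next, ifindex);

		/* If 'cur' is deleted between calls, iteration restarts
		 * from the first non-empty hash bucket, as discussed above.
		 */
		cur = next;
		key = &cur;
	}
}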

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bfdb54dd2ad1..f9a506147c8a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -713,6 +713,7 @@  struct xdp_buff;
 struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_flush(struct bpf_map *map);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
@@ -799,6 +800,12 @@  static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 	return NULL;
 }
 
+static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
+							     u32 key)
+{
+	return NULL;
+}
+
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index eec5aeeeaf92..36a9c2325176 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -62,6 +62,7 @@  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 #if defined(CONFIG_BPF_STREAM_PARSER)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 68899fdc985b..8c8420230a10 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -175,7 +175,8 @@  struct _bpf_dtab_netdev {
 #endif /* __DEVMAP_OBJ_TYPE */
 
 #define devmap_ifindex(fwd, map)				\
-	((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	((map->map_type == BPF_MAP_TYPE_DEVMAP ||		\
+	  map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ?		\
 	  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a2fe16362129..341af02f049d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -37,6 +37,12 @@ 
  * notifier hook walks the map we know that new dev references can not be
  * added by the user because core infrastructure ensures dev_get_by_index()
  * calls will fail at this point.
+ *
+ * The devmap_hash type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
 #include <net/xdp.h>
@@ -59,6 +65,7 @@  struct xdp_bulk_queue {
 
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
+	struct hlist_node index_hlist;
 	struct bpf_dtab *dtab;
 	unsigned int idx; /* keep track of map index for tracepoint */
 	struct xdp_bulk_queue __percpu *bulkq;
@@ -70,11 +77,29 @@  struct bpf_dtab {
 	struct bpf_dtab_netdev **netdev_map;
 	struct list_head __percpu *flush_list;
 	struct list_head list;
+
+	/* these are only used for DEVMAP_HASH type maps */
+	unsigned int items;
+	struct hlist_head *dev_index_head;
+	spinlock_t index_lock;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
+static struct hlist_head *dev_map_create_hash(void)
+{
+	int i;
+	struct hlist_head *hash;
+
+	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
+	if (hash != NULL)
+		for (i = 0; i < NETDEV_HASHENTRIES; i++)
+			INIT_HLIST_HEAD(&hash[i]);
+
+	return hash;
+}
+
 static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
 			    bool check_memlock)
 {
@@ -98,6 +123,9 @@  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += sizeof(struct list_head) * num_possible_cpus();
 
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+		cost += sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
+
 	/* if map size is larger than memlock limit, reject it */
 	err = bpf_map_charge_init(&dtab->map.memory, cost);
 	if (err)
@@ -116,8 +144,18 @@  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
 	if (!dtab->netdev_map)
 		goto free_percpu;
 
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+		dtab->dev_index_head = dev_map_create_hash();
+		if (!dtab->dev_index_head)
+			goto free_map_area;
+
+		spin_lock_init(&dtab->index_lock);
+	}
+
 	return 0;
 
+free_map_area:
+	bpf_map_area_free(dtab->netdev_map);
 free_percpu:
 	free_percpu(dtab->flush_list);
 free_charge:
@@ -199,6 +237,7 @@  static void dev_map_free(struct bpf_map *map)
 
 	free_percpu(dtab->flush_list);
 	bpf_map_area_free(dtab->netdev_map);
+	kfree(dtab->dev_index_head);
 	kfree(dtab);
 }
 
@@ -219,6 +258,70 @@  static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+						    int idx)
+{
+	return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
+}
+
+struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct hlist_head *head = dev_map_index_hash(dtab, key);
+	struct bpf_dtab_netdev *dev;
+
+	hlist_for_each_entry_rcu(dev, head, index_hlist)
+		if (dev->idx == key)
+			return dev;
+
+	return NULL;
+}
+
+static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
+				    void *next_key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	u32 idx, *next = next_key;
+	struct bpf_dtab_netdev *dev, *next_dev;
+	struct hlist_head *head;
+	int i = 0;
+
+	if (!key)
+		goto find_first;
+
+	idx = *(u32 *)key;
+
+	dev = __dev_map_hash_lookup_elem(map, idx);
+	if (!dev)
+		goto find_first;
+
+	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+				    struct bpf_dtab_netdev, index_hlist);
+
+	if (next_dev) {
+		*next = next_dev->idx;
+		return 0;
+	}
+
+	i = idx & (NETDEV_HASHENTRIES - 1);
+	i++;
+
+ find_first:
+	for (; i < NETDEV_HASHENTRIES; i++) {
+		head = dev_map_index_hash(dtab, i);
+
+		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					    struct bpf_dtab_netdev,
+					    index_hlist);
+		if (next_dev) {
+			*next = next_dev->idx;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
 static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
 		       bool in_napi_ctx)
 {
@@ -374,6 +477,15 @@  static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 	return dev ? &dev->ifindex : NULL;
 }
 
+static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
+								*(u32 *)key);
+	struct net_device *dev = obj ? obj->dev : NULL;
+
+	return dev ? &dev->ifindex : NULL;
+}
+
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
 	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
@@ -423,6 +535,27 @@  static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
+static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *old_dev;
+	int k = *(u32 *)key;
+
+	old_dev = __dev_map_hash_lookup_elem(map, k);
+	if (!old_dev)
+		return 0;
+
+	spin_lock(&dtab->index_lock);
+	if (!hlist_unhashed(&old_dev->index_hlist)) {
+		dtab->items--;
+		hlist_del_init_rcu(&old_dev->index_hlist);
+	}
+	spin_unlock(&dtab->index_lock);
+
+	call_rcu(&old_dev->rcu, __dev_map_entry_free);
+	return 0;
+}
+
 static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 						    struct bpf_dtab *dtab,
 						    u32 ifindex,
@@ -503,6 +636,55 @@  static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 				     map, key, value, map_flags);
 }
 
+static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+				     void *key, void *value, u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *dev, *old_dev;
+	u32 ifindex = *(u32 *)value;
+	u32 idx = *(u32 *)key;
+
+	if (unlikely(map_flags > BPF_EXIST || !ifindex))
+		return -EINVAL;
+
+	old_dev = __dev_map_hash_lookup_elem(map, idx);
+	if (old_dev && (map_flags & BPF_NOEXIST))
+		return -EEXIST;
+
+	dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	spin_lock(&dtab->index_lock);
+
+	if (old_dev) {
+		hlist_del_rcu(&old_dev->index_hlist);
+	} else {
+		if (dtab->items >= dtab->map.max_entries) {
+			spin_unlock(&dtab->index_lock);
+			call_rcu(&dev->rcu, __dev_map_entry_free);
+			return -ENOSPC;
+		}
+		dtab->items++;
+	}
+
+	hlist_add_head_rcu(&dev->index_hlist,
+			   dev_map_index_hash(dtab, idx));
+	spin_unlock(&dtab->index_lock);
+
+	if (old_dev)
+		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+	return 0;
+}
+
+static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+				   u64 map_flags)
+{
+	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
+					 map, key, value, map_flags);
+}
+
 const struct bpf_map_ops dev_map_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,
@@ -513,6 +695,16 @@  const struct bpf_map_ops dev_map_ops = {
 	.map_check_btf = map_check_no_btf,
 };
 
+const struct bpf_map_ops dev_map_hash_ops = {
+	.map_alloc = dev_map_alloc,
+	.map_free = dev_map_free,
+	.map_get_next_key = dev_map_hash_get_next_key,
+	.map_lookup_elem = dev_map_hash_lookup_elem,
+	.map_update_elem = dev_map_hash_update_elem,
+	.map_delete_elem = dev_map_hash_delete_elem,
+	.map_check_btf = map_check_no_btf,
+};
+
 static int dev_map_notification(struct notifier_block *notifier,
 				ulong event, void *ptr)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2e763703c30..231b9e22827c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3460,6 +3460,7 @@  static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_HASH:
 		if (func_id != BPF_FUNC_redirect_map &&
 		    func_id != BPF_FUNC_map_lookup_elem)
 			goto error;
@@ -3542,6 +3543,7 @@  static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_redirect_map:
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+		    map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
 		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
 		    map->map_type != BPF_MAP_TYPE_XSKMAP)
 			goto error;
diff --git a/net/core/filter.c b/net/core/filter.c
index 089aaea0ccc6..276961c4e0d4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3517,7 +3517,8 @@  static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	int err;
 
 	switch (map->map_type) {
-	case BPF_MAP_TYPE_DEVMAP: {
+	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_HASH: {
 		struct bpf_dtab_netdev *dst = fwd;
 
 		err = dev_map_enqueue(dst, xdp, dev_rx);
@@ -3554,6 +3555,7 @@  void xdp_do_flush_map(void)
 	if (map) {
 		switch (map->map_type) {
 		case BPF_MAP_TYPE_DEVMAP:
+		case BPF_MAP_TYPE_DEVMAP_HASH:
 			__dev_map_flush(map);
 			break;
 		case BPF_MAP_TYPE_CPUMAP:
@@ -3574,6 +3576,8 @@  static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
 	switch (map->map_type) {
 	case BPF_MAP_TYPE_DEVMAP:
 		return __dev_map_lookup_elem(map, index);
+	case BPF_MAP_TYPE_DEVMAP_HASH:
+		return __dev_map_hash_lookup_elem(map, index);
 	case BPF_MAP_TYPE_CPUMAP:
 		return __cpu_map_lookup_elem(map, index);
 	case BPF_MAP_TYPE_XSKMAP:
@@ -3655,7 +3659,8 @@  static int xdp_do_generic_redirect_map(struct net_device *dev,
 	ri->tgt_value = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
-	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
+	    map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
 		struct bpf_dtab_netdev *dst = fwd;
 
 		err = dev_map_generic_redirect(dst, skb, xdp_prog);