Message ID | 20180423135619.7179-3-bjorn.topel@gmail.com |
---|---|
State | Changes Requested, archived |
Delegated to: | BPF Maintainers |
Headers | show |
Series | Introducing AF_XDP support | expand |
On Mon, Apr 23, 2018 at 03:56:06PM +0200, Björn Töpel wrote: > From: Björn Töpel <bjorn.topel@intel.com> > > In this commit the base structure of the AF_XDP address family is set > up. Further, we introduce the abilty register a window of user memory > to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory > window is viewed by an AF_XDP socket as a set of equally large > frames. After a user memory registration all frames are "owned" by the > user application, and not the kernel. > > Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com> > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com> > Signed-off-by: Björn Töpel <bjorn.topel@intel.com> > --- > include/uapi/linux/if_xdp.h | 34 +++++++ > net/Makefile | 1 + > net/xdp/Makefile | 2 + > net/xdp/xdp_umem.c | 237 ++++++++++++++++++++++++++++++++++++++++++++ > net/xdp/xdp_umem.h | 42 ++++++++ > net/xdp/xdp_umem_props.h | 23 +++++ > net/xdp/xsk.c | 223 +++++++++++++++++++++++++++++++++++++++++ > 7 files changed, 562 insertions(+) > create mode 100644 include/uapi/linux/if_xdp.h > create mode 100644 net/xdp/Makefile > create mode 100644 net/xdp/xdp_umem.c > create mode 100644 net/xdp/xdp_umem.h > create mode 100644 net/xdp/xdp_umem_props.h > create mode 100644 net/xdp/xsk.c > > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h > new file mode 100644 > index 000000000000..41252135a0fe > --- /dev/null > +++ b/include/uapi/linux/if_xdp.h > @@ -0,0 +1,34 @@ > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note > + * > + * if_xdp: XDP socket user-space interface > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. 
> + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * Author(s): Björn Töpel <bjorn.topel@intel.com> > + * Magnus Karlsson <magnus.karlsson@intel.com> > + */ > + > +#ifndef _LINUX_IF_XDP_H > +#define _LINUX_IF_XDP_H > + > +#include <linux/types.h> > + > +/* XDP socket options */ > +#define XDP_UMEM_REG 3 > + > +struct xdp_umem_reg { > + __u64 addr; /* Start of packet data area */ > + __u64 len; /* Length of packet data area */ > + __u32 frame_size; /* Frame size */ > + __u32 frame_headroom; /* Frame head room */ > +}; > + > +#endif /* _LINUX_IF_XDP_H */ > diff --git a/net/Makefile b/net/Makefile > index a6147c61b174..77aaddedbd29 100644 > --- a/net/Makefile > +++ b/net/Makefile > @@ -85,3 +85,4 @@ obj-y += l3mdev/ > endif > obj-$(CONFIG_QRTR) += qrtr/ > obj-$(CONFIG_NET_NCSI) += ncsi/ > +obj-$(CONFIG_XDP_SOCKETS) += xdp/ > diff --git a/net/xdp/Makefile b/net/xdp/Makefile > new file mode 100644 > index 000000000000..a5d736640a0f > --- /dev/null > +++ b/net/xdp/Makefile > @@ -0,0 +1,2 @@ > +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o > + > diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c > new file mode 100644 > index 000000000000..bff058f5a769 > --- /dev/null > +++ b/net/xdp/xdp_umem.c > @@ -0,0 +1,237 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* XDP user-space packet buffer > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for > + * more details. > + */ > + > +#include <linux/init.h> > +#include <linux/sched/mm.h> > +#include <linux/sched/signal.h> > +#include <linux/sched/task.h> > +#include <linux/uaccess.h> > +#include <linux/slab.h> > +#include <linux/bpf.h> > +#include <linux/mm.h> > + > +#include "xdp_umem.h" > + > +#define XDP_UMEM_MIN_FRAME_SIZE 2048 > + > +int xdp_umem_create(struct xdp_umem **umem) > +{ > + *umem = kzalloc(sizeof(**umem), GFP_KERNEL); > + > + if (!(*umem)) > + return -ENOMEM; > + > + return 0; > +} > + > +static void xdp_umem_unpin_pages(struct xdp_umem *umem) > +{ > + unsigned int i; > + > + if (umem->pgs) { > + for (i = 0; i < umem->npgs; i++) Since you pin them with FOLL_WRITE, I assume these pages are written to. Don't you need set_page_dirty_lock here? > + put_page(umem->pgs[i]); > + > + kfree(umem->pgs); > + umem->pgs = NULL; > + } > +} > + > +static void xdp_umem_unaccount_pages(struct xdp_umem *umem) > +{ > + if (umem->user) { > + atomic_long_sub(umem->npgs, &umem->user->locked_vm); > + free_uid(umem->user); > + } > +} > + > +static void xdp_umem_release(struct xdp_umem *umem) > +{ > + struct task_struct *task; > + struct mm_struct *mm; > + unsigned long diff; > + > + if (umem->pgs) { > + xdp_umem_unpin_pages(umem); > + > + task = get_pid_task(umem->pid, PIDTYPE_PID); > + put_pid(umem->pid); > + if (!task) > + goto out; > + mm = get_task_mm(task); > + put_task_struct(task); > + if (!mm) > + goto out; > + > + diff = umem->size >> PAGE_SHIFT; > + > + down_write(&mm->mmap_sem); > + mm->pinned_vm -= diff; > + up_write(&mm->mmap_sem); > + mmput(mm); > + umem->pgs = NULL; > + } > + > + xdp_umem_unaccount_pages(umem); > +out: > + kfree(umem); > +} > + > +void xdp_put_umem(struct xdp_umem *umem) > +{ > + if (!umem) > + return; > + > + if (atomic_dec_and_test(&umem->users)) > + xdp_umem_release(umem); > +} > + > +static int xdp_umem_pin_pages(struct xdp_umem *umem) > +{ > + unsigned int gup_flags = FOLL_WRITE; > + long 
npgs; > + int err; > + > + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); > + if (!umem->pgs) > + return -ENOMEM; > + > + npgs = get_user_pages(umem->address, umem->npgs, > + gup_flags, &umem->pgs[0], NULL); > + if (npgs != umem->npgs) { > + if (npgs >= 0) { > + umem->npgs = npgs; > + err = -ENOMEM; > + goto out_pin; > + } > + err = npgs; > + goto out_pgs; > + } > + return 0; > + > +out_pin: > + xdp_umem_unpin_pages(umem); > +out_pgs: > + kfree(umem->pgs); > + umem->pgs = NULL; > + return err; > +} > + > +static int xdp_umem_account_pages(struct xdp_umem *umem) > +{ > + unsigned long lock_limit, new_npgs, old_npgs; > + > + if (capable(CAP_IPC_LOCK)) > + return 0; > + > + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > + umem->user = get_uid(current_user()); > + > + do { > + old_npgs = atomic_long_read(&umem->user->locked_vm); > + new_npgs = old_npgs + umem->npgs; > + if (new_npgs > lock_limit) { > + free_uid(umem->user); > + umem->user = NULL; > + return -ENOBUFS; > + } > + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, > + new_npgs) != old_npgs); > + return 0; > +} > + > +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) > +{ > + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; > + u64 addr = mr->addr, size = mr->len; > + unsigned int nframes; > + int size_chk, err; > + > + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { > + /* Strictly speaking we could support this, if: > + * - huge pages, or* what does "or*" here mean? > + * - using an IOMMU, or > + * - making sure the memory area is consecutive > + * but for now, we simply say "computer says no". > + */ > + return -EINVAL; > + } > + > + if (!is_power_of_2(frame_size)) > + return -EINVAL; > + > + if (!PAGE_ALIGNED(addr)) { > + /* Memory area has to be page size aligned. For > + * simplicity, this might change. 
> + */ > + return -EINVAL; > + } > + > + if ((addr + size) < addr) > + return -EINVAL; > + > + nframes = size / frame_size; > + if (nframes == 0 || nframes > UINT_MAX) > + return -EINVAL; > + > + frame_headroom = ALIGN(frame_headroom, 64); > + > + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; > + if (size_chk < 0) > + return -EINVAL; > + > + umem->pid = get_task_pid(current, PIDTYPE_PID); > + umem->size = (size_t)size; > + umem->address = (unsigned long)addr; > + umem->props.frame_size = frame_size; > + umem->props.nframes = nframes; > + umem->frame_headroom = frame_headroom; > + umem->npgs = size / PAGE_SIZE; > + umem->pgs = NULL; > + umem->user = NULL; > + > + umem->frame_size_log2 = ilog2(frame_size); > + umem->nfpp_mask = (PAGE_SIZE / frame_size) - 1; > + umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size); > + atomic_set(&umem->users, 1); > + > + err = xdp_umem_account_pages(umem); > + if (err) > + goto out; > + > + err = xdp_umem_pin_pages(umem); > + if (err) > + goto out; > + return 0; > + > +out: > + put_pid(umem->pid); > + return err; > +} > + > +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) > +{ > + int err; > + > + if (!umem) > + return -EINVAL; > + > + down_write(¤t->mm->mmap_sem); > + > + err = __xdp_umem_reg(umem, mr); > + > + up_write(¤t->mm->mmap_sem); > + return err; > +} > + > diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h > new file mode 100644 > index 000000000000..58714f4f7f25 > --- /dev/null > +++ b/net/xdp/xdp_umem.h > @@ -0,0 +1,42 @@ > +/* SPDX-License-Identifier: GPL-2.0 > + * XDP user-space packet buffer > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. 
> + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + */ > + > +#ifndef XDP_UMEM_H_ > +#define XDP_UMEM_H_ > + > +#include <linux/mm.h> > +#include <linux/if_xdp.h> > + > +#include "xdp_umem_props.h" > + > +struct xdp_umem { > + struct page **pgs; > + struct xdp_umem_props props; > + u32 npgs; > + u32 frame_headroom; > + u32 nfpp_mask; > + u32 nfpplog2; > + u32 frame_size_log2; > + struct user_struct *user; > + struct pid *pid; > + unsigned long address; > + size_t size; > + atomic_t users; > +}; > + > +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); > +void xdp_put_umem(struct xdp_umem *umem); > +int xdp_umem_create(struct xdp_umem **umem); > + > +#endif /* XDP_UMEM_H_ */ > diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h > new file mode 100644 > index 000000000000..77fb5daf29f3 > --- /dev/null > +++ b/net/xdp/xdp_umem_props.h > @@ -0,0 +1,23 @@ > +/* SPDX-License-Identifier: GPL-2.0 > + * XDP user-space packet buffer > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. 
> + */ > + > +#ifndef XDP_UMEM_PROPS_H_ > +#define XDP_UMEM_PROPS_H_ > + > +struct xdp_umem_props { > + u32 frame_size; > + u32 nframes; > +}; > + > +#endif /* XDP_UMEM_PROPS_H_ */ > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c > new file mode 100644 > index 000000000000..19fc719cbe0d > --- /dev/null > +++ b/net/xdp/xsk.c > @@ -0,0 +1,223 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* XDP sockets > + * > + * AF_XDP sockets allows a channel between XDP programs and userspace > + * applications. > + * Copyright(c) 2018 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. 
> + * > + * Author(s): Björn Töpel <bjorn.topel@intel.com> > + * Magnus Karlsson <magnus.karlsson@intel.com> > + */ > + > +#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ > + > +#include <linux/if_xdp.h> > +#include <linux/init.h> > +#include <linux/sched/mm.h> > +#include <linux/sched/signal.h> > +#include <linux/sched/task.h> > +#include <linux/socket.h> > +#include <linux/file.h> > +#include <linux/uaccess.h> > +#include <linux/net.h> > +#include <linux/netdevice.h> > +#include <net/sock.h> > + > +#include "xdp_umem.h" > + > +struct xdp_sock { > + /* struct sock must be the first member of struct xdp_sock */ > + struct sock sk; > + struct xdp_umem *umem; > + /* Protects multiple processes in the control path */ > + struct mutex mutex; > +}; > + > +static struct xdp_sock *xdp_sk(struct sock *sk) > +{ > + return (struct xdp_sock *)sk; > +} > + > +static int xsk_release(struct socket *sock) > +{ > + struct sock *sk = sock->sk; > + struct net *net; > + > + if (!sk) > + return 0; > + > + net = sock_net(sk); > + > + local_bh_disable(); > + sock_prot_inuse_add(net, sk->sk_prot, -1); > + local_bh_enable(); > + > + sock_orphan(sk); > + sock->sk = NULL; > + > + sk_refcnt_debug_release(sk); > + sock_put(sk); > + > + return 0; > +} > + > +static int xsk_setsockopt(struct socket *sock, int level, int optname, > + char __user *optval, unsigned int optlen) > +{ > + struct sock *sk = sock->sk; > + struct xdp_sock *xs = xdp_sk(sk); > + int err; > + > + if (level != SOL_XDP) > + return -ENOPROTOOPT; > + > + switch (optname) { > + case XDP_UMEM_REG: > + { > + struct xdp_umem_reg mr; > + struct xdp_umem *umem; > + > + if (xs->umem) > + return -EBUSY; > + > + if (copy_from_user(&mr, optval, sizeof(mr))) > + return -EFAULT; > + > + mutex_lock(&xs->mutex); > + err = xdp_umem_create(&umem); > + > + err = xdp_umem_reg(umem, &mr); > + if (err) { > + kfree(umem); > + mutex_unlock(&xs->mutex); > + return err; > + } > + > + /* Make sure umem is ready before it can be seen by others */ > + 
smp_wmb(); > + > + xs->umem = umem; > + mutex_unlock(&xs->mutex); > + return 0; > + } > + default: > + break; > + } > + > + return -ENOPROTOOPT; > +} > + > +static struct proto xsk_proto = { > + .name = "XDP", > + .owner = THIS_MODULE, > + .obj_size = sizeof(struct xdp_sock), > +}; > + > +static const struct proto_ops xsk_proto_ops = { > + .family = PF_XDP, > + .owner = THIS_MODULE, > + .release = xsk_release, > + .bind = sock_no_bind, > + .connect = sock_no_connect, > + .socketpair = sock_no_socketpair, > + .accept = sock_no_accept, > + .getname = sock_no_getname, > + .poll = sock_no_poll, > + .ioctl = sock_no_ioctl, > + .listen = sock_no_listen, > + .shutdown = sock_no_shutdown, > + .setsockopt = xsk_setsockopt, > + .getsockopt = sock_no_getsockopt, > + .sendmsg = sock_no_sendmsg, > + .recvmsg = sock_no_recvmsg, > + .mmap = sock_no_mmap, > + .sendpage = sock_no_sendpage, > +}; > + > +static void xsk_destruct(struct sock *sk) > +{ > + struct xdp_sock *xs = xdp_sk(sk); > + > + if (!sock_flag(sk, SOCK_DEAD)) > + return; > + > + xdp_put_umem(xs->umem); > + > + sk_refcnt_debug_dec(sk); > +} > + > +static int xsk_create(struct net *net, struct socket *sock, int protocol, > + int kern) > +{ > + struct sock *sk; > + struct xdp_sock *xs; > + > + if (!ns_capable(net->user_ns, CAP_NET_RAW)) > + return -EPERM; > + if (sock->type != SOCK_RAW) > + return -ESOCKTNOSUPPORT; > + > + if (protocol) > + return -EPROTONOSUPPORT; > + > + sock->state = SS_UNCONNECTED; > + > + sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); > + if (!sk) > + return -ENOBUFS; > + > + sock->ops = &xsk_proto_ops; > + > + sock_init_data(sock, sk); > + > + sk->sk_family = PF_XDP; > + > + sk->sk_destruct = xsk_destruct; > + sk_refcnt_debug_inc(sk); > + > + xs = xdp_sk(sk); > + mutex_init(&xs->mutex); > + > + local_bh_disable(); > + sock_prot_inuse_add(net, &xsk_proto, 1); > + local_bh_enable(); > + > + return 0; > +} > + > +static const struct net_proto_family xsk_family_ops = { > + .family = PF_XDP, 
> + .create = xsk_create, > + .owner = THIS_MODULE, > +}; > + > +static int __init xsk_init(void) > +{ > + int err; > + > + err = proto_register(&xsk_proto, 0 /* no slab */); > + if (err) > + goto out; > + > + err = sock_register(&xsk_family_ops); > + if (err) > + goto out_proto; > + > + return 0; > + > +out_proto: > + proto_unregister(&xsk_proto); > +out: > + return err; > +} > + > +fs_initcall(xsk_init); > -- > 2.14.1
2018-04-23 18:18 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: [...] >> +static void xdp_umem_unpin_pages(struct xdp_umem *umem) >> +{ >> + unsigned int i; >> + >> + if (umem->pgs) { >> + for (i = 0; i < umem->npgs; i++) > > Since you pin them with FOLL_WRITE, I assume these pages > are written to. > Don't you need set_page_dirty_lock here? > Hmm, I actually *removed* it from the RFC V2, but after doing some homework, I think you're right. Thanks for pointing this out! Thinking more about this; This function is called from sk_destruct, and in the Tx case the sk_destruct can be called from interrupt context, where set_page_dirty_lock cannot be called. Are there any preferred ways of solving this? Scheduling the whole xsk_destruct call to a workqueue is one way (I think). Any cleaner/better way? [...] >> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) >> +{ >> + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; >> + u64 addr = mr->addr, size = mr->len; >> + unsigned int nframes; >> + int size_chk, err; >> + >> + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { >> + /* Strictly speaking we could support this, if: >> + * - huge pages, or* > > what does "or*" here mean? > Oops, I'll change to just 'or' in the next revision. Thanks! Björn
On Mon, Apr 23, 2018 at 10:00:15PM +0200, Björn Töpel wrote: > 2018-04-23 18:18 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: > > [...] > > >> +static void xdp_umem_unpin_pages(struct xdp_umem *umem) > >> +{ > >> + unsigned int i; > >> + > >> + if (umem->pgs) { > >> + for (i = 0; i < umem->npgs; i++) > > > > Since you pin them with FOLL_WRITE, I assume these pages > > are written to. > > Don't you need set_page_dirty_lock here? > > > > Hmm, I actually *removed* it from the RFC V2, but after doing some > homework, I think you're right. Thanks for pointing this out! > > Thinking more about this; This function is called from sk_destruct, > and in the Tx case the sk_destruct can be called from interrupt > context, where set_page_dirty_lock cannot be called. > > Are there any preferred ways of solving this? Scheduling the whole > xsk_destruct call to a workqueue is one way (I think). Any > cleaner/better way? > > [...] Defer unpinning pages until the next tx call? > >> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) > >> +{ > >> + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; > >> + u64 addr = mr->addr, size = mr->len; > >> + unsigned int nframes; > >> + int size_chk, err; > >> + > >> + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { > >> + /* Strictly speaking we could support this, if: > >> + * - huge pages, or* > > > > what does "or*" here mean? > > > > Oops, I'll change to just 'or' in the next revision. > > > Thanks! > Björn
2018-04-23 22:11 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: > On Mon, Apr 23, 2018 at 10:00:15PM +0200, Björn Töpel wrote: >> 2018-04-23 18:18 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: >> >> [...] >> >> >> +static void xdp_umem_unpin_pages(struct xdp_umem *umem) >> >> +{ >> >> + unsigned int i; >> >> + >> >> + if (umem->pgs) { >> >> + for (i = 0; i < umem->npgs; i++) >> > >> > Since you pin them with FOLL_WRITE, I assume these pages >> > are written to. >> > Don't you need set_page_dirty_lock here? >> > >> >> Hmm, I actually *removed* it from the RFC V2, but after doing some >> homework, I think you're right. Thanks for pointing this out! >> >> Thinking more about this; This function is called from sk_destruct, >> and in the Tx case the sk_destruct can be called from interrupt >> context, where set_page_dirty_lock cannot be called. >> >> Are there any preferred ways of solving this? Scheduling the whole >> xsk_destruct call to a workqueue is one way (I think). Any >> cleaner/better way? >> >> [...] > > Defer unpinning pages until the next tx call? > If the sock is released, there wont be another tx call. Or am I missing something obvious? > >> >> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) >> >> +{ >> >> + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; >> >> + u64 addr = mr->addr, size = mr->len; >> >> + unsigned int nframes; >> >> + int size_chk, err; >> >> + >> >> + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { >> >> + /* Strictly speaking we could support this, if: >> >> + * - huge pages, or* >> > >> > what does "or*" here mean? >> > >> >> Oops, I'll change to just 'or' in the next revision. >> >> >> Thanks! >> Björn
On Mon, Apr 23, 2018 at 10:15:18PM +0200, Björn Töpel wrote: > 2018-04-23 22:11 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: > > On Mon, Apr 23, 2018 at 10:00:15PM +0200, Björn Töpel wrote: > >> 2018-04-23 18:18 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: > >> > >> [...] > >> > >> >> +static void xdp_umem_unpin_pages(struct xdp_umem *umem) > >> >> +{ > >> >> + unsigned int i; > >> >> + > >> >> + if (umem->pgs) { > >> >> + for (i = 0; i < umem->npgs; i++) > >> > > >> > Since you pin them with FOLL_WRITE, I assume these pages > >> > are written to. > >> > Don't you need set_page_dirty_lock here? > >> > > >> > >> Hmm, I actually *removed* it from the RFC V2, but after doing some > >> homework, I think you're right. Thanks for pointing this out! > >> > >> Thinking more about this; This function is called from sk_destruct, > >> and in the Tx case the sk_destruct can be called from interrupt > >> context, where set_page_dirty_lock cannot be called. > >> > >> Are there any preferred ways of solving this? Scheduling the whole > >> xsk_destruct call to a workqueue is one way (I think). Any > >> cleaner/better way? > >> > >> [...] > > > > Defer unpinning pages until the next tx call? > > > > If the sock is released, there wont be another tx call. unpin them on socket release too? > Or am I > missing something obvious? > > > > >> >> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) > >> >> +{ > >> >> + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; > >> >> + u64 addr = mr->addr, size = mr->len; > >> >> + unsigned int nframes; > >> >> + int size_chk, err; > >> >> + > >> >> + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { > >> >> + /* Strictly speaking we could support this, if: > >> >> + * - huge pages, or* > >> > > >> > what does "or*" here mean? > >> > > >> > >> Oops, I'll change to just 'or' in the next revision. > >> > >> > >> Thanks! > >> Björn
On Mon, Apr 23, 2018 at 9:56 AM, Björn Töpel <bjorn.topel@gmail.com> wrote: > From: Björn Töpel <bjorn.topel@intel.com> > > In this commit the base structure of the AF_XDP address family is set > up. Further, we introduce the abilty register a window of user memory > to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory > window is viewed by an AF_XDP socket as a set of equally large > frames. After a user memory registration all frames are "owned" by the > user application, and not the kernel. > > Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com> > Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com> > Signed-off-by: Björn Töpel <bjorn.topel@intel.com> > +static void xdp_umem_release(struct xdp_umem *umem) > +{ > + struct task_struct *task; > + struct mm_struct *mm; > + unsigned long diff; > + > + if (umem->pgs) { > + xdp_umem_unpin_pages(umem); > + > + task = get_pid_task(umem->pid, PIDTYPE_PID); > + put_pid(umem->pid); > + if (!task) > + goto out; > + mm = get_task_mm(task); > + put_task_struct(task); > + if (!mm) > + goto out; > + > + diff = umem->size >> PAGE_SHIFT; Need to round up or size must always be a multiple of PAGE_SIZE. > + > + down_write(&mm->mmap_sem); > + mm->pinned_vm -= diff; > + up_write(&mm->mmap_sem); When using user->locked_vm for resource limit checks, no need to also update mm->pinned_vm? > +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) > +{ > + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; > + u64 addr = mr->addr, size = mr->len; > + unsigned int nframes; > + int size_chk, err; > + > + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { > + /* Strictly speaking we could support this, if: > + * - huge pages, or* > + * - using an IOMMU, or > + * - making sure the memory area is consecutive > + * but for now, we simply say "computer says no". > + */ > + return -EINVAL; > + } Ideally, AF_XDP subsumes all packet socket use cases. 
It does not have packet v3's small packet optimizations of variable sized frames and block signaling. I don't suggest adding that now. But for the non-zerocopy case, it may make sense to ensure that nothing is blocking a later addition of these features. Especially for header-only (snaplen) workloads. So far, I don't see any issues. > + if (!is_power_of_2(frame_size)) > + return -EINVAL; > + > + if (!PAGE_ALIGNED(addr)) { > + /* Memory area has to be page size aligned. For > + * simplicity, this might change. > + */ > + return -EINVAL; > + } > + > + if ((addr + size) < addr) > + return -EINVAL; > + > + nframes = size / frame_size; > + if (nframes == 0 || nframes > UINT_MAX) > + return -EINVAL; You may also want a check here that nframes * frame_size is at least PAGE_SIZE and probably a multiple of that. > + frame_headroom = ALIGN(frame_headroom, 64); > + > + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; > + if (size_chk < 0) > + return -EINVAL; > + > + umem->pid = get_task_pid(current, PIDTYPE_PID); > + umem->size = (size_t)size; > + umem->address = (unsigned long)addr; > + umem->props.frame_size = frame_size; > + umem->props.nframes = nframes; > + umem->frame_headroom = frame_headroom; > + umem->npgs = size / PAGE_SIZE; > + umem->pgs = NULL; > + umem->user = NULL; > + > + umem->frame_size_log2 = ilog2(frame_size); > + umem->nfpp_mask = (PAGE_SIZE / frame_size) - 1; > + umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size); > + atomic_set(&umem->users, 1); > + > + err = xdp_umem_account_pages(umem); > + if (err) > + goto out; > + > + err = xdp_umem_pin_pages(umem); > + if (err) need to call xdp_umem_unaccount_pages on error > + goto out; > + return 0; > + > +out: > + put_pid(umem->pid); > + return err; > +}
2018-04-23 22:26 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: > On Mon, Apr 23, 2018 at 10:15:18PM +0200, Björn Töpel wrote: >> 2018-04-23 22:11 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: >> > On Mon, Apr 23, 2018 at 10:00:15PM +0200, Björn Töpel wrote: >> >> 2018-04-23 18:18 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>: >> >> >> >> [...] >> >> >> >> >> +static void xdp_umem_unpin_pages(struct xdp_umem *umem) >> >> >> +{ >> >> >> + unsigned int i; >> >> >> + >> >> >> + if (umem->pgs) { >> >> >> + for (i = 0; i < umem->npgs; i++) >> >> > >> >> > Since you pin them with FOLL_WRITE, I assume these pages >> >> > are written to. >> >> > Don't you need set_page_dirty_lock here? >> >> > >> >> >> >> Hmm, I actually *removed* it from the RFC V2, but after doing some >> >> homework, I think you're right. Thanks for pointing this out! >> >> >> >> Thinking more about this; This function is called from sk_destruct, >> >> and in the Tx case the sk_destruct can be called from interrupt >> >> context, where set_page_dirty_lock cannot be called. >> >> >> >> Are there any preferred ways of solving this? Scheduling the whole >> >> xsk_destruct call to a workqueue is one way (I think). Any >> >> cleaner/better way? >> >> >> >> [...] >> > >> > Defer unpinning pages until the next tx call? >> > >> >> If the sock is released, there wont be another tx call. > > unpin them on socket release too? > AF_XDP pins all memory up front, and unpins it when the socket is released (final sock_put), which in this case is in the skb destructor. So there's no later point from a sock lifetime perspective. I'll make a stab at doing umem clean up in a worker queue. >> Or am I >> missing something obvious? 
>> >> > >> >> >> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) >> >> >> +{ >> >> >> + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; >> >> >> + u64 addr = mr->addr, size = mr->len; >> >> >> + unsigned int nframes; >> >> >> + int size_chk, err; >> >> >> + >> >> >> + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { >> >> >> + /* Strictly speaking we could support this, if: >> >> >> + * - huge pages, or* >> >> > >> >> > what does "or*" here mean? >> >> > >> >> >> >> Oops, I'll change to just 'or' in the next revision. >> >> >> >> >> >> Thanks! >> >> Björn
2018-04-24 1:04 GMT+02:00 Willem de Bruijn <willemdebruijn.kernel@gmail.com>: > On Mon, Apr 23, 2018 at 9:56 AM, Björn Töpel <bjorn.topel@gmail.com> wrote: >> From: Björn Töpel <bjorn.topel@intel.com> >> >> In this commit the base structure of the AF_XDP address family is set >> up. Further, we introduce the abilty register a window of user memory >> to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory >> window is viewed by an AF_XDP socket as a set of equally large >> frames. After a user memory registration all frames are "owned" by the >> user application, and not the kernel. >> >> Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com> >> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com> >> Signed-off-by: Björn Töpel <bjorn.topel@intel.com> > >> +static void xdp_umem_release(struct xdp_umem *umem) >> +{ >> + struct task_struct *task; >> + struct mm_struct *mm; >> + unsigned long diff; >> + >> + if (umem->pgs) { >> + xdp_umem_unpin_pages(umem); >> + >> + task = get_pid_task(umem->pid, PIDTYPE_PID); >> + put_pid(umem->pid); >> + if (!task) >> + goto out; >> + mm = get_task_mm(task); >> + put_task_struct(task); >> + if (!mm) >> + goto out; >> + >> + diff = umem->size >> PAGE_SHIFT; > > Need to round up or size must always be a multiple of PAGE_SIZE. > Yes, you're right! I'll add constraints to the umem setup. See further down in the reply. >> + >> + down_write(&mm->mmap_sem); >> + mm->pinned_vm -= diff; >> + up_write(&mm->mmap_sem); > > When using user->locked_vm for resource limit checks, no need > to also update mm->pinned_vm? > Hmm, dug around in the code, and it looks like you're correct -- i.e. if user->locked_vm is used, we shouldn't update the mm->pinned_vm. I'll need to check a bit more, so that I'm certain, but if so, I'll remove it in the next revision. 
>> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) >> +{ >> + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; >> + u64 addr = mr->addr, size = mr->len; >> + unsigned int nframes; >> + int size_chk, err; >> + >> + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { >> + /* Strictly speaking we could support this, if: >> + * - huge pages, or* >> + * - using an IOMMU, or >> + * - making sure the memory area is consecutive >> + * but for now, we simply say "computer says no". >> + */ >> + return -EINVAL; >> + } > > Ideally, AF_XDP subsumes all packet socket use cases. It does not > have packet v3's small packet optimizations of variable sized frames > and block signaling. > > I don't suggest adding that now. But for the non-zerocopy case, it may > make sense to ensure that nothing is blocking a later addition of these > features. Especially for header-only (snaplen) workloads. So far, I don't > see any issues. > Ok. Block signaling is sort of ring batching, so I think we're good for that case. As for variable sized frames *within* a umem, that's trickier. To support different sizes, multiple umems (and multiple queues) -- if that makes sense? >> + if (!is_power_of_2(frame_size)) >> + return -EINVAL; >> + >> + if (!PAGE_ALIGNED(addr)) { >> + /* Memory area has to be page size aligned. For >> + * simplicity, this might change. >> + */ >> + return -EINVAL; >> + } >> + >> + if ((addr + size) < addr) >> + return -EINVAL; >> + >> + nframes = size / frame_size; >> + if (nframes == 0 || nframes > UINT_MAX) >> + return -EINVAL; > > You may also want a check here that nframes * frame_size is at least > PAGE_SIZE and probably a multiple of that. > Yup! I'll add those checks. This will make the "diff shift" in the release code safe as well. Thanks! 
>> + frame_headroom = ALIGN(frame_headroom, 64); >> + >> + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; >> + if (size_chk < 0) >> + return -EINVAL; >> + >> + umem->pid = get_task_pid(current, PIDTYPE_PID); >> + umem->size = (size_t)size; >> + umem->address = (unsigned long)addr; >> + umem->props.frame_size = frame_size; >> + umem->props.nframes = nframes; >> + umem->frame_headroom = frame_headroom; >> + umem->npgs = size / PAGE_SIZE; >> + umem->pgs = NULL; >> + umem->user = NULL; >> + >> + umem->frame_size_log2 = ilog2(frame_size); >> + umem->nfpp_mask = (PAGE_SIZE / frame_size) - 1; >> + umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size); >> + atomic_set(&umem->users, 1); >> + >> + err = xdp_umem_account_pages(umem); >> + if (err) >> + goto out; >> + >> + err = xdp_umem_pin_pages(umem); >> + if (err) > > need to call xdp_umem_unaccount_pages on error Indeed! I'll fix that! >> + goto out; >> + return 0; >> + >> +out: >> + put_pid(umem->pid); >> + return err; >> +}
Hi Björn,
I love your patch! Yet something to improve:
[auto build test ERROR on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Bj-rn-T-pel/Introducing-AF_XDP-support/20180424-085240
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: m68k-allyesconfig (attached as .config)
compiler: m68k-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=m68k
All errors (new ones prefixed by >>):
net/xdp/xdp_umem.o: In function `xdp_umem_reg':
>> xdp_umem.c:(.text+0x200): undefined reference to `__udivdi3'
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h new file mode 100644 index 000000000000..41252135a0fe --- /dev/null +++ b/include/uapi/linux/if_xdp.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note + * + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Author(s): Björn Töpel <bjorn.topel@intel.com> + * Magnus Karlsson <magnus.karlsson@intel.com> + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include <linux/types.h> + +/* XDP socket options */ +#define XDP_UMEM_REG 3 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 frame_size; /* Frame size */ + __u32 frame_headroom; /* Frame head room */ +}; + +#endif /* _LINUX_IF_XDP_H */ diff --git a/net/Makefile b/net/Makefile index a6147c61b174..77aaddedbd29 100644 --- a/net/Makefile +++ b/net/Makefile @@ -85,3 +85,4 @@ obj-y += l3mdev/ endif obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ +obj-$(CONFIG_XDP_SOCKETS) += xdp/ diff --git a/net/xdp/Makefile b/net/xdp/Makefile new file mode 100644 index 000000000000..a5d736640a0f --- /dev/null +++ b/net/xdp/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o + diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c new file mode 100644 index 000000000000..bff058f5a769 --- /dev/null +++ b/net/xdp/xdp_umem.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space packet buffer + * Copyright(c) 2018 Intel 
Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/mm.h> + +#include "xdp_umem.h" + +#define XDP_UMEM_MIN_FRAME_SIZE 2048 + +int xdp_umem_create(struct xdp_umem **umem) +{ + *umem = kzalloc(sizeof(**umem), GFP_KERNEL); + + if (!(*umem)) + return -ENOMEM; + + return 0; +} + +static void xdp_umem_unpin_pages(struct xdp_umem *umem) +{ + unsigned int i; + + if (umem->pgs) { + for (i = 0; i < umem->npgs; i++) + put_page(umem->pgs[i]); + + kfree(umem->pgs); + umem->pgs = NULL; + } +} + +static void xdp_umem_unaccount_pages(struct xdp_umem *umem) +{ + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } +} + +static void xdp_umem_release(struct xdp_umem *umem) +{ + struct task_struct *task; + struct mm_struct *mm; + unsigned long diff; + + if (umem->pgs) { + xdp_umem_unpin_pages(umem); + + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + diff = umem->size >> PAGE_SHIFT; + + down_write(&mm->mmap_sem); + mm->pinned_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); + umem->pgs = NULL; + } + + xdp_umem_unaccount_pages(umem); +out: + kfree(umem); +} + +void xdp_put_umem(struct xdp_umem *umem) +{ + if (!umem) + return; + + if (atomic_dec_and_test(&umem->users)) + 
xdp_umem_release(umem); +} + +static int xdp_umem_pin_pages(struct xdp_umem *umem) +{ + unsigned int gup_flags = FOLL_WRITE; + long npgs; + int err; + + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); + if (!umem->pgs) + return -ENOMEM; + + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags, &umem->pgs[0], NULL); + if (npgs != umem->npgs) { + if (npgs >= 0) { + umem->npgs = npgs; + err = -ENOMEM; + goto out_pin; + } + err = npgs; + goto out_pgs; + } + return 0; + +out_pin: + xdp_umem_unpin_pages(umem); +out_pgs: + kfree(umem->pgs); + umem->pgs = NULL; + return err; +} + +static int xdp_umem_account_pages(struct xdp_umem *umem) +{ + unsigned long lock_limit, new_npgs, old_npgs; + + if (capable(CAP_IPC_LOCK)) + return 0; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + umem->user = get_uid(current_user()); + + do { + old_npgs = atomic_long_read(&umem->user->locked_vm); + new_npgs = old_npgs + umem->npgs; + if (new_npgs > lock_limit) { + free_uid(umem->user); + umem->user = NULL; + return -ENOBUFS; + } + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, + new_npgs) != old_npgs); + return 0; +} + +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +{ + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; + u64 addr = mr->addr, size = mr->len; + unsigned int nframes; + int size_chk, err; + + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { + /* Strictly speaking we could support this, if: + * - huge pages, or* + * - using an IOMMU, or + * - making sure the memory area is consecutive + * but for now, we simply say "computer says no". + */ + return -EINVAL; + } + + if (!is_power_of_2(frame_size)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) { + /* Memory area has to be page size aligned. For + * simplicity, this might change. 
+ */ + return -EINVAL; + } + + if ((addr + size) < addr) + return -EINVAL; + + nframes = size / frame_size; + if (nframes == 0 || nframes > UINT_MAX) + return -EINVAL; + + frame_headroom = ALIGN(frame_headroom, 64); + + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; + if (size_chk < 0) + return -EINVAL; + + umem->pid = get_task_pid(current, PIDTYPE_PID); + umem->size = (size_t)size; + umem->address = (unsigned long)addr; + umem->props.frame_size = frame_size; + umem->props.nframes = nframes; + umem->frame_headroom = frame_headroom; + umem->npgs = size / PAGE_SIZE; + umem->pgs = NULL; + umem->user = NULL; + + umem->frame_size_log2 = ilog2(frame_size); + umem->nfpp_mask = (PAGE_SIZE / frame_size) - 1; + umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size); + atomic_set(&umem->users, 1); + + err = xdp_umem_account_pages(umem); + if (err) + goto out; + + err = xdp_umem_pin_pages(umem); + if (err) + goto out; + return 0; + +out: + put_pid(umem->pid); + return err; +} + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +{ + int err; + + if (!umem) + return -EINVAL; + + down_write(&current->mm->mmap_sem); + + err = __xdp_umem_reg(umem, mr); + + up_write(&current->mm->mmap_sem); + return err; +} + diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h new file mode 100644 index 000000000000..58714f4f7f25 --- /dev/null +++ b/net/xdp/xdp_umem.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details.
+ */ + +#ifndef XDP_UMEM_H_ +#define XDP_UMEM_H_ + +#include <linux/mm.h> +#include <linux/if_xdp.h> + +#include "xdp_umem_props.h" + +struct xdp_umem { + struct page **pgs; + struct xdp_umem_props props; + u32 npgs; + u32 frame_headroom; + u32 nfpp_mask; + u32 nfpplog2; + u32 frame_size_log2; + struct user_struct *user; + struct pid *pid; + unsigned long address; + size_t size; + atomic_t users; +}; + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); +void xdp_put_umem(struct xdp_umem *umem); +int xdp_umem_create(struct xdp_umem **umem); + +#endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h new file mode 100644 index 000000000000..77fb5daf29f3 --- /dev/null +++ b/net/xdp/xdp_umem_props.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef XDP_UMEM_PROPS_H_ +#define XDP_UMEM_PROPS_H_ + +struct xdp_umem_props { + u32 frame_size; + u32 nframes; +}; + +#endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c new file mode 100644 index 000000000000..19fc719cbe0d --- /dev/null +++ b/net/xdp/xsk.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets + * + * AF_XDP sockets allows a channel between XDP programs and userspace + * applications. + * Copyright(c) 2018 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Author(s): Björn Töpel <bjorn.topel@intel.com> + * Magnus Karlsson <magnus.karlsson@intel.com> + */ + +#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ + +#include <linux/if_xdp.h> +#include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/socket.h> +#include <linux/file.h> +#include <linux/uaccess.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <net/sock.h> + +#include "xdp_umem.h" + +struct xdp_sock { + /* struct sock must be the first member of struct xdp_sock */ + struct sock sk; + struct xdp_umem *umem; + /* Protects multiple processes in the control path */ + struct mutex mutex; +}; + +static struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +static int xsk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net; + + if (!sk) + return 0; + + net = sock_net(sk); + + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + local_bh_enable(); + + sock_orphan(sk); + sock->sk = NULL; + + sk_refcnt_debug_release(sk); + sock_put(sk); + + return 0; +} + +static int xsk_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int err; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + switch (optname) { + case XDP_UMEM_REG: + { + struct xdp_umem_reg mr; + struct xdp_umem *umem; + + if (xs->umem) + return -EBUSY; + + if (copy_from_user(&mr, 
optval, sizeof(mr))) + return -EFAULT; + + mutex_lock(&xs->mutex); + err = xdp_umem_create(&umem); + + err = xdp_umem_reg(umem, &mr); + if (err) { + kfree(umem); + mutex_unlock(&xs->mutex); + return err; + } + + /* Make sure umem is ready before it can be seen by others */ + smp_wmb(); + + xs->umem = umem; + mutex_unlock(&xs->mutex); + return 0; + } + default: + break; + } + + return -ENOPROTOOPT; +} + +static struct proto xsk_proto = { + .name = "XDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct xdp_sock), +}; + +static const struct proto_ops xsk_proto_ops = { + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = sock_no_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static void xsk_destruct(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + return; + + xdp_put_umem(xs->umem); + + sk_refcnt_debug_dec(sk); +} + +static int xsk_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct xdp_sock *xs; + + if (!ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + if (protocol) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); + if (!sk) + return -ENOBUFS; + + sock->ops = &xsk_proto_ops; + + sock_init_data(sock, sk); + + sk->sk_family = PF_XDP; + + sk->sk_destruct = xsk_destruct; + sk_refcnt_debug_inc(sk); + + xs = xdp_sk(sk); + mutex_init(&xs->mutex); + + local_bh_disable(); + sock_prot_inuse_add(net, &xsk_proto, 1); + local_bh_enable(); + + 
return 0; +} + +static const struct net_proto_family xsk_family_ops = { + .family = PF_XDP, + .create = xsk_create, + .owner = THIS_MODULE, +}; + +static int __init xsk_init(void) +{ + int err; + + err = proto_register(&xsk_proto, 0 /* no slab */); + if (err) + goto out; + + err = sock_register(&xsk_family_ops); + if (err) + goto out_proto; + + return 0; + +out_proto: + proto_unregister(&xsk_proto); +out: + return err; +} + +fs_initcall(xsk_init);