diff mbox series

[RFC,02/24] xsk: add user memory registration sockopt

Message ID 20180131135356.19134-3-bjorn.topel@gmail.com
State RFC, archived
Delegated to: David Miller
Headers show
Series Introducing AF_XDP support | expand

Commit Message

Björn Töpel Jan. 31, 2018, 1:53 p.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

The XDP_MEM_REG socket option allows a process to register a window of
user space memory to the kernel. This memory will later be used as
frame data buffer.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 include/uapi/linux/if_xdp.h |   7 ++
 net/xdp/xsk.c               | 294 +++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk.h               |  19 ++-
 3 files changed, 316 insertions(+), 4 deletions(-)

Comments

Willem de Bruijn Feb. 7, 2018, 4 p.m. UTC | #1
On Wed, Jan 31, 2018 at 8:53 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>
> The XDP_MEM_REG socket option allows a process to register a window of
> user space memory to the kernel. This memory will later be used as
> frame data buffer.
>
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> ---

> +static struct xsk_umem *xsk_mem_reg(u64 addr, u64 size, u32 frame_size,
> +                                   u32 data_headroom)
> +{
> +       unsigned long lock_limit, locked, npages;
> +       int ret = 0;
> +       struct xsk_umem *umem;
> +
> +       if (!can_do_mlock())
> +               return ERR_PTR(-EPERM);
> +
> +       umem = xsk_umem_create(addr, size, frame_size, data_headroom);
> +       if (IS_ERR(umem))
> +               return umem;
> +
> +       npages = PAGE_ALIGN(umem->nframes * umem->frame_size) >> PAGE_SHIFT;
> +
> +       down_write(&current->mm->mmap_sem);
> +
> +       locked = npages + current->mm->pinned_vm;
> +       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +       if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +
> +       if (npages == 0 || npages > UINT_MAX) {
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +       umem->npgs = npages;
> +
> +       ret = xsk_umem_pin_pages(umem);
> +
> +out:
> +       if (ret < 0) {
> +               put_pid(umem->pid);
> +               kfree(umem);
> +       } else {
> +               current->mm->pinned_vm = locked;
> +       }
> +
> +       up_write(&current->mm->mmap_sem);

This limits per process. You may want to limit per user. See also
mm_account_pinned_pages.
Björn Töpel Feb. 7, 2018, 9:39 p.m. UTC | #2
2018-02-07 17:00 GMT+01:00 Willem de Bruijn <willemdebruijn.kernel@gmail.com>:
> On Wed, Jan 31, 2018 at 8:53 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
>> From: Björn Töpel <bjorn.topel@intel.com>
>>
>> The XDP_MEM_REG socket option allows a process to register a window of
>> user space memory to the kernel. This memory will later be used as
>> frame data buffer.
>>
>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>> ---
>
>> +static struct xsk_umem *xsk_mem_reg(u64 addr, u64 size, u32 frame_size,
>> +                                   u32 data_headroom)
>> +{
>> +       unsigned long lock_limit, locked, npages;
>> +       int ret = 0;
>> +       struct xsk_umem *umem;
>> +
>> +       if (!can_do_mlock())
>> +               return ERR_PTR(-EPERM);
>> +
>> +       umem = xsk_umem_create(addr, size, frame_size, data_headroom);
>> +       if (IS_ERR(umem))
>> +               return umem;
>> +
>> +       npages = PAGE_ALIGN(umem->nframes * umem->frame_size) >> PAGE_SHIFT;
>> +
>> +       down_write(&current->mm->mmap_sem);
>> +
>> +       locked = npages + current->mm->pinned_vm;
>> +       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> +
>> +       if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>> +               ret = -ENOMEM;
>> +               goto out;
>> +       }
>> +
>> +       if (npages == 0 || npages > UINT_MAX) {
>> +               ret = -EINVAL;
>> +               goto out;
>> +       }
>> +       umem->npgs = npages;
>> +
>> +       ret = xsk_umem_pin_pages(umem);
>> +
>> +out:
>> +       if (ret < 0) {
>> +               put_pid(umem->pid);
>> +               kfree(umem);
>> +       } else {
>> +               current->mm->pinned_vm = locked;
>> +       }
>> +
>> +       up_write(&current->mm->mmap_sem);
>
> This limits per process. You may want to limit per user. See also
> mm_account_pinned_pages.

Ah, noted! Thanks for pointing that out!
diff mbox series

Patch

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index cd09232e16c1..3f8c90c708b4 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -29,4 +29,11 @@  struct sockaddr_xdp {
 #define XDP_RX_RING	2
 #define XDP_TX_RING	3
 
+/* Request passed with the XDP_MEM_REG socket option: describes the
+ * window of user space memory to register as packet frame storage.
+ */
+struct xdp_mr_req {
+	__u64	addr;           /* Start of packet data area */
+	__u64	len;            /* Length of packet data area */
+	__u32	frame_size;     /* Frame size */
+	__u32	data_headroom;  /* Frame head room */
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2d7c08a50c60..333ce1450cc7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -19,18 +19,235 @@ 
 
 #include <linux/if_xdp.h>
 #include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/socket.h>
 #include <net/sock.h>
 
 #include "xsk.h"
 
+/* Smallest frame_size, in bytes, accepted by xsk_umem_create() */
+#define XSK_UMEM_MIN_FRAME_SIZE 2048
+
 struct xdp_sock {
 	/* struct sock must be the first member of struct xdp_sock */
 	struct sock sk;
+	/* Memory attached by the XDP_MEM_REG socket option. A socket
+	 * takes at most one: a second registration fails with -EBUSY.
+	 */
+	struct xsk_umem *umem;
 };
 
+/* Map a generic socket to the XDP socket that embeds it. Since sk is
+ * the first member of struct xdp_sock, the address is unchanged.
+ */
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+	return container_of(sk, struct xdp_sock, sk);
+}
+
+/* Drop the references on the first umem->npgs pages of umem->pgs,
+ * marking each page dirty first, then free the page array and clear
+ * the pointer. Does nothing when no page array is attached.
+ */
+static void xsk_umem_unpin_pages(struct xsk_umem *umem)
+{
+	struct page **pages = umem->pgs;
+	unsigned int idx;
+
+	if (!pages)
+		return;
+
+	for (idx = 0; idx < umem->npgs; idx++) {
+		set_page_dirty_lock(pages[idx]);
+		put_page(pages[idx]);
+	}
+
+	kfree(pages);
+	umem->pgs = NULL;
+}
+
+/* Tear down a umem: unpin its pages, hand the pinned-page charge back
+ * to the mm of the task that registered it, and free the descriptor.
+ * A NULL umem is a no-op.
+ *
+ * The charge is only returned when the registering task and its mm can
+ * still be looked up through umem->pid; otherwise it is skipped.
+ */
+static void xsk_umem_destroy(struct xsk_umem *umem)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	unsigned long diff;
+
+	if (!umem)
+		return;
+
+	/* Uncharge exactly what registration charged. xsk_mem_reg() adds
+	 * PAGE_ALIGN(nframes * frame_size) >> PAGE_SHIFT pages to
+	 * pinned_vm and stores that count in npgs. size >> PAGE_SHIFT is
+	 * not the same number: a 2048 byte area made of one 2048 byte
+	 * frame is charged one page while size >> PAGE_SHIFT is zero,
+	 * which would leave pinned_vm permanently inflated.
+	 */
+	diff = umem->npgs;
+
+	xsk_umem_unpin_pages(umem);
+
+	task = get_pid_task(umem->pid, PIDTYPE_PID);
+	put_pid(umem->pid);
+	if (!task)
+		goto out;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	down_write(&mm->mmap_sem);
+	mm->pinned_vm -= diff;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+out:
+	kfree(umem);
+}
+
+/* Validate a memory registration request and allocate the umem
+ * descriptor describing it. No pages are pinned here.
+ *
+ * @addr:          start of the user memory area; must be page aligned
+ * @size:          length of the area in bytes
+ * @frame_size:    size of one frame; a power of two in the range
+ *                 XSK_UMEM_MIN_FRAME_SIZE..PAGE_SIZE
+ * @data_headroom: requested per-frame headroom; stored rounded up to a
+ *                 multiple of 64
+ *
+ * Returns the new umem, which holds a reference on the caller's pid,
+ * or ERR_PTR(-EINVAL) for an invalid request and ERR_PTR(-ENOMEM) when
+ * the descriptor cannot be allocated.
+ */
+static struct xsk_umem *xsk_umem_create(u64 addr, u64 size, u32 frame_size,
+					u32 data_headroom)
+{
+	struct xsk_umem *umem;
+	u64 nframes;
+
+	if (frame_size < XSK_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!is_power_of_2(frame_size))
+		return ERR_PTR(-EINVAL);
+
+	if (!PAGE_ALIGNED(addr)) {
+		/* Memory area has to be page size aligned. For
+		 * simplicity, this might change.
+		 */
+		return ERR_PTR(-EINVAL);
+	}
+
+	if ((addr + size) < addr)
+		return ERR_PTR(-EINVAL);
+
+	/* frame_size is a power of two, so the shift is an exact
+	 * division. The count is kept in 64 bits until it is known to
+	 * fit the u32 nframes field; storing it in a 32-bit variable
+	 * straight away would silently truncate a large area.
+	 */
+	nframes = size >> ilog2(frame_size);
+	if (nframes == 0 || nframes > UINT_MAX)
+		return ERR_PTR(-EINVAL);
+
+	/* Bound the headroom before rounding it up. Doing the check as
+	 * a u32 subtraction stored in an int lets a huge data_headroom
+	 * wrap around to a positive value, and ALIGN() itself can wrap.
+	 * frame_size is at least XSK_UMEM_MIN_FRAME_SIZE, so the
+	 * subtraction below cannot underflow, and since the limit is a
+	 * multiple of 64 the rounded value stays within it as well.
+	 */
+	if (data_headroom > frame_size - XSK_KERNEL_HEADROOM)
+		return ERR_PTR(-EINVAL);
+
+	data_headroom = ALIGN(data_headroom, 64);
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->pid = get_task_pid(current, PIDTYPE_PID);
+	umem->size = (size_t)size;
+	umem->address = (unsigned long)addr;
+	umem->frame_size = frame_size;
+	umem->nframes = nframes;
+	umem->data_headroom = data_headroom;
+	umem->pgs = NULL;
+
+	return umem;
+}
+
+/* Allocate umem->pgs and pin umem->npgs pages of user memory starting
+ * at umem->address, writable, into it.
+ *
+ * Returns 0 when every page was pinned. On failure nothing stays
+ * pinned, umem->pgs is NULL, and the result is -ENOMEM (array
+ * allocation failed or only part of the range could be pinned) or the
+ * negative error from get_user_pages(). After a partial pin umem->npgs
+ * has been overwritten with the number of pages that were pinned.
+ */
+static int xsk_umem_pin_pages(struct xsk_umem *umem)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs;
+	int err;
+
+	/* XXX Fix so that we don't always pin.
+	 * "copy to user" from interrupt context, but how?
+	 */
+
+	/* This runs from setsockopt() in process context, so a sleeping
+	 * allocation is appropriate; GFP_ATOMIC would needlessly dip
+	 * into the atomic reserves. The array size is chosen by user
+	 * space, so a failure is reported as -ENOMEM without a warning
+	 * splat.
+	 */
+	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
+			    GFP_KERNEL | __GFP_NOWARN);
+	if (!umem->pgs)
+		return -ENOMEM;
+
+	npgs = get_user_pages(umem->address, umem->npgs,
+			      gup_flags, &umem->pgs[0], NULL);
+	if (npgs != umem->npgs) {
+		if (npgs >= 0) {
+			/* Partial pin: release only what was pinned */
+			umem->npgs = npgs;
+			err = -ENOMEM;
+			goto out_pin;
+		}
+		err = npgs;
+		goto out_pgs;
+	}
+
+	return 0;
+
+out_pin:
+	xsk_umem_unpin_pages(umem);
+out_pgs:
+	/* After out_pin the array is already freed and pgs is NULL, so
+	 * this kfree() is a no-op on that path.
+	 */
+	kfree(umem->pgs);
+	umem->pgs = NULL;
+
+	return err;
+}
+
+/* Register a window of user memory: build the umem descriptor, check
+ * the page count against the caller's RLIMIT_MEMLOCK, pin the pages
+ * and charge them to current->mm->pinned_vm. The limit check, pinning
+ * and charge all happen with mmap_sem held for writing.
+ *
+ * Arguments are passed unchanged to xsk_umem_create().
+ *
+ * Returns the umem on success. On failure the descriptor is freed and
+ * an ERR_PTR is returned: -EPERM when can_do_mlock() refuses, -ENOMEM
+ * when the limit would be exceeded without CAP_IPC_LOCK, -EINVAL when
+ * the page count is zero or above UINT_MAX, or the error reported by
+ * xsk_umem_create() / xsk_umem_pin_pages().
+ */
+static struct xsk_umem *xsk_mem_reg(u64 addr, u64 size, u32 frame_size,
+				    u32 data_headroom)
+{
+	unsigned long lock_limit;
+	u64 locked, npages;
+	int ret = 0;
+	struct xsk_umem *umem;
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = xsk_umem_create(addr, size, frame_size, data_headroom);
+	if (IS_ERR(umem))
+		return umem;
+
+	/* nframes and frame_size are both u32. Widen before multiplying:
+	 * a 32-bit product wraps for areas of 4 GiB and more, which
+	 * would pin far fewer pages than nframes claims to cover.
+	 */
+	npages = (u64)umem->nframes * umem->frame_size;
+	npages = PAGE_ALIGN(npages) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked = npages + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* npgs is a u32, so the count has to fit before it is stored */
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+	umem->npgs = npages;
+
+	ret = xsk_umem_pin_pages(umem);
+
+out:
+	if (ret < 0) {
+		put_pid(umem->pid);
+		kfree(umem);
+	} else {
+		current->mm->pinned_vm = locked;
+	}
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+}
+
+/* Release an XDP socket: drop its protocol in-use count, destroy the
+ * registered umem (if any), orphan the sock, detach it from the struct
+ * socket and put the sock reference. Always returns 0.
+ */
 static int xsk_release(struct socket *sock)
 {
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct net *net;
+
+	/* No sock attached, so there is nothing to release */
+	if (!sk)
+		return 0;
+
+	net = sock_net(sk);
+
+	local_bh_disable();
+	sock_prot_inuse_add(net, sk->sk_prot, -1);
+	local_bh_enable();
+
+	/* xsk_umem_destroy() returns early on NULL, so a socket that
+	 * never registered memory needs no special casing here.
+	 */
+	xsk_umem_destroy(xs->umem);
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	sk_refcnt_debug_release(sk);
+	sock_put(sk);
+
 	return 0;
 }
 
@@ -48,6 +265,43 @@  static unsigned int xsk_poll(struct file *file, struct socket *sock,
+/* Handle SOL_XDP socket options. Only XDP_MEM_REG is implemented: it
+ * copies a struct xdp_mr_req from user space, registers the memory it
+ * describes and attaches the resulting umem to the socket.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for any other level or option,
+ * -EINVAL when optlen is shorter than the request, -EFAULT when the
+ * request cannot be copied in, -EBUSY when the socket already has a
+ * umem, or the error reported by xsk_mem_reg().
+ */
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case XDP_MEM_REG:
+	{
+		struct xdp_mr_req req;
+		struct xsk_umem *umem;
+
+		if (optlen < sizeof(req))
+			return -EINVAL;
+		if (copy_from_user(&req, optval, sizeof(req)))
+			return -EFAULT;
+
+		/* The memory is registered before the socket lock is
+		 * taken, so a losing or repeated registration is only
+		 * detected, and undone, afterwards.
+		 */
+		umem = xsk_mem_reg(req.addr, req.len, req.frame_size,
+				   req.data_headroom);
+		if (IS_ERR(umem))
+			return PTR_ERR(umem);
+
+		lock_sock(sk);
+		if (xs->umem) { /* XXX create and check afterwards... really? */
+			release_sock(sk);
+			xsk_umem_destroy(umem);
+			return -EBUSY;
+		}
+		xs->umem = umem;
+		release_sock(sk);
+
+		return 0;
+	}
+	default:
+		break;
+	}
+
 	return -ENOPROTOOPT;
 }
 
@@ -97,10 +351,48 @@  static const struct proto_ops xsk_proto_ops = {
 	/* the rest vvv, OK to be missing implementation -- checked against NULL. */
 };
 
+/* sk_destruct callback installed by xsk_create(). The refcount debug
+ * counter is only decremented for a sock flagged SOCK_DEAD; any other
+ * sock is left untouched.
+ */
+static void xsk_destruct(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_DEAD)) {
+		sk_refcnt_debug_dec(sk);
+	}
+}
+
+/* Create a PF_XDP socket. The caller needs CAP_NET_RAW in the user
+ * namespace of @net, the socket type must be SOCK_RAW and @protocol
+ * must be 0.
+ *
+ * Returns 0 on success, -EPERM without the capability,
+ * -ESOCKTNOSUPPORT for another socket type, -EPROTONOSUPPORT for a
+ * non-zero protocol, or -ENOBUFS when the sock cannot be allocated.
+ */
 static int xsk_create(struct net *net, struct socket *sock, int protocol,
 		      int kern)
 {
-	return -EOPNOTSUPP;
+	struct sock *sk;
+
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
+		return -EPERM;
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	/* XXX Require ETH_P_IP? Something else? */
+	if (protocol)
+		return -EPROTONOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+	if (!sk)
+		return -ENOBUFS;
+
+	sock->ops = &xsk_proto_ops;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_XDP;
+
+	sk->sk_destruct = xsk_destruct;
+	sk_refcnt_debug_inc(sk);
+
+	/* Counterpart of the decrement done in xsk_release() */
+	local_bh_disable();
+	sock_prot_inuse_add(net, &xsk_proto, 1);
+	local_bh_enable();
+
+	return 0;
 }
 
 static const struct net_proto_family xsk_family_ops = {
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index 441f8d00a9d5..71559374645b 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -12,7 +12,20 @@ 
  * more details.
  */
 
-#ifndef _LINUX_XDPSOCK_H
-#define _LINUX_XDPSOCK_H
+#ifndef _LINUX_XSK_H
+#define _LINUX_XSK_H
 
-#endif /* _LINUX_XDPSOCK_H */
+#define XSK_KERNEL_HEADROOM 256 /* Headroom for XDP */
+
+/* Describes one window of user memory registered with XDP_MEM_REG */
+struct xsk_umem {
+	struct pid *pid;	/* pid of the task that registered the memory */
+	struct page **pgs;	/* pinned pages, npgs entries; NULL if unpinned */
+	unsigned long address;	/* page aligned user address of the area */
+	size_t size;		/* length of the area in bytes, as requested */
+	u32 npgs;		/* number of pages covered by pgs */
+	u32 frame_size;		/* size of one frame in bytes, a power of two */
+	u32 nframes;		/* number of whole frames that fit in size */
+	u32 data_headroom;	/* per-frame headroom, a multiple of 64 */
+};
+
+#endif /* _LINUX_XSK_H */