From patchwork Wed Jan 31 13:53:34 2018
From: Björn Töpel
To: bjorn.topel@gmail.com, magnus.karlsson@intel.com,
    alexander.h.duyck@intel.com, alexander.duyck@gmail.com,
    john.fastabend@gmail.com, ast@fb.com, brouer@redhat.com,
    willemdebruijn.kernel@gmail.com, daniel@iogearbox.net,
    netdev@vger.kernel.org
Cc: Björn Töpel, michael.lundkvist@ericsson.com,
    jesse.brandeburg@intel.com, anjali.singhai@intel.com,
    jeffrey.b.shaw@intel.com, ferruh.yigit@intel.com, qi.z.zhang@intel.com
Subject: [RFC PATCH 02/24] xsk: add user memory registration sockopt
Date: Wed, 31 Jan 2018 14:53:34 +0100
Message-Id: <20180131135356.19134-3-bjorn.topel@gmail.com>
In-Reply-To: <20180131135356.19134-1-bjorn.topel@gmail.com>
References: <20180131135356.19134-1-bjorn.topel@gmail.com>

From: Björn Töpel

The XDP_MEM_REG socket option allows a process to register a window of
user space memory with the kernel. This memory will later be used as
frame data buffers.
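For readers new to the interface, the intended user-space flow looks
roughly like the sketch below. This is illustration only and not part of
the patch: it assumes the patched uapi header <linux/if_xdp.h> (providing
struct xdp_mr_req and XDP_MEM_REG) is installed, and the AF_XDP/SOL_XDP
fallback values are made-up placeholders.

/* Hypothetical usage sketch, not part of this patch. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>       /* struct xdp_mr_req, XDP_MEM_REG (patched headers) */

#ifndef AF_XDP
#define AF_XDP 44               /* placeholder value, for illustration only */
#endif
#ifndef SOL_XDP
#define SOL_XDP 283             /* placeholder value, for illustration only */
#endif

int main(void)
{
        const uint32_t frame_size = 2048;       /* power of two, >= 2048, <= page size */
        const uint64_t nframes = 1024;
        const uint64_t len = nframes * frame_size;
        struct xdp_mr_req req;
        void *area;
        int fd;

        /* The area must start on a page boundary; mmap() guarantees that. */
        area = mmap(NULL, len, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (area == MAP_FAILED)
                return EXIT_FAILURE;

        fd = socket(AF_XDP, SOCK_RAW, 0);
        if (fd < 0)
                return EXIT_FAILURE;

        memset(&req, 0, sizeof(req));
        req.addr = (uintptr_t)area;     /* start of packet data area */
        req.len = len;                  /* length of packet data area */
        req.frame_size = frame_size;
        req.data_headroom = 0;

        /* Register the memory window with the kernel. */
        if (setsockopt(fd, SOL_XDP, XDP_MEM_REG, &req, sizeof(req)))
                perror("setsockopt(XDP_MEM_REG)");

        return 0;
}

Note that creating the socket requires CAP_NET_RAW, and the registered
area is pinned and charged against RLIMIT_MEMLOCK (unless the caller has
CAP_IPC_LOCK), as enforced by the patch below.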
Signed-off-by: Björn Töpel
---
 include/uapi/linux/if_xdp.h |   7 ++
 net/xdp/xsk.c               | 294 +++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk.h               |  19 ++-
 3 files changed, 316 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index cd09232e16c1..3f8c90c708b4 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -29,4 +29,11 @@ struct sockaddr_xdp {
 #define XDP_RX_RING 2
 #define XDP_TX_RING 3
 
+struct xdp_mr_req {
+        __u64 addr;             /* Start of packet data area */
+        __u64 len;              /* Length of packet data area */
+        __u32 frame_size;       /* Frame size */
+        __u32 data_headroom;    /* Frame head room */
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2d7c08a50c60..333ce1450cc7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -19,18 +19,235 @@
 #include <...>
 #include <...>
+#include <...>
+#include <...>
+#include <...>
 #include <...>
 
 #include "xsk.h"
 
+#define XSK_UMEM_MIN_FRAME_SIZE 2048
+
 struct xdp_sock {
         /* struct sock must be the first member of struct xdp_sock */
         struct sock sk;
+        struct xsk_umem *umem;
 };
 
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+        return (struct xdp_sock *)sk;
+}
+
+static void xsk_umem_unpin_pages(struct xsk_umem *umem)
+{
+        unsigned int i;
+
+        if (umem->pgs) {
+                for (i = 0; i < umem->npgs; i++) {
+                        struct page *page = umem->pgs[i];
+
+                        set_page_dirty_lock(page);
+                        put_page(page);
+                }
+
+                kfree(umem->pgs);
+                umem->pgs = NULL;
+        }
+}
+
+static void xsk_umem_destroy(struct xsk_umem *umem)
+{
+        struct mm_struct *mm;
+        struct task_struct *task;
+        unsigned long diff;
+
+        if (!umem)
+                return;
+
+        xsk_umem_unpin_pages(umem);
+
+        task = get_pid_task(umem->pid, PIDTYPE_PID);
+        put_pid(umem->pid);
+        if (!task)
+                goto out;
+        mm = get_task_mm(task);
+        put_task_struct(task);
+        if (!mm)
+                goto out;
+
+        diff = umem->size >> PAGE_SHIFT;
+
+        down_write(&mm->mmap_sem);
+        mm->pinned_vm -= diff;
+        up_write(&mm->mmap_sem);
+        mmput(mm);
+out:
+        kfree(umem);
+}
+
+static struct xsk_umem *xsk_umem_create(u64 addr, u64 size, u32 frame_size,
+                                        u32 data_headroom)
+{
+        struct xsk_umem *umem;
+        unsigned int nframes;
+        int size_chk;
+
+        if (frame_size < XSK_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+                /* Strictly speaking we could support this, if:
+                 * - huge pages, or*
+                 * - using an IOMMU, or
+                 * - making sure the memory area is consecutive
+                 * but for now, we simply say "computer says no".
+                 */
+                return ERR_PTR(-EINVAL);
+        }
+
+        if (!is_power_of_2(frame_size))
+                return ERR_PTR(-EINVAL);
+
+        if (!PAGE_ALIGNED(addr)) {
+                /* Memory area has to be page size aligned. For
+                 * simplicity, this might change.
+                 */
+                return ERR_PTR(-EINVAL);
+        }
+
+        if ((addr + size) < addr)
+                return ERR_PTR(-EINVAL);
+
+        nframes = size / frame_size;
+        if (nframes == 0)
+                return ERR_PTR(-EINVAL);
+
+        data_headroom = ALIGN(data_headroom, 64);
+
+        size_chk = frame_size - data_headroom - XSK_KERNEL_HEADROOM;
+        if (size_chk < 0)
+                return ERR_PTR(-EINVAL);
+
+        umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+        if (!umem)
+                return ERR_PTR(-ENOMEM);
+
+        umem->pid = get_task_pid(current, PIDTYPE_PID);
+        umem->size = (size_t)size;
+        umem->address = (unsigned long)addr;
+        umem->frame_size = frame_size;
+        umem->nframes = nframes;
+        umem->data_headroom = data_headroom;
+        umem->pgs = NULL;
+
+        return umem;
+}
+
+static int xsk_umem_pin_pages(struct xsk_umem *umem)
+{
+        unsigned int gup_flags = FOLL_WRITE;
+        long npgs;
+        int err;
+
+        /* XXX Fix so that we don't always pin.
+         * "copy to user" from interrupt context, but how?
+         */
+        umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_ATOMIC);
+        if (!umem->pgs)
+                return -ENOMEM;
+
+        npgs = get_user_pages(umem->address, umem->npgs,
+                              gup_flags, &umem->pgs[0], NULL);
+        if (npgs != umem->npgs) {
+                if (npgs >= 0) {
+                        umem->npgs = npgs;
+                        err = -ENOMEM;
+                        goto out_pin;
+                }
+                err = npgs;
+                goto out_pgs;
+        }
+
+        return 0;
+
+out_pin:
+        xsk_umem_unpin_pages(umem);
+out_pgs:
+        kfree(umem->pgs);
+        umem->pgs = NULL;
+
+        return err;
+}
+
+static struct xsk_umem *xsk_mem_reg(u64 addr, u64 size, u32 frame_size,
+                                    u32 data_headroom)
+{
+        unsigned long lock_limit, locked, npages;
+        int ret = 0;
+        struct xsk_umem *umem;
+
+        if (!can_do_mlock())
+                return ERR_PTR(-EPERM);
+
+        umem = xsk_umem_create(addr, size, frame_size, data_headroom);
+        if (IS_ERR(umem))
+                return umem;
+
+        npages = PAGE_ALIGN(umem->nframes * umem->frame_size) >> PAGE_SHIFT;
+
+        down_write(&current->mm->mmap_sem);
+
+        locked = npages + current->mm->pinned_vm;
+        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+        if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        if (npages == 0 || npages > UINT_MAX) {
+                ret = -EINVAL;
+                goto out;
+        }
+        umem->npgs = npages;
+
+        ret = xsk_umem_pin_pages(umem);
+
+out:
+        if (ret < 0) {
+                put_pid(umem->pid);
+                kfree(umem);
+        } else {
+                current->mm->pinned_vm = locked;
+        }
+
+        up_write(&current->mm->mmap_sem);
+
+        return ret < 0 ? ERR_PTR(ret) : umem;
+}
+
 static int xsk_release(struct socket *sock)
 {
+        struct sock *sk = sock->sk;
+        struct xdp_sock *xs = xdp_sk(sk);
+        struct net *net;
+
+        if (!sk)
+                return 0;
+
+        net = sock_net(sk);
+
+        local_bh_disable();
+        sock_prot_inuse_add(net, sk->sk_prot, -1);
+        local_bh_enable();
+
+        xsk_umem_destroy(xs->umem);
+
+        sock_orphan(sk);
+        sock->sk = NULL;
+
+        sk_refcnt_debug_release(sk);
+        sock_put(sk);
+
         return 0;
 }
 
@@ -48,6 +265,43 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, unsigned int optlen)
 {
+        struct sock *sk = sock->sk;
+        struct xdp_sock *xs = xdp_sk(sk);
+
+        if (level != SOL_XDP)
+                return -ENOPROTOOPT;
+
+        switch (optname) {
+        case XDP_MEM_REG:
+        {
+                struct xdp_mr_req req;
+                struct xsk_umem *umem;
+
+                if (optlen < sizeof(req))
+                        return -EINVAL;
+                if (copy_from_user(&req, optval, sizeof(req)))
+                        return -EFAULT;
+
+                umem = xsk_mem_reg(req.addr, req.len, req.frame_size,
+                                   req.data_headroom);
+                if (IS_ERR(umem))
+                        return PTR_ERR(umem);
+
+                lock_sock(sk);
+                if (xs->umem) { /* XXX create and check afterwards... really? */
+                        release_sock(sk);
+                        xsk_umem_destroy(umem);
+                        return -EBUSY;
+                }
+                xs->umem = umem;
+                release_sock(sk);
+
+                return 0;
+        }
+        default:
+                break;
+        }
+
         return -ENOPROTOOPT;
 }
 
@@ -97,10 +351,48 @@ static const struct proto_ops xsk_proto_ops = {
         /* the rest vvv, OK to be missing implementation -- checked against NULL. */
 };
 
+static void xsk_destruct(struct sock *sk)
+{
+        if (!sock_flag(sk, SOCK_DEAD))
+                return;
+
+        sk_refcnt_debug_dec(sk);
+}
+
 static int xsk_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
 {
-        return -EOPNOTSUPP;
+        struct sock *sk;
+
+        if (!ns_capable(net->user_ns, CAP_NET_RAW))
+                return -EPERM;
+        if (sock->type != SOCK_RAW)
+                return -ESOCKTNOSUPPORT;
+
+        /* XXX Require ETH_P_IP? Something else? */
+        if (protocol)
+                return -EPROTONOSUPPORT;
+
+        sock->state = SS_UNCONNECTED;
+
+        sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+        if (!sk)
+                return -ENOBUFS;
+
+        sock->ops = &xsk_proto_ops;
+
+        sock_init_data(sock, sk);
+
+        sk->sk_family = PF_XDP;
+
+        sk->sk_destruct = xsk_destruct;
+        sk_refcnt_debug_inc(sk);
+
+        local_bh_disable();
+        sock_prot_inuse_add(net, &xsk_proto, 1);
+        local_bh_enable();
+
+        return 0;
 }
 
 static const struct net_proto_family xsk_family_ops = {
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index 441f8d00a9d5..71559374645b 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -12,7 +12,20 @@
  * more details.
  */
 
-#ifndef _LINUX_XDPSOCK_H
-#define _LINUX_XDPSOCK_H
+#ifndef _LINUX_XSK_H
+#define _LINUX_XSK_H
 
-#endif /* _LINUX_XDPSOCK_H */
+#define XSK_KERNEL_HEADROOM 256 /* Headroom for XDP */
+
+struct xsk_umem {
+        struct pid *pid;
+        struct page **pgs;
+        unsigned long address;
+        size_t size;
+        u32 npgs;
+        u32 frame_size;
+        u32 nframes;
+        u32 data_headroom;
+};
+
+#endif /* _LINUX_XSK_H */
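As a sanity check of the sizing rules in xsk_umem_create() above
(frame_size a power of two between 2048 and PAGE_SIZE, data_headroom
rounded up to 64 bytes, XSK_KERNEL_HEADROOM of 256 bytes reserved per
frame), here is a throwaway sketch of the per-frame arithmetic. The
numbers are examples only, not values mandated by the patch.

/* Illustration only: mirrors the frame carving done by xsk_umem_create(). */
#include <stdio.h>

#define XSK_KERNEL_HEADROOM     256                     /* as in net/xdp/xsk.h above */
#define ALIGN64(x)              (((x) + 63UL) & ~63UL)  /* data_headroom alignment */

int main(void)
{
        unsigned long len = 4UL * 1024 * 1024;          /* example: 4 MiB UMEM */
        unsigned long frame_size = 2048;                /* power of two, >= 2048 */
        unsigned long data_headroom = ALIGN64(32);      /* 32 requested -> 64 used */
        unsigned long nframes = len / frame_size;
        long payload = (long)(frame_size - data_headroom - XSK_KERNEL_HEADROOM);

        /* payload must be >= 0, otherwise XDP_MEM_REG fails with -EINVAL */
        printf("%lu frames, %ld bytes of payload per frame\n", nframes, payload);
        return 0;
}

With these example values the registered area yields 2048 frames with
1728 bytes of payload space each.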