From patchwork Fri Aug 11 11:41:18 2017
X-Patchwork-Submitter: Jason Wang <jasowang@redhat.com>
X-Patchwork-Id: 800510
X-Patchwork-Delegate: davem@davemloft.net
From: Jason Wang <jasowang@redhat.com>
To: davem@davemloft.net, mst@redhat.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
Cc: kubakici@wp.pl, Jason Wang <jasowang@redhat.com>
Subject: [PATCH net-next V2 3/3] tap: XDP support
Date: Fri, 11 Aug 2017 19:41:18 +0800
Message-Id: <1502451678-17358-4-git-send-email-jasowang@redhat.com>
In-Reply-To: <1502451678-17358-1-git-send-email-jasowang@redhat.com>
References: <1502451678-17358-1-git-send-email-jasowang@redhat.com>

This patch implements XDP support for tun. The implementation is split
into two parts:

- fast path: small packets without GSO. XDP runs at the page level,
  before build_skb(). For XDP_TX, since creating and destroying queues
  is completely under the control of userspace, transmission is
  implemented through the generic XDP helper after the skb has been
  built. This could be optimized in the future.

- slow path: big or GSO packets. XDP runs through the generic XDP
  helpers after the skb has been created.

Tests were done through pktgen with small packets.

xdp1 test shows ~41.1% improvement:

Before: ~1.7Mpps
After:  ~2.3Mpps

xdp_redirect to ixgbe shows ~60% improvement:

Before: ~0.8Mpps
After:  ~1.38Mpps

Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
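For reference, the xdp1 test above attaches a small XDP program to the
device under test. A minimal sketch of such a program follows; it is an
illustrative stand-in for samples/bpf/xdp1_kern.c (the real sample also
maintains a per-protocol counter map), not the program used for the
numbers above. A program like this can be attached to a tap device with
iproute2, e.g. "ip link set dev tap0 xdp obj xdp_drop.o sec xdp"
(device and object names are examples only).

/* xdp_drop.c: a minimal xdp1-style program that drops every packet.
 * With this patch applied, the drop happens on the tap fast path,
 * before build_skb() is ever called.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"	/* SEC() macro, as in samples/bpf */

SEC("xdp")
int xdp_drop_prog(struct xdp_md *ctx)
{
	return XDP_DROP;
}

char _license[] SEC("license") = "GPL";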
 drivers/net/tun.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 142 insertions(+), 7 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9736df4..5892284 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -73,6 +73,8 @@
 #include <linux/seq_file.h>
 #include <linux/uio.h>
 #include <linux/skb_array.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 
 #include <linux/uaccess.h>
 
@@ -105,7 +107,8 @@ do {								\
 } while (0)
 #endif
 
-#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_HEADROOM 256
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)
 
 /* TUN device flags */
 
@@ -224,6 +227,7 @@ struct tun_struct {
 	u32 flow_count;
 	u32 rx_batched;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
+	struct bpf_prog __rcu *xdp_prog;
 };
 
 #ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -590,6 +594,7 @@ static void tun_detach(struct tun_file *tfile, bool clean)
 static void tun_detach_all(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
+	struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
 	struct tun_file *tfile, *tmp;
 	int i, n = tun->numqueues;
 
@@ -622,6 +627,9 @@ static void tun_detach_all(struct net_device *dev)
 	}
 	BUG_ON(tun->numdisabled != 0);
 
+	if (xdp_prog)
+		bpf_prog_put(xdp_prog);
+
 	if (tun->flags & IFF_PERSIST)
 		module_put(THIS_MODULE);
 }
@@ -1008,6 +1016,46 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->tx_dropped = tx_dropped;
 }
 
+static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+		       struct netlink_ext_ack *extack)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+
+	old_prog = rtnl_dereference(tun->xdp_prog);
+	rcu_assign_pointer(tun->xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static u32 tun_xdp_query(struct net_device *dev)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	const struct bpf_prog *xdp_prog;
+
+	xdp_prog = rtnl_dereference(tun->xdp_prog);
+	if (xdp_prog)
+		return xdp_prog->aux->id;
+
+	return 0;
+}
+
+static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return tun_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = tun_xdp_query(dev);
+		xdp->prog_attached = !!xdp->prog_id;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops tun_netdev_ops = {
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_open		= tun_net_open,
@@ -1038,6 +1086,7 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= tun_set_headroom,
 	.ndo_get_stats64	= tun_net_get_stats64,
+	.ndo_xdp		= tun_xdp,
 };
 
 static void tun_flow_init(struct tun_struct *tun)
@@ -1217,16 +1266,22 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 	return true;
 }
 
-static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+static struct sk_buff *tun_build_skb(struct tun_struct *tun,
+				     struct tun_file *tfile,
 				     struct iov_iter *from,
-				     int len)
+				     struct virtio_net_hdr *hdr,
+				     int len, int *generic_xdp)
 {
 	struct page_frag *alloc_frag = &tfile->alloc_frag;
 	struct sk_buff *skb;
+	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
 		     SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	unsigned int delta = 0;
 	char *buf;
 	size_t copied;
+	bool xdp_xmit = false;
+	int err;
 
 	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
 		return ERR_PTR(-ENOMEM);
@@ -1238,16 +1293,77 @@ static struct sk_buff *tun_build_skb(struct tun_file *tfile,
 	if (copied != len)
 		return ERR_PTR(-EFAULT);
 
+	if (hdr->gso_type)
+		*generic_xdp = 1;
+	else
+		*generic_xdp = 0;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(tun->xdp_prog);
+	if (xdp_prog && !*generic_xdp) {
+		struct xdp_buff xdp;
+		void *orig_data;
+		u32 act;
+
+		xdp.data_hard_start = buf;
+		xdp.data = buf + TUN_RX_PAD;
+		xdp.data_end = xdp.data + len;
+		orig_data = xdp.data;
+		act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+		switch (act) {
+		case XDP_REDIRECT:
+			get_page(alloc_frag->page);
+			alloc_frag->offset += buflen;
+			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+			if (err)
+				goto err_redirect;
+			rcu_read_unlock();
+			return NULL;
+		case XDP_TX:
+			xdp_xmit = true;
+			/* fall through */
+		case XDP_PASS:
+			delta = orig_data - xdp.data;
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			/* fall through */
+		case XDP_ABORTED:
+			trace_xdp_exception(tun->dev, xdp_prog, act);
+			/* fall through */
+		case XDP_DROP:
+			goto err_xdp;
+		}
+	}
+
 	skb = build_skb(buf, buflen);
-	if (!skb)
+	if (!skb) {
+		rcu_read_unlock();
 		return ERR_PTR(-ENOMEM);
+	}
 
-	skb_reserve(skb, TUN_RX_PAD);
-	skb_put(skb, len);
+	skb_reserve(skb, TUN_RX_PAD - delta);
+	skb_put(skb, len + delta);
 	get_page(alloc_frag->page);
 	alloc_frag->offset += buflen;
 
+	if (xdp_xmit) {
+		skb->dev = tun->dev;
+		generic_xdp_tx(skb, xdp_prog);
+		rcu_read_unlock();
+		return NULL;
+	}
+
+	rcu_read_unlock();
+
 	return skb;
+
+err_redirect:
+	put_page(alloc_frag->page);
+err_xdp:
+	rcu_read_unlock();
+	this_cpu_inc(tun->pcpu_stats->rx_dropped);
+	return NULL;
 }
 
 /* Get packet from user space buffer */
@@ -1266,6 +1382,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	bool zerocopy = false;
 	int err;
 	u32 rxhash;
+	int generic_xdp = 1;
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1324,11 +1441,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	}
 
 	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
-		skb = tun_build_skb(tfile, from, len);
+		skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
 		if (IS_ERR(skb)) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			return PTR_ERR(skb);
 		}
+		if (!skb)
+			return total_len;
 	} else {
 		if (!zerocopy) {
 			copylen = len;
@@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	skb_reset_network_header(skb);
 	skb_probe_transport_header(skb, 0);
 
+	if (generic_xdp) {
+		struct bpf_prog *xdp_prog;
+		int ret;
+
+		rcu_read_lock();
+		xdp_prog = rcu_dereference(tun->xdp_prog);
+		if (xdp_prog) {
+			ret = do_xdp_generic(xdp_prog, skb);
+			if (ret != XDP_PASS) {
+				rcu_read_unlock();
+				return total_len;
+			}
+		}
+		rcu_read_unlock();
+	}
+
 	rxhash = __skb_get_hash_symmetric(skb);
 #ifndef CONFIG_4KSTACKS
 	tun_rx_batched(tun, tfile, skb, more);
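A note on the delta handling in tun_build_skb() above: an XDP program
may move xdp.data with the bpf_xdp_adjust_head() helper, so on XDP_PASS
the skb must be sized from the possibly-shifted data pointer; that is
what delta = orig_data - xdp.data captures. A minimal sketch of a
program that exercises this path (the 14-byte Ethernet-header pop is
only an example, not taken from the patch):

/* Shift xdp.data forward past the Ethernet header. On XDP_PASS,
 * tun_build_skb() sees delta = orig_data - xdp.data = -14 and builds
 * the skb with skb_reserve(TUN_RX_PAD - delta) and skb_put(len + delta),
 * i.e. 14 bytes more headroom and 14 bytes less data.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"	/* SEC() and the bpf_xdp_adjust_head() wrapper, as in samples/bpf */

SEC("xdp")
int xdp_pop_eth(struct xdp_md *ctx)
{
	if (bpf_xdp_adjust_head(ctx, 14))
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";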