From patchwork Wed Jan 30 23:51:34 2019
X-Patchwork-Submitter: Peter Oskolkov
X-Patchwork-Id: 1033838
X-Patchwork-Delegate: bpf@iogearbox.net
Date: Wed, 30 Jan 2019 15:51:34 -0800
In-Reply-To: <20190130235136.136527-1-posk@google.com>
Message-Id: <20190130235136.136527-4-posk@google.com>
References: <20190130235136.136527-1-posk@google.com>
X-Mailer: git-send-email 2.20.1.495.gaa96b0ce6b-goog
Subject: [PATCH bpf-next v5 3/5] bpf: add handling of BPF_LWT_REROUTE to
 lwt_bpf.c
From: Peter Oskolkov
To: Alexei Starovoitov, Daniel Borkmann, netdev@vger.kernel.org
Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
X-Mailing-List: netdev@vger.kernel.org

This patch builds on top of the previous patch in the patchset, which
added BPF_LWT_ENCAP_IP mode to bpf_lwt_push_encap. As the encapsulation
can result in the skb needing to go via a different
interface/route/dst, BPF programs can indicate this by returning
BPF_LWT_REROUTE, which triggers a new route lookup for the skb.

Signed-off-by: Peter Oskolkov
---
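[Editor's note, not part of the patch: a minimal BPF-side sketch of the
intended usage. BPF_LWT_ENCAP_IP and bpf_lwt_push_encap() come from
patch 2/5 of this series; the addresses, section/function names, and
libbpf header paths below are made-up assumptions, and checksum
computation is omitted for brevity.]

/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>

SEC("lwt_xmit")
int encap_and_reroute(struct __sk_buff *skb)
{
	/* Illustrative IPIP outer header; 10.0.0.1/10.0.0.2 are made up. */
	struct iphdr outer = {
		.version  = 4,
		.ihl      = 5,
		.ttl      = 64,
		.protocol = IPPROTO_IPIP,
		.saddr    = bpf_htonl(0x0a000001),	/* 10.0.0.1 */
		.daddr    = bpf_htonl(0x0a000002),	/* 10.0.0.2 */
	};

	outer.tot_len = bpf_htons(skb->len + sizeof(outer));

	/* Prepend the outer header; the helper flips skb->protocol if
	 * the address family of the outer header differs from the inner.
	 */
	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &outer, sizeof(outer)))
		return BPF_DROP;

	/* The outer daddr may need a different dst than the one the skb
	 * currently holds, so ask the stack for a fresh route lookup.
	 */
	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";

[With iproute2's lwt BPF support, such a program would be attached with
something along the lines of "ip route add 10.1.1.0/24 encap bpf xmit
obj encap.o section lwt_xmit dev eth0" -- shown as an assumption of the
usual encap bpf syntax, not taken from this patch.]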
 net/core/lwt_bpf.c | 125 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 6a6e9acab73d..20581567f84a 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -55,6 +56,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
@@ -87,6 +89,32 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	return ret;
 }
 
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = ip_hdr(skb);
+
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, skb_dst(skb)->dev);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ip6_route_input(skb);
+		err = skb_dst(skb)->error;
+	} else {
+		pr_warn_once("BPF_LWT_REROUTE input: unsupported proto %d\n",
+			     skb->protocol);
+	}
+
+	if (err)
+		goto err;
+	return dst_input(skb);
+
+err:
+	kfree_skb(skb);
+	return err;
+}
+
 static int bpf_input(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -98,6 +126,8 @@ static int bpf_input(struct sk_buff *skb)
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return bpf_lwt_input_reroute(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
@@ -147,6 +177,90 @@ static int xmit_check_hhlen(struct sk_buff *skb)
 	return 0;
 }
 
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+	int oif = l3mdev ? l3mdev->ifindex : 0;
+	struct dst_entry *dst = NULL;
+	struct sock *sk;
+	struct net *net;
+	bool ipv4;
+	int err;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ipv4 = true;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ipv4 = false;
+	} else {
+		pr_warn_once("BPF_LWT_REROUTE xmit: unsupported proto %d\n",
+			     skb->protocol);
+		return -EINVAL;
+	}
+
+	sk = sk_to_full_sk(skb->sk);
+	if (sk) {
+		if (sk->sk_bound_dev_if)
+			oif = sk->sk_bound_dev_if;
+		net = sock_net(sk);
+	} else {
+		net = dev_net(skb_dst(skb)->dev);
+	}
+
+	if (ipv4) {
+		struct iphdr *iph = ip_hdr(skb);
+		struct flowi4 fl4 = {0};
+		struct rtable *rt;
+
+		fl4.flowi4_oif = oif;
+		fl4.flowi4_mark = skb->mark;
+		fl4.flowi4_uid = sock_net_uid(net, sk);
+		fl4.flowi4_tos = RT_TOS(iph->tos);
+		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+		fl4.flowi4_proto = iph->protocol;
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt) || rt->dst.error)
+			return -EINVAL;
+		dst = &rt->dst;
+	} else {
+		struct ipv6hdr *iph6 = ipv6_hdr(skb);
+		struct flowi6 fl6 = {0};
+
+		fl6.flowi6_oif = oif;
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_uid = sock_net_uid(net, sk);
+		fl6.flowlabel = ip6_flowinfo(iph6);
+		fl6.flowi6_proto = iph6->nexthdr;
+		fl6.daddr = iph6->daddr;
+		fl6.saddr = iph6->saddr;
+
+		dst = ip6_route_output(net, skb->sk, &fl6);
+		if (IS_ERR(dst) || dst->error)
+			return -EINVAL;
+	}
+
+	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+	 * was done for the previous dst, so we are doing it here again, in
+	 * case the new dst needs much more space. The call below is a noop
+	 * if there is enough header space in skb.
+	 */
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		return err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+	if (unlikely(err))
+		return err;
+
+	/* ip[6]_finish_output2 understands LWTUNNEL_XMIT_DONE */
+	return LWTUNNEL_XMIT_DONE;
+}
+
 static int bpf_xmit(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -154,11 +268,20 @@ static int bpf_xmit(struct sk_buff *skb)
 
 	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
 	if (bpf->xmit.prog) {
+		__be16 proto = skb->protocol;
 		int ret;
 
 		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
 		switch (ret) {
 		case BPF_OK:
+			/* If the header changed, e.g. via bpf_lwt_push_encap,
+			 * BPF_LWT_REROUTE below should have been used if the
+			 * protocol was also changed.
+			 */
+			if (skb->protocol != proto) {
+				kfree_skb(skb);
+				return -EINVAL;
+			}
 			/* If the header was expanded, headroom might be too
 			 * small for L2 header to come, expand as needed.
 			 */
@@ -169,6 +292,8 @@ static int bpf_xmit(struct sk_buff *skb)
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			return bpf_lwt_xmit_reroute(skb);
 		default:
 			return ret;
 		}
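[Editor's note, not part of the patch: a sketch of the input path for
completeness. A reroute returned from an lwt_in program ends up in
bpf_lwt_input_reroute() above, which resolves the new dst via
ip_route_input_noref() or ip6_route_input() and hands the skb to
dst_input(). The example assumes BPF_LWT_ENCAP_IP is also accepted at
the lwt_in hook, which is what the input reroute path is built for;
addresses and names are again made up.]

/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/ipv6.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>

SEC("lwt_in")
int encap6_and_reroute(struct __sk_buff *skb)
{
	/* IPv4-in-IPv6: bpf_lwt_push_encap() flips skb->protocol to
	 * ETH_P_IPV6, so the reroute above takes the ip6_route_input()
	 * branch. fd00::1 / fd00::2 are made-up ULAs.
	 */
	struct ipv6hdr outer = {
		.version   = 6,
		.nexthdr   = IPPROTO_IPIP,	/* inner IPv4 */
		.hop_limit = 64,
	};

	outer.payload_len = bpf_htons(skb->len);
	outer.saddr.s6_addr[0]  = 0xfd;		/* fd00::1 */
	outer.saddr.s6_addr[15] = 1;
	outer.daddr.s6_addr[0]  = 0xfd;		/* fd00::2 */
	outer.daddr.s6_addr[15] = 2;

	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &outer, sizeof(outer)))
		return BPF_DROP;

	/* The new outer family/daddr makes the old dst stale; re-route. */
	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";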