[bpf-next,v2,09/13] bpf: add bpf_skb_adjust_room encap flags

Message ID 20190322151504.89983-10-willemdebruijn.kernel@gmail.com
State: Changes Requested
Series: bpf tc tunneling

Commit Message

Willem de Bruijn March 22, 2019, 3:15 p.m. UTC
From: Willem de Bruijn <willemb@google.com>

When pushing tunnel headers, annotate skbs in the same way as tunnel
devices.

For GSO packets, the network stack requires certain fields to be set
to segment packets with tunnel headers. gre_gso_segment depends on the
transport and inner mac headers, for instance.

Add an option to pass this information.

Remove the restriction that limits len_diff to the network header
length, which is too short for, e.g., GRE protocols.

Changes v1->v2:
  - document new flags
  - BPF_F_ADJ_ROOM_MASK moved

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/uapi/linux/bpf.h | 19 +++++++++++-
 net/core/filter.c        | 63 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 76 insertions(+), 6 deletions(-)
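
For a sense of how a program would use the new flags, a minimal tc BPF
sketch (illustrative only: the program and section names are made up,
and it assumes a uapi linux/bpf.h that carries these flags plus
libbpf's bpf_helpers.h):

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical tc program: reserve room for an outer IPv4 + GRE header
 * and let the kernel annotate the skb via the new encap flags.
 */
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define GRE_HLEN 4	/* basic GRE header: no csum, key or sequence */

SEC("tc")
int encap_gre(struct __sk_buff *skb)
{
	__u64 flags = BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
		      BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
	__s32 room = sizeof(struct iphdr) + GRE_HLEN;

	/* Grow below the network header; the flags tell the stack that
	 * the new bytes hold a tunnel header, so GSO keeps working.
	 */
	if (bpf_skb_adjust_room(skb, room, BPF_ADJ_ROOM_NET, flags))
		return TC_ACT_SHOT;

	/* The program would now write the outer iphdr and GRE header,
	 * e.g. with bpf_skb_store_bytes(), and fix up checksums.
	 */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";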

Comments

Alexei Starovoitov March 22, 2019, 3:44 p.m. UTC | #1
On Fri, Mar 22, 2019 at 8:15 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4   (1ULL << 1)
> +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6   (1ULL << 2)
> +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK   (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
> +                                        BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)

mask is still in uapi...
Willem de Bruijn March 22, 2019, 3:47 p.m. UTC | #2
On Fri, Mar 22, 2019 at 11:44 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Mar 22, 2019 at 8:15 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4   (1ULL << 1)
> > +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6   (1ULL << 2)
> > +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK   (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
> > +                                        BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
>
> mask is still in uapi...

That's only the L3 MASK, which captures ipv4 or ipv6. I don't see that
being expanded.

BPF_F_ADJ_ROOM_MASK was moved to net/core/filter.c
Alexei Starovoitov March 22, 2019, 4:11 p.m. UTC | #3
On Fri, Mar 22, 2019 at 8:48 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Fri, Mar 22, 2019 at 11:44 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Mar 22, 2019 at 8:15 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4   (1ULL << 1)
> > > +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6   (1ULL << 2)
> > > +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK   (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
> > > +                                        BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
> >
> > mask is still in uapi...
>
> That's only the L3 MASK, which captures ipv4 or ipv6. I don't see that
> being expanded.

and what's the use of it for user space?
packet is either ipv4 or ipv6.
passing two flags will be rejected:
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
+		    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+			return -EINVAL;
Willem de Bruijn March 22, 2019, 4:31 p.m. UTC | #4
On Fri, Mar 22, 2019 at 12:11 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Mar 22, 2019 at 8:48 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > On Fri, Mar 22, 2019 at 11:44 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Fri, Mar 22, 2019 at 8:15 AM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4   (1ULL << 1)
> > > > +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6   (1ULL << 2)
> > > > +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK   (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
> > > > +                                        BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
> > >
> > > mask is still in uapi...
> >
> > That's only the L3 MASK, which captures ipv4 or ipv6. I don't see that
> > being expanded.
>
> and what's the use of it for user space?

I see. Indeed, it has none there. Will remove. Apologies for the extra
round of revision as a result of this.
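
For illustration, the conflict checks quoted above are easy to exercise
in isolation. A standalone sketch in plain C, with the flag values
copied from the patch below (check_encap_flags is a hypothetical
stand-in for the validation inside bpf_skb_net_grow()):

#include <errno.h>
#include <stdio.h>

#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)

static int check_encap_flags(unsigned long long flags)
{
	/* at most one L3 and one L4 encap type may be requested */
	if ((flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) &&
	    (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6))
		return -EINVAL;
	if ((flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) &&
	    (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP))
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("ipv4+gre:  %d\n", check_encap_flags(
	       BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | BPF_F_ADJ_ROOM_ENCAP_L4_GRE));
	printf("ipv4+ipv6: %d\n", check_encap_flags(
	       BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6));
	return 0;
}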

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4f157d0ec571..f770f0de5b9c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1486,11 +1486,20 @@  union bpf_attr {
  * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
  * 		  (room space is added or removed below the layer 3 header).
  *
- *		There is one supported flag at this time:
+ *		The following flags are supported at this time:
  *
  *		* **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
  *		  Adjusting mss in this way is not allowed for datagrams.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **:
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **:
+ *		  Any new space is reserved to hold a tunnel header.
+ *		  Configure skb offsets and other fields accordingly.
+ *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **:
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **:
+ *		  Use with ENCAP_L3 flags to further specify the tunnel type.
+ *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -2632,6 +2641,14 @@  enum bpf_func_id {
 /* BPF_FUNC_skb_adjust_room flags. */
 #define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)
 
+#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
+#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
+#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK	(BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
+					 BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+
+#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
+#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
diff --git a/net/core/filter.c b/net/core/filter.c
index 393d1e4903b5..d0ebeb0147bc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2966,6 +2966,9 @@  static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
 static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 			    u64 flags)
 {
+	bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
+	unsigned int gso_type = SKB_GSO_DODGY;
+	u16 mac_len, inner_net, inner_trans;
 	int ret;
 
 	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
@@ -2979,10 +2982,60 @@  static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 	if (unlikely(ret < 0))
 		return ret;
 
+	if (encap) {
+		if (skb->protocol != htons(ETH_P_IP) &&
+		    skb->protocol != htons(ETH_P_IPV6))
+			return -ENOTSUPP;
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
+		    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+			return -EINVAL;
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
+		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+			return -EINVAL;
+
+		if (skb->encapsulation)
+			return -EALREADY;
+
+		mac_len = skb->network_header - skb->mac_header;
+		inner_net = skb->network_header;
+		inner_trans = skb->transport_header;
+	}
+
 	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
 	if (unlikely(ret < 0))
 		return ret;
 
+	if (encap) {
+		/* inner mac == inner_net on l3 encap */
+		skb->inner_mac_header = inner_net;
+		skb->inner_network_header = inner_net;
+		skb->inner_transport_header = inner_trans;
+		skb_set_inner_protocol(skb, skb->protocol);
+
+		skb->encapsulation = 1;
+		skb_set_network_header(skb, mac_len);
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+			gso_type |= SKB_GSO_UDP_TUNNEL;
+		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
+			gso_type |= SKB_GSO_GRE;
+		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+			gso_type |= SKB_GSO_IPXIP6;
+		else
+			gso_type |= SKB_GSO_IPXIP4;
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
+		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
+			int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
+					sizeof(struct ipv6hdr) :
+					sizeof(struct iphdr);
+
+			skb_set_transport_header(skb, mac_len + nh_len);
+		}
+	}
+
 	if (skb_is_gso(skb)) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
@@ -2991,7 +3044,7 @@  static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 			skb_decrease_gso_size(shinfo, len_diff);
 
 		/* Header must be checked, and gso_segs recomputed. */
-		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_type |= gso_type;
 		shinfo->gso_segs = 0;
 	}
 
@@ -3039,12 +3092,14 @@  static u32 __bpf_skb_max_len(const struct sk_buff *skb)
 			  SKB_MAX_ALLOC;
 }
 
-#define BPF_F_ADJ_ROOM_MASK		(BPF_F_ADJ_ROOM_FIXED_GSO)
+#define BPF_F_ADJ_ROOM_MASK		(BPF_F_ADJ_ROOM_FIXED_GSO | \
+					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
+					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
+					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
 
 BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
 	   u32, mode, u64, flags)
 {
-	bool trans_same = skb->transport_header == skb->network_header;
 	u32 len_cur, len_diff_abs = abs(len_diff);
 	u32 len_min = bpf_skb_net_base_len(skb);
 	u32 len_max = __bpf_skb_max_len(skb);
@@ -3073,8 +3128,6 @@  BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
 	}
 
 	len_cur = skb->len - skb_network_offset(skb);
-	if (skb_transport_header_was_set(skb) && !trans_same)
-		len_cur = skb_network_header_len(skb);
 	if ((shrink && (len_diff_abs >= len_cur ||
 			len_cur - len_diff_abs < len_min)) ||
 	    (!shrink && (skb->len + len_diff_abs > len_max &&
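
To make the header bookkeeping in bpf_skb_net_grow() concrete: for a
TCP/IPv4 packet on Ethernet wrapped in IPv4 + GRE, the offsets set by
the patch work out as in this small standalone sketch (plain C, not
kernel code; sizes are the usual fixed header lengths, assumed for
illustration):

#include <stdio.h>

int main(void)
{
	const unsigned int eth = 14, ip = 20, gre = 4;

	/* before: eth | ip | tcp
	 * after:  eth | outer ip | gre | inner ip | tcp
	 */
	printf("outer network header  @ %u\n", eth);                 /* 14 */
	printf("outer transport (gre) @ %u\n", eth + ip);            /* 34 */
	printf("inner mac == network  @ %u\n", eth + ip + gre);      /* 38 */
	printf("inner transport       @ %u\n", eth + ip + gre + ip); /* 58 */
	return 0;
}

Note how the inner mac and inner network offsets coincide: the tunnel
header is pushed below the L3 header, so there is no inner Ethernet
header, which is exactly the "inner mac == inner_net on l3 encap"
comment in the patch.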