diff mbox

[1/6] vxlan: Allow for VXLAN extensions to be implemented

Message ID 7c5e8240263a22a69042c45a632c9940e5ae1062.1420594925.git.tgraf@suug.ch
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Thomas Graf Jan. 7, 2015, 2:05 a.m. UTC
The VXLAN receive code is currently conservative in what it accepts and
will reject any frame that uses any of the reserved VXLAN protocol fields.
The VXLAN draft specifies that "reserved fields MUST be set to zero on
transmit and ignored on receive.".

Retain the current conservative parsing behaviour by default but allows
these fields to be used by VXLAN extensions which are explicitly enabled
on the VXLAN socket respectively VXLAN net_device.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 drivers/net/vxlan.c | 29 +++++++++++++++++++----------
 include/net/vxlan.h | 32 +++++++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 13 deletions(-)

Comments

Tom Herbert Jan. 7, 2015, 3:46 a.m. UTC | #1
On Tue, Jan 6, 2015 at 6:05 PM, Thomas Graf <tgraf@suug.ch> wrote:
> The VXLAN receive code is currently conservative in what it accepts and
> will reject any frame that uses any of the reserved VXLAN protocol fields.
> The VXLAN draft specifies that "reserved fields MUST be set to zero on
> transmit and ignored on receive.".
>
IMO it is an unfortunate decision in VXLAN to ignore set reserved
fields on receive. Had they not done this, then adding a protocol
field to the header would have been feasible and we wouldn't need yet
another encapsulation protocol (i.e. VXLAN-GPE). Rejecting frames with
reserved bits set is the better behavior, but I think the comment
about this needs to be clear about why this diverges from RFC7348.

> Retain the current conservative parsing behaviour by default but allows
> these fields to be used by VXLAN extensions which are explicitly enabled
> on the VXLAN socket respectively VXLAN net_device.
>
Conceptually, VXLAN has both mandatory flags and optional flags for
extensions. You may want to look at the VXLAN RCO patches that added
an extension and infrastructure for them.

Tom

> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
>  drivers/net/vxlan.c | 29 +++++++++++++++++++----------
>  include/net/vxlan.h | 32 +++++++++++++++++++++++++++++---
>  2 files changed, 48 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 2ab0922..4d52aa9 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -65,7 +65,7 @@
>  #define VXLAN_VID_MASK (VXLAN_N_VID - 1)
>  #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
>
> -#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */
> +#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags default value. */
>
>  /* UDP port for VXLAN traffic.
>   * The IANA assigned port is 4789, but the Linux default is 8472
> @@ -1100,22 +1100,28 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>         if (!pskb_may_pull(skb, VXLAN_HLEN))
>                 goto error;
>
> +       vs = rcu_dereference_sk_user_data(sk);
> +       if (!vs)
> +               goto drop;
> +
>         /* Return packets with reserved bits set */
>         vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> -       if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> -           (vxh->vx_vni & htonl(0xff))) {
> -               netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
> -                          ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
> -               goto error;
> +
> +       /* For backwards compatibility, only allow reserved fields to be
> +        * used by VXLAN extensions if explicitly requested.
> +        */
> +       if (vs->exts) {
> +               if (!vxh->vni_present)
> +                       goto error_invalid_header;
> +       } else {
> +               if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> +                   (vxh->vx_vni & htonl(0xff)))
> +                       goto error_invalid_header;
>         }
>
>         if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
>                 goto drop;
>
> -       vs = rcu_dereference_sk_user_data(sk);
> -       if (!vs)
> -               goto drop;
> -
>         vs->rcv(vs, skb, vxh->vx_vni);
>         return 0;
>
> @@ -1124,6 +1130,9 @@ drop:
>         kfree_skb(skb);
>         return 0;
>
> +error_invalid_header:
> +       netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
> +                  ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
>  error:
>         /* Return non vxlan pkt */
>         return 1;
> diff --git a/include/net/vxlan.h b/include/net/vxlan.h
> index 903461a..3e98d31 100644
> --- a/include/net/vxlan.h
> +++ b/include/net/vxlan.h
> @@ -11,10 +11,35 @@
>  #define VNI_HASH_BITS  10
>  #define VNI_HASH_SIZE  (1<<VNI_HASH_BITS)
>
> -/* VXLAN protocol header */
> +/* VXLAN protocol header:
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |R|R|R|R|I|R|R|R|               Reserved                        |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |                VXLAN Network Identifier (VNI) |   Reserved    |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *
> + * I = 1       VXLAN Network Identifier (VNI) present
> + */
>  struct vxlanhdr {
> -       __be32 vx_flags;
> -       __be32 vx_vni;
> +       union {
> +               struct {
> +#ifdef __LITTLE_ENDIAN_BITFIELD
> +                       __u8    reserved_flags1:3,
> +                               vni_present:1,
> +                               reserved_flags2:4;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +                       __u8    reserved_flags2:4,
> +                               vni_present:1,
> +                               reserved_flags1:3;
> +#else
> +#error "Please fix <asm/byteorder.h>"
> +#endif
> +                       __u8    vx_reserved1;
> +                       __be16  vx_reserved2;
> +               };
> +               __be32 vx_flags;
> +       };
> +       __be32  vx_vni;
>  };
>
>  struct vxlan_sock;
> @@ -25,6 +50,7 @@ struct vxlan_sock {
>         struct hlist_node hlist;
>         vxlan_rcv_t      *rcv;
>         void             *data;
> +       u32               exts;
>         struct work_struct del_work;
>         struct socket    *sock;
>         struct rcu_head   rcu;
> --
> 1.9.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Graf Jan. 7, 2015, 10:27 a.m. UTC | #2
On 01/06/15 at 07:46pm, Tom Herbert wrote:
> On Tue, Jan 6, 2015 at 6:05 PM, Thomas Graf <tgraf@suug.ch> wrote:
> > The VXLAN receive code is currently conservative in what it accepts and
> > will reject any frame that uses any of the reserved VXLAN protocol fields.
> > The VXLAN draft specifies that "reserved fields MUST be set to zero on
> > transmit and ignored on receive.".
> >
> IMO it is an unfortunate decision in VXLAN to ignore set reserved
> fields on receive. Had they not done this, then adding a protocol
> field to the header would have been feasible and we wouldn't need yet
> another encapsulation protocol (i.e. VXLAN-GPE). Rejecting frames with
> reserved bits set is the better behavior, but I think the comment
> about this needs to be clear about why this diverges from RFC7348.

Agreed. Do you want to give it a shot? I know you have been involved
on all aspects of this topic for a long time in NVO3 ;-)

> > Retain the current conservative parsing behaviour by default but allows
> > these fields to be used by VXLAN extensions which are explicitly enabled
> > on the VXLAN socket respectively VXLAN net_device.
> >
> Conceptually, VXLAN has both mandatory flags and optional flags for
> extensions. You may want to look at the VXLAN RCO patches that added
> an extension and infrastructure for them.

I've seen your proposed changes. I think they could be easily rebased
on top of this and use the enablement infrastructure. In fact, I see
this as the only feasible option to deal with VXLAN extensions: Leave
it to the user to decide which extensions to run. That way we can
support any combination of extensions, even conflicting ones. All we
have to do is catch incompatible extension configurations on a socket
and reject them at configuration time. Expecting that NVO3/IETF will
find consensus on a list of compatible set of extensions with perfect
forward and backwards compatibility is unrealistic in my eyes. We have
Geneve and GUE to solve it properly in the future.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesse Gross Jan. 7, 2015, 10:45 p.m. UTC | #3
On Wed, Jan 7, 2015 at 2:27 AM, Thomas Graf <tgraf@suug.ch> wrote:
> On 01/06/15 at 07:46pm, Tom Herbert wrote:
>> On Tue, Jan 6, 2015 at 6:05 PM, Thomas Graf <tgraf@suug.ch> wrote:
>> > The VXLAN receive code is currently conservative in what it accepts and
>> > will reject any frame that uses any of the reserved VXLAN protocol fields.
>> > The VXLAN draft specifies that "reserved fields MUST be set to zero on
>> > transmit and ignored on receive.".
>> >
>> IMO it is an unfortunate decision in VXLAN to ignore set reserved
>> fields on receive. Had they not done this, then adding a protocol
>> field to the header would have been feasible and we wouldn't need yet
>> another encapsulation protocol (i.e. VXLAN-GPE). Rejecting frames with
>> reserved bits set is the better behavior, but I think the comment
>> about this needs to be clear about why this diverges from RFC7348.
>
> Agreed. Do you want to give it a shot? I know you have been involved
> on all aspects of this topic for a long time in NVO3 ;-)

There is little chance of this happening. Besides the IETF procedural
aspects, ignoring unknown reserved bits on receive is the long
standing standard way of extending protocols. It's how TCP was
extended to support ECN, for example.

>> > Retain the current conservative parsing behaviour by default but allows
>> > these fields to be used by VXLAN extensions which are explicitly enabled
>> > on the VXLAN socket respectively VXLAN net_device.
>> >
>> Conceptually, VXLAN has both mandatory flags and optional flags for
>> extensions. You may want to look at the VXLAN RCO patches that added
>> an extension and infrastructure for them.
>
> I've seen your proposed changes. I think they could be easily rebased
> on top of this and use the enablement infrastructure. In fact, I see
> this as the only feasible option to deal with VXLAN extensions: Leave
> it to the user to decide which extensions to run. That way we can
> support any combination of extensions, even conflicting ones. All we
> have to do is catch incompatible extension configurations on a socket
> and reject them at configuration time. Expecting that NVO3/IETF will
> find consensus on a list of compatible set of extensions with perfect
> forward and backwards compatibility is unrealistic in my eyes. We have
> Geneve and GUE to solve it properly in the future.

My concern is that having multiple (and potentially overlapping)
extensions is going to make the VXLAN code very messy and hard to
follow. I think there's already quite a big of complexity there from
the DOVE extensions (which are basically dead at this point to the
best of my knowledge). We already see this some with the discussion
over the header struct but that's just the beginning - can you imagine
the code that checks a single bit for several different
interpretations? Not to mention the fact that the location of the bits
in some of these proposals has already been shuffled several times. It
seems very painful...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Graf Jan. 7, 2015, 11:24 p.m. UTC | #4
On 01/07/15 at 02:45pm, Jesse Gross wrote:
> My concern is that having multiple (and potentially overlapping)
> extensions is going to make the VXLAN code very messy and hard to
> follow. I think there's already quite a big of complexity there from
> the DOVE extensions (which are basically dead at this point to the
> best of my knowledge). We already see this some with the discussion
> over the header struct but that's just the beginning - can you imagine
> the code that checks a single bit for several different
> interpretations?

I'm not disagreeing with you Jesse. I think it's manageable though.
I don't think the code as proposed is overly difficult to understand
or messy. I'm glad to adjust the struct definition if the union style
is considered messy, no problem.

We have these reserved bits in a widely used protocol and it is our
choice to make use of these bits or not. My opinion is that we should.
Good candidates to me seem to be GBP for security labels, RCO for
checksumming and GPE for non-L2 over VXLAN.

I would love for everybody to switch over to Geneve or GUE and as you
know I'm highly interested in pushing it forward. Reality is that it
might be another couple of years for it to become fully established.

> Not to mention the fact that the location of the bits
> in some of these proposals has already been shuffled several times. It
> seems very painful...

I don't really want to defend the process behind these drafts. I
believe that code defines the standard, in particular open source code.
As for GBP, we are the authors of the draft and this has been deployed
to hardware as well so this won't change.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert Jan. 8, 2015, 12:02 a.m. UTC | #5
On Wed, Jan 7, 2015 at 3:24 PM, Thomas Graf <tgraf@suug.ch> wrote:
> On 01/07/15 at 02:45pm, Jesse Gross wrote:
>> My concern is that having multiple (and potentially overlapping)
>> extensions is going to make the VXLAN code very messy and hard to
>> follow. I think there's already quite a big of complexity there from
>> the DOVE extensions (which are basically dead at this point to the
>> best of my knowledge). We already see this some with the discussion
>> over the header struct but that's just the beginning - can you imagine
>> the code that checks a single bit for several different
>> interpretations?
>
> I'm not disagreeing with you Jesse. I think it's manageable though.
> I don't think the code as proposed is overly difficult to understand
> or messy. I'm glad to adjust the struct definition if the union style
> is considered messy, no problem.
>
> We have these reserved bits in a widely used protocol and it is our
> choice to make use of these bits or not. My opinion is that we should.
> Good candidates to me seem to be GBP for security labels, RCO for
> checksumming and GPE for non-L2 over VXLAN.
>
Do you know how could GPE work with GBP they want to use the same bits
in header for data? Seems like these are mutually exclusive
extensions. RCO should be fine with either :-)

> I would love for everybody to switch over to Geneve or GUE and as you
> know I'm highly interested in pushing it forward. Reality is that it
> might be another couple of years for it to become fully established.
>
>> Not to mention the fact that the location of the bits
>> in some of these proposals has already been shuffled several times. It
>> seems very painful...
>
> I don't really want to defend the process behind these drafts. I
> believe that code defines the standard, in particular open source code.
> As for GBP, we are the authors of the draft and this has been deployed
> to hardware as well so this won't change.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Graf Jan. 8, 2015, 12:14 a.m. UTC | #6
On 01/07/15 at 04:02pm, Tom Herbert wrote:
> Do you know how could GPE work with GBP they want to use the same bits
> in header for data? Seems like these are mutually exclusive
> extensions. RCO should be fine with either :-)

Yes, GBP and GPE are mutually exclusive extensions. Although
GPE would allow to define an intermediate header for additional
metadata, f.e. encoded as GUE or Geneve options.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert Jan. 8, 2015, 12:23 a.m. UTC | #7
On Wed, Jan 7, 2015 at 4:14 PM, Thomas Graf <tgraf@suug.ch> wrote:
> On 01/07/15 at 04:02pm, Tom Herbert wrote:
>> Do you know how could GPE work with GBP they want to use the same bits
>> in header for data? Seems like these are mutually exclusive
>> extensions. RCO should be fine with either :-)
>
> Yes, GBP and GPE are mutually exclusive extensions. Although
> GPE would allow to define an intermediate header for additional
> metadata, f.e. encoded as GUE or Geneve options.

The latest GPE draft uses eight bits for next protocol, so if you make
the security field eight bits it would fit nicely in the octet before
the next protocol.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 2ab0922..4d52aa9 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -65,7 +65,7 @@ 
 #define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
 #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
 
-#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags default value. */
 
 /* UDP port for VXLAN traffic.
  * The IANA assigned port is 4789, but the Linux default is 8472
@@ -1100,22 +1100,28 @@  static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!pskb_may_pull(skb, VXLAN_HLEN))
 		goto error;
 
+	vs = rcu_dereference_sk_user_data(sk);
+	if (!vs)
+		goto drop;
+
 	/* Return packets with reserved bits set */
 	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
-	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
-	    (vxh->vx_vni & htonl(0xff))) {
-		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
-			   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
-		goto error;
+
+	/* For backwards compatibility, only allow reserved fields to be
+	 * used by VXLAN extensions if explicitly requested.
+	 */
+	if (vs->exts) {
+		if (!vxh->vni_present)
+			goto error_invalid_header;
+	} else {
+		if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+		    (vxh->vx_vni & htonl(0xff)))
+			goto error_invalid_header;
 	}
 
 	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
 		goto drop;
 
-	vs = rcu_dereference_sk_user_data(sk);
-	if (!vs)
-		goto drop;
-
 	vs->rcv(vs, skb, vxh->vx_vni);
 	return 0;
 
@@ -1124,6 +1130,9 @@  drop:
 	kfree_skb(skb);
 	return 0;
 
+error_invalid_header:
+	netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
+		   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
 error:
 	/* Return non vxlan pkt */
 	return 1;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 903461a..3e98d31 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -11,10 +11,35 @@ 
 #define VNI_HASH_BITS	10
 #define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
 
-/* VXLAN protocol header */
+/* VXLAN protocol header:
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|I|R|R|R|               Reserved                        |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                VXLAN Network Identifier (VNI) |   Reserved    |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * I = 1	VXLAN Network Identifier (VNI) present
+ */
 struct vxlanhdr {
-	__be32 vx_flags;
-	__be32 vx_vni;
+	union {
+		struct {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+			__u8	reserved_flags1:3,
+				vni_present:1,
+				reserved_flags2:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+			__u8	reserved_flags2:4,
+				vni_present:1,
+				reserved_flags1:3;
+#else
+#error	"Please fix <asm/byteorder.h>"
+#endif
+			__u8	vx_reserved1;
+			__be16	vx_reserved2;
+		};
+		__be32 vx_flags;
+	};
+	__be32	vx_vni;
 };
 
 struct vxlan_sock;
@@ -25,6 +50,7 @@  struct vxlan_sock {
 	struct hlist_node hlist;
 	vxlan_rcv_t	 *rcv;
 	void		 *data;
+	u32		  exts;
 	struct work_struct del_work;
 	struct socket	 *sock;
 	struct rcu_head	  rcu;