diff mbox series

[net-next,03/11] vxlan: Allow configuration of DF behaviour

Message ID 1922d1e13074e73435724523f901f2c97eb3a764.1541533786.git.sbrivio@redhat.com
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series ICMP error handling for UDP tunnels | expand

Commit Message

Stefano Brivio Nov. 6, 2018, 9:38 p.m. UTC
Allow users to set the IPv4 DF bit in outgoing packets, or to inherit its
value from the IPv4 inner header. If the encapsulated protocol is IPv6 and
DF is configured to be inherited, always set it.

For IPv4, inheriting DF from the inner header was probably intended from
the very beginning judging by the comment to vxlan_xmit(), but it wasn't
actually implemented -- also because it would have done more harm than
good, without handling for ICMP Fragmentation Needed messages.

According to RFC 7348, "Path MTU discovery MAY be used". An expired RFC
draft, draft-saum-nvo3-pmtud-over-vxlan-05, whose purpose was to describe
PMTUD implementation, says that "is a MUST that Vxlan gateways [...]
SHOULD set the DF-bit [...]", whatever that means.

Given this background, the only sane option is probably to let the user
decide, and keep the current behaviour as default.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/vxlan.c          | 29 +++++++++++++++++++++++++++++
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  9 +++++++++
 3 files changed, 39 insertions(+)

Comments

Stephen Hemminger Nov. 7, 2018, 5 a.m. UTC | #1
On Tue,  6 Nov 2018 22:38:59 +0100
Stefano Brivio <sbrivio@redhat.com> wrote:

> 			df = htons(IP_DF);
>  		}
>  
> +		if (!df) {
> +			if (vxlan->cfg.df == VXLAN_DF_SET) {
> +				df = htons(IP_DF);

I am confused, this looks like this new flag is duplicating the exiting tunnel DF flag.
(in info->key.tun.flags). Why is another flag needed?
Stefano Brivio Nov. 7, 2018, 10:35 a.m. UTC | #2
Stephen, thanks for reviewing this.

On Tue, 6 Nov 2018 21:00:18 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:

> On Tue,  6 Nov 2018 22:38:59 +0100
> Stefano Brivio <sbrivio@redhat.com> wrote:
> 
> > 			df = htons(IP_DF);
> >  		}
> >  
> > +		if (!df) {
> > +			if (vxlan->cfg.df == VXLAN_DF_SET) {
> > +				df = htons(IP_DF);  
> 
> I am confused, this looks like this new flag is duplicating the exiting tunnel DF flag.
> (in info->key.tun.flags). Why is another flag needed?

The reason is that for non-lwt tunnels the tunnel key is not used in
VXLAN, and this patch is intended for non-lwt tunnels only, as external
control planes already have a way to set the DF bit.

But now I see that the way I wrote this is misleading: this should
really be in an if (rdst) or if (!info) clause. I'll place this into
the if (!info) block just above. TTL and TOS are handled in a similar
way, using the if (rdst) clause above.

Functionally, it's equivalent, because external control planes will
never set non-default values for vxlan->cfg.df, but still not a good
reason to write it this way.

Similar story for GENEVE, I will place that under if
(!geneve->collect_md) instead.

With a notable difference, though: GENEVE already uses struct
ip_tunnel_key for some of its interface configuration, so I had already
thought about adding a TUNNEL_DONT_FRAGMENT_INHERIT flag that could be
used in tun_flags.

However, I would use the last available bit there, and this wouldn't be
terribly useful: an external control plane already has access to the
inner DF bit, and would most likely decide on its own whether to set DF
or not, by setting that in tun_flags. So I'd rather keep that in struct
geneve_dev.
diff mbox series

Patch

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 7706e392b2a7..ccb19b833706 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2289,6 +2289,19 @@  static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			df = htons(IP_DF);
 		}
 
+		if (!df) {
+			if (vxlan->cfg.df == VXLAN_DF_SET) {
+				df = htons(IP_DF);
+			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
+				struct ethhdr *eth = eth_hdr(skb);
+
+				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
+				    (ntohs(eth->h_proto) == ETH_P_IP &&
+				     old_iph->frag_off & htons(IP_DF)))
+					df = htons(IP_DF);
+			}
+		}
+
 		ndst = &rt->dst;
 		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
 
@@ -2837,6 +2850,7 @@  static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
 	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
+	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -2893,6 +2907,16 @@  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
+	if (data[IFLA_VXLAN_DF]) {
+		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
+		if (df < 0 || df > VXLAN_DF_MAX) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF],
+					    "Invalid DF attribute");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -3538,6 +3562,9 @@  static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
 	}
 
+	if (data[IFLA_VXLAN_DF])
+		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
 	return 0;
 }
 
@@ -3630,6 +3657,7 @@  static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
@@ -3696,6 +3724,7 @@  static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
 		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 03431c148e16..ec999c49df1f 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -216,6 +216,7 @@  struct vxlan_config {
 	unsigned long		age_interval;
 	unsigned int		addrmax;
 	bool			no_share;
+	enum ifla_vxlan_df	df;
 };
 
 struct vxlan_dev_node {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 1debfa42cba1..efc588949431 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -533,6 +533,7 @@  enum {
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
+	IFLA_VXLAN_DF,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
@@ -542,6 +543,14 @@  struct ifla_vxlan_port_range {
 	__be16	high;
 };
 
+enum ifla_vxlan_df {
+	VXLAN_DF_UNSET = 0,
+	VXLAN_DF_SET,
+	VXLAN_DF_INHERIT,
+	__VXLAN_DF_END,
+	VXLAN_DF_MAX = __VXLAN_DF_END - 1,
+};
+
 /* GENEVE section */
 enum {
 	IFLA_GENEVE_UNSPEC,