diff mbox

[ovs-dev,v2] VxLAN-gpe implementation

Message ID 1466426203-64974-1-git-send-email-yi.y.yang@intel.com
State Changes Requested
Headers show

Commit Message

Yang, Yi June 20, 2016, 12:36 p.m. UTC
The current Linux kernel git tree has included a VxLAN-GPE implementation

author  Jiri Benc <jbenc@redhat.com>
committer       David S. Miller <davem@davemloft.net>
commit  e1e5314de08ba6003b358125eafc9ad9e75a950c (patch)
tree    1e18cdabf1c9d9ef17e26c6480e629465447f77f /drivers/net/vxlan.c
parent  a6d5bbf34efa8330af7b0b1dba0f38148516ed97 (diff)
vxlan: implement GPE

This patch ports it to OVS so that people can also use VxLAN-GPE
even if they don't replace their kernels with the latest Linux kernel.

Signed-off-by: Johnson Li <johnson.li@intel.com>
Signed-off-by: Yi Yang <yi.y.yang@intel.com>
---
 datapath/linux/compat/include/linux/if_link.h     |   4 +
 datapath/linux/compat/include/linux/openvswitch.h |   1 +
 datapath/linux/compat/include/net/vxlan.h         |  73 ++++
 datapath/linux/compat/vxlan.c                     | 461 ++++++++++++++++++++--
 lib/dpif-netlink.c                                |   5 +
 lib/netdev-vport.c                                |   4 +-
 6 files changed, 512 insertions(+), 36 deletions(-)

Comments

Thadeu Lima de Souza Cascardo June 20, 2016, 1:11 p.m. UTC | #1
On Mon, Jun 20, 2016 at 08:36:43PM +0800, Yi Yang wrote:
> Current Linux kernel git tree has included VxLAN-gpe implementation
> 
> author  Jiri Benc <jbenc@redhat.com>
> committer       David S. Miller <davem@davemloft.net>
> commit  e1e5314de08ba6003b358125eafc9ad9e75a950c (patch)
> tree    1e18cdabf1c9d9ef17e26c6480e629465447f77f /drivers/net/vxlan.c
> parent  a6d5bbf34efa8330af7b0b1dba0f38148516ed97 (diff)
> vxlan: implement GPE
> 
> This patch is to port it to ovs in order that people also can use VxLAN-gpe
> even if they don't replace their kernels with latest Linux kernel.
> 
> Signed-off-by: Johnson Li <johnson.li@intel.com>
> Signed-off-by: Yi Yang <yi.y.yang@intel.com>


Hi, Yi Yang.

Before adding the OVS_VXLAN_EXT_GPE extension to the out-of-tree module, you
should send it to the mainline kernel. Besides, you need a very good
justification why you can't wait for my patchset to be accepted and have
VXLAN-GPE enabled using rtnetlink.

Also, I would split any changes to the datapath and userspace parts of the code
into multiple commits.

Meanwhile, you could backport only the upstreamed portions of VXLAN-GPE and send
that as a single commit, no userspace changes.

Cascardo.

> ---
>  datapath/linux/compat/include/linux/if_link.h     |   4 +
>  datapath/linux/compat/include/linux/openvswitch.h |   1 +
>  datapath/linux/compat/include/net/vxlan.h         |  73 ++++
>  datapath/linux/compat/vxlan.c                     | 461 ++++++++++++++++++++--
>  lib/dpif-netlink.c                                |   5 +
>  lib/netdev-vport.c                                |   4 +-
>  6 files changed, 512 insertions(+), 36 deletions(-)
> 
> diff --git a/datapath/linux/compat/include/linux/if_link.h b/datapath/linux/compat/include/linux/if_link.h
> index 6209dcb..de87769 100644
> --- a/datapath/linux/compat/include/linux/if_link.h
> +++ b/datapath/linux/compat/include/linux/if_link.h
> @@ -100,6 +100,10 @@ enum {
>  	IFLA_VXLAN_REMCSUM_NOPARTIAL,
>  #define IFLA_VXLAN_COLLECT_METADATA rpl_IFLA_VXLAN_COLLECT_METADATA
>  	IFLA_VXLAN_COLLECT_METADATA,
> +#define IFLA_VXLAN_LABEL rpl_IFLA_VXLAN_LABEL
> +        IFLA_VXLAN_LABEL,
> +#define IFLA_VXLAN_GPE rpl_IFLA_VXLAN_GPE
> +        IFLA_VXLAN_GPE,
>  #define __IFLA_VXLAN_MAX rpl___IFLA_VXLAN_MAX
>  	__IFLA_VXLAN_MAX
>  };
> diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
> index edfa7a1..761d9c6 100644
> --- a/datapath/linux/compat/include/linux/openvswitch.h
> +++ b/datapath/linux/compat/include/linux/openvswitch.h
> @@ -287,6 +287,7 @@ enum ovs_vport_attr {
>  enum {
>  	OVS_VXLAN_EXT_UNSPEC,
>  	OVS_VXLAN_EXT_GBP,      /* Flag or __u32 */
> +	OVS_VXLAN_EXT_GPE,      /* Flag, Generic Protocol Extension */
>  	__OVS_VXLAN_EXT_MAX,
>  };
>  
> diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h
> index 75a5a7a..b3f45c4 100644
> --- a/datapath/linux/compat/include/net/vxlan.h
> +++ b/datapath/linux/compat/include/net/vxlan.h
> @@ -84,6 +84,66 @@ struct vxlanhdr_gbp {
>  #define VXLAN_GBP_POLICY_APPLIED	(BIT(3) << 16)
>  #define VXLAN_GBP_ID_MASK		(0xFFFF)
>  
> +/*
> + * VXLAN Generic Protocol Extension (VXLAN_F_GPE):
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |R|R|Ver|I|P|R|O|       Reserved                |Next Protocol  |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |                VXLAN Network Identifier (VNI) |   Reserved    |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *
> + * Ver = Version. Indicates VXLAN GPE protocol version.
> + *
> + * P = Next Protocol Bit. The P bit is set to indicate that the
> + *     Next Protocol field is present.
> + *
> + * O = OAM Flag Bit. The O bit is set to indicate that the packet
> + *     is an OAM packet.
> + *
> + * Next Protocol = This 8 bit field indicates the protocol header
> + * immediately following the VXLAN GPE header.
> + *
> + * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01
> + */
> +
> +struct vxlanhdr_gpe {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +       u8      oam_flag:1,
> +               reserved_flags1:1,
> +               np_applied:1,
> +               instance_applied:1,
> +               version:2,
> +reserved_flags2:2;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +       u8      reserved_flags2:2,
> +               version:2,
> +               instance_applied:1,
> +               np_applied:1,
> +               reserved_flags1:1,
> +               oam_flag:1;
> +#endif
> +       u8      reserved_flags3;
> +       u8      reserved_flags4;
> +       u8      next_protocol;
> +       __be32  vx_vni;
> +};
> +
> +/* VXLAN-GPE header flags. */
> +#define VXLAN_HF_VER   (BIT(29) | BIT(28))
> +#define VXLAN_HF_NP    (BIT(26))
> +#define VXLAN_HF_OAM   (BIT(24))
> +#define VXLAN_HF_GPE   (BIT(26))
> +
> +#define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \
> +                            (0xFF))
> +
> +/* VXLAN-GPE header Next Protocol. */
> +#define VXLAN_GPE_NP_IPV4      0x01
> +#define VXLAN_GPE_NP_IPV6      0x02
> +#define VXLAN_GPE_NP_ETHERNET  0x03
> +#define VXLAN_GPE_NP_NSH       0x04
> +#define ETH_P_NSH              0x894f
> +
>  /* VXLAN protocol header:
>   * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>   * |G|R|R|R|I|R|R|C|               Reserved                        |
> @@ -167,6 +227,7 @@ struct vxlan_config {
>  	__u16			port_max;
>  	__u8			tos;
>  	__u8			ttl;
> +	__be32                  label;
>  	u32			flags;
>  	unsigned long		age_interval;
>  	unsigned int		addrmax;
> @@ -205,15 +266,27 @@ struct vxlan_dev {
>  #define VXLAN_F_GBP			0x800
>  #define VXLAN_F_REMCSUM_NOPARTIAL	0x1000
>  #define VXLAN_F_COLLECT_METADATA	0x2000
> +#define VXLAN_F_GPE                     0x4000
> +#define VXLAN_F_UDP_ZERO_CSUM_TX VXLAN_F_UDP_CSUM
>  
>  /* Flags that are used in the receive path. These flags must match in
>   * order for a socket to be shareable
>   */
>  #define VXLAN_F_RCV_FLAGS		(VXLAN_F_GBP |			\
> +                                         VXLAN_F_GPE |                  \
>  					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
>  					 VXLAN_F_REMCSUM_RX |		\
>  					 VXLAN_F_REMCSUM_NOPARTIAL |	\
>  					 VXLAN_F_COLLECT_METADATA)
> +
> +/* Flags that can be set together with VXLAN_F_GPE. */
> +#define VXLAN_F_ALLOWED_GPE             (VXLAN_F_GPE |                  \
> +                                         VXLAN_F_IPV6 |                 \
> +                                         VXLAN_F_UDP_CSUM |     \
> +                                         VXLAN_F_UDP_ZERO_CSUM6_TX |    \
> +                                         VXLAN_F_UDP_ZERO_CSUM6_RX |    \
> +                                         VXLAN_F_COLLECT_METADATA)
> +
>  #define vxlan_dev_create rpl_vxlan_dev_create
>  struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name,
>  				    u8 name_assign_type, struct vxlan_config *conf);
> diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
> index 4faa18f..570d2d9 100644
> --- a/datapath/linux/compat/vxlan.c
> +++ b/datapath/linux/compat/vxlan.c
> @@ -812,6 +812,45 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
>  }
>  #endif
>  
> +static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
> +			       __be32 *protocol,
> +			       struct sk_buff *skb, u32 vxflags)
> +{
> +       struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;
> +
> +       /* Need to have Next Protocol set for interfaces in GPE mode. */
> +       if (!gpe->np_applied)
> +	       return false;
> +       /* "The initial version is 0. If a receiver does not support the
> +	* version indicated it MUST drop the packet.
> +	*/
> +       if (gpe->version != 0)
> +	       return false;
> +       /* "When the O bit is set to 1, the packet is an OAM packet and OAM
> +	* processing MUST occur." However, we don't implement OAM
> +	* processing, thus drop the packet.
> +	*/
> +       if (gpe->oam_flag)
> +	       return false;
> +
> +       switch (gpe->next_protocol) {
> +       case VXLAN_GPE_NP_IPV4:
> +	       *protocol = htons(ETH_P_IP);
> +	       break;
> +       case VXLAN_GPE_NP_IPV6:
> +	       *protocol = htons(ETH_P_IPV6);
> +	       break;
> +       case VXLAN_GPE_NP_ETHERNET:
> +	       *protocol = htons(ETH_P_TEB);
> +	       break;
> +       default:
> +	       return false;
> +       }
> +
> +       unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
> +       return true;
> +}
> +
>  static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
>  		      struct vxlan_metadata *md, u32 vni,
>  		      struct metadata_dst *tun_dst)
> @@ -822,6 +861,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
>  	struct pcpu_sw_netstats *stats;
>  	union vxlan_addr saddr;
>  	int err = 0;
> +	struct vxlanhdr unparsed;
> +	__be32 protocol = htons(ETH_P_TEB);
> +	bool raw_proto = false;
>  
>  	/* For flow based devices, map all packets to VNI 0 */
>  	if (vs->flags & VXLAN_F_COLLECT_METADATA)
> @@ -832,14 +874,35 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
>  	if (!vxlan)
>  		goto drop;
>  
> -	skb_reset_mac_header(skb);
> -	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
> -	skb->protocol = eth_type_trans(skb, vxlan->dev);
> -	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> +	/* For backwards compatibility, only allow reserved fields to be
> +	 * used by VXLAN extensions if explicitly requested.
> +	 */
> +	if (vs->flags & VXLAN_F_GPE) {
> +		unparsed = *(struct vxlanhdr *)(udp_hdr(skb) + 1);
> +		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
> +			goto drop;
> +		if (protocol != htons(ETH_P_TEB)) {
> +		    raw_proto = true;
> +		}
> +	}
>  
> -	/* Ignore packet loops (and multicast echo) */
> -	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
> -		goto drop;
> +	if (!raw_proto) {
> +		skb_reset_mac_header(skb);
> +		skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
> +		skb->protocol = eth_type_trans(skb, vxlan->dev);
> +		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> +
> +		/* Ignore packet loops (and multicast echo) */
> +		if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
> +			goto drop;
> +
> +		if ((vxlan->flags & VXLAN_F_LEARN) &&
> +		    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
> +			goto drop;
> +	} else {
> +		skb->dev = vxlan->dev;
> +		skb->pkt_type = PACKET_HOST;
> +	}
>  
>  	/* Get data from the outer IP header */
>  	if (vxlan_get_sk_family(vs) == AF_INET) {
> @@ -861,10 +924,6 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
>  		goto drop;
>  	}
>  
> -	if ((vxlan->flags & VXLAN_F_LEARN) &&
> -	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
> -		goto drop;
> -
>  	skb_reset_network_header(skb);
>  	/* In flow-based mode, GBP is carried in dst_metadata */
>  	if (!(vs->flags & VXLAN_F_COLLECT_METADATA))
> @@ -908,6 +967,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>  		struct metadata_dst dst;
>  		char buf[sizeof(struct metadata_dst) + sizeof(*md)];
>  	} buf;
> +	struct vxlanhdr unparsed;
> +	__be32 protocol = htons(ETH_P_TEB);
>  
>  	/* Need Vxlan and inner Ethernet header to be present */
>  	if (!pskb_may_pull(skb, VXLAN_HLEN))
> @@ -924,14 +985,25 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>  		goto bad_flags;
>  	}
>  
> -	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
> -		goto drop;
> -	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> -
>  	vs = rcu_dereference_sk_user_data(sk);
>  	if (!vs)
>  		goto drop;
>  
> +	/* For backwards compatibility, only allow reserved fields to be
> +	 * used by VXLAN extensions if explicitly requested.
> +	 */
> +	if (vs->flags & VXLAN_F_GPE) {
> +		unparsed = *(struct vxlanhdr *)(udp_hdr(skb) + 1);
> +		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
> +			goto drop;
> +		buf.dst.u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
> +		flags &= ~VXLAN_GPE_USED_BITS;
> +	}
> +
> +	if (iptunnel_pull_header(skb, VXLAN_HLEN, protocol))
> +		goto drop;
> +	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> +
>  #ifdef HAVE_VXLAN_HF_RCO
>  	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
>  		vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
> @@ -1023,6 +1095,33 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
>  	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
>  }
>  
> +static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
> +			       __be16 protocol)
> +{
> +	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
> +
> +	vxh->vx_flags |= htonl(VXLAN_HF_GPE);
> +	gpe->np_applied = 1;
> +	gpe->version = 0;
> +	gpe->oam_flag = 0;
> +
> +	switch (protocol) {
> +	case htons(ETH_P_IP):
> +		gpe->next_protocol = VXLAN_GPE_NP_IPV4;
> +		return 0;
> +	case htons(ETH_P_IPV6):
> +		gpe->next_protocol = VXLAN_GPE_NP_IPV6;
> +		return 0;
> +	case htons(ETH_P_TEB):
> +		gpe->next_protocol = VXLAN_GPE_NP_ETHERNET;
> +		return 0;
> +	case htons(ETH_P_NSH):
> +		gpe->next_protocol = VXLAN_GPE_NP_NSH;
> +		return 0;
> +	}
> +	return -EPFNOSUPPORT;
> +}
> +
>  #if IS_ENABLED(CONFIG_IPV6)
>  static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
>  			   struct sk_buff *skb,
> @@ -1036,6 +1135,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
>  	int err;
>  	bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX);
>  	int type = 0;
> +	__be16 inner_protocol = htons(ETH_P_TEB);
>  
>  	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
>  	    skb->ip_summed == CHECKSUM_PARTIAL) {
> @@ -1106,8 +1206,14 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
>  
>  	if (vxflags & VXLAN_F_GBP)
>  		vxlan_build_gbp_hdr(vxh, vxflags, md);
> +	if (vxflags & VXLAN_F_GPE) {
> +		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
> +		if (err < 0)
> +			goto err;
> +		inner_protocol = skb->protocol;
> +	}
>  
> -	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +	ovs_skb_set_inner_protocol(skb, inner_protocol);
>  
>  	udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio,
>  			     ttl, src_port, dst_port,
> @@ -1129,6 +1235,7 @@ static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk
>  	int err;
>  	bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM);
>  	int type = 0;
> +	__be16 inner_protocol = htons(ETH_P_TEB);
>  
>  	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
>  	    skb->ip_summed == CHECKSUM_PARTIAL) {
> @@ -1191,8 +1298,14 @@ static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk
>  	}
>  	if (vxflags & VXLAN_F_GBP)
>  		vxlan_build_gbp_hdr(vxh, vxflags, md);
> +	if (vxflags & VXLAN_F_GPE) {
> +		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
> +		if (err < 0)
> +			return err;
> +		inner_protocol = skb->protocol;
> +	}
>  
> -	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +	ovs_skb_set_inner_protocol(skb, inner_protocol);
>  
>  	return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,
>  				   ttl, df, src_port, dst_port, xnet,
> @@ -1419,7 +1532,7 @@ tx_free:
>   *
>   * Outer IP header inherits ECN and DF from inner header.
>   * Outer UDP destination is the VXLAN assigned port.
> - *           source port is based on hash of flow
> + *	   source port is based on hash of flow
>   */
>  netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
>  {
> @@ -1648,7 +1761,7 @@ static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev)
>  	return NETDEV_TX_OK;
>  }
>  
> -static const struct net_device_ops vxlan_netdev_ops = {
> +static const struct net_device_ops vxlan_netdev_ether_ops = {
>  	.ndo_init		= vxlan_init,
>  	.ndo_uninit		= vxlan_uninit,
>  	.ndo_get_stats64	= ip_tunnel_get_stats64,
> @@ -1661,6 +1774,16 @@ static const struct net_device_ops vxlan_netdev_ops = {
>  	.ndo_set_mac_address	= eth_mac_addr,
>  };
>  
> +static const struct net_device_ops vxlan_netdev_raw_ops = {
> +	.ndo_init		= vxlan_init,
> +	.ndo_uninit		= vxlan_uninit,
> +	.ndo_get_stats64	= ip_tunnel_get_stats64,
> +	.ndo_open		= vxlan_open,
> +	.ndo_stop		= vxlan_stop,
> +	.ndo_start_xmit		= vxlan_dev_xmit,
> +	.ndo_change_mtu		= vxlan_change_mtu,
> +};
> +
>  /* Info for udev, that this is a virtual tunnel endpoint */
>  static struct device_type vxlan_type = {
>  	.name = "vxlan",
> @@ -1675,7 +1798,7 @@ static void vxlan_setup(struct net_device *dev)
>  	eth_hw_addr_random(dev);
>  	ether_setup(dev);
>  
> -	dev->netdev_ops = &vxlan_netdev_ops;
> +	dev->netdev_ops = &vxlan_netdev_ether_ops;
>  	dev->destructor = free_netdev;
>  	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
>  
> @@ -1712,8 +1835,51 @@ static void vxlan_setup(struct net_device *dev)
>  		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
>  }
>  
> +static void vxlan_ether_setup(struct net_device *dev)
> +{
> +	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
> +	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
> +	dev->netdev_ops = &vxlan_netdev_ether_ops;
> +}
> +
> +static void vxlan_raw_setup(struct net_device *dev)
> +{
> +	dev->header_ops = NULL;
> +	dev->type = ARPHRD_NONE;
> +	dev->hard_header_len = 0;
> +	dev->addr_len = 0;
> +	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
> +	dev->netdev_ops = &vxlan_netdev_raw_ops;
> +}
> +
>  static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
> -	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
> +	[IFLA_VXLAN_ID]	 = { .type = NLA_U32 },
> +	[IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
> +	[IFLA_VXLAN_GROUP6]     = { .len = sizeof(struct in6_addr) },
> +	[IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
> +	[IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
> +	[IFLA_VXLAN_LOCAL6]     = { .len = sizeof(struct in6_addr) },
> +	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
> +	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
> +	[IFLA_VXLAN_LABEL]      = { .type = NLA_U32 },
> +	[IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
> +	[IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
> +	[IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
> +	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
> +	[IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
> +	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
> +	[IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
> +	[IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
> +	[IFLA_VXLAN_COLLECT_METADATA]   = { .type = NLA_U8 },
> +	[IFLA_VXLAN_PORT]       = { .type = NLA_U16 },
> +	[IFLA_VXLAN_UDP_CSUM]   = { .type = NLA_U8 },
> +	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]  = { .type = NLA_U8 },
> +	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]  = { .type = NLA_U8 },
> +	[IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
> +	[IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
> +	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
> +	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
> +	[IFLA_VXLAN_REMCSUM_NOPARTIAL]  = { .type = NLA_FLAG },
>  };
>  
>  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
> @@ -1897,6 +2063,21 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
>  	__be16 default_port = vxlan->cfg.dst_port;
>  	struct net_device *lowerdev = NULL;
>  
> +	if (conf->flags & VXLAN_F_GPE) {
> +		if (conf->flags & ~VXLAN_F_ALLOWED_GPE)
> +			return -EINVAL;
> +		/* For now, allow GPE only together with COLLECT_METADATA.
> +		 * This can be relaxed later; in such case, the other side
> +		 * of the PtP link will have to be provided.
> +		 */
> +		if (!(conf->flags & VXLAN_F_COLLECT_METADATA))
> +			return -EINVAL;
> +
> +		vxlan_raw_setup(dev);
> +	} else {
> +		vxlan_ether_setup(dev);
> +	}
> +
>  	vxlan->net = src_net;
>  
>  	dst->remote_vni = conf->vni;
> @@ -2023,7 +2204,136 @@ static int vxlan_newlink(struct net_device *dev,
>  			 struct nlattr *tb[], struct nlattr *data[])
>  #endif
>  {
> -	return -EINVAL;
> +	struct vxlan_config conf;
> +	int err;
> +
> +	memset(&conf, 0, sizeof(conf));
> +
> +	if (data[IFLA_VXLAN_ID])
> +		conf.vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
> +
> +	if (data[IFLA_VXLAN_GROUP]) {
> +		conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
> +	} else if (data[IFLA_VXLAN_GROUP6]) {
> +		if (!IS_ENABLED(CONFIG_IPV6))
> +			return -EPFNOSUPPORT;
> +
> +		conf.remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
> +		conf.remote_ip.sa.sa_family = AF_INET6;
> +	}
> +
> +	if (data[IFLA_VXLAN_LOCAL]) {
> +		conf.saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
> +		conf.saddr.sa.sa_family = AF_INET;
> +	} else if (data[IFLA_VXLAN_LOCAL6]) {
> +		if (!IS_ENABLED(CONFIG_IPV6))
> +			return -EPFNOSUPPORT;
> +
> +		/* TODO: respect scope id */
> +		conf.saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
> +		conf.saddr.sa.sa_family = AF_INET6;
> +	}
> +
> +	if (data[IFLA_VXLAN_LINK])
> +		conf.remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
> +
> +	if (data[IFLA_VXLAN_TOS])
> +		conf.tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
> +
> +	if (data[IFLA_VXLAN_TTL])
> +		conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
> +
> +	if (data[IFLA_VXLAN_LABEL])
> +		conf.label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
> +			     IPV6_FLOWLABEL_MASK;
> +
> +	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
> +		conf.flags |= VXLAN_F_LEARN;
> +
> +	if (data[IFLA_VXLAN_AGEING])
> +		conf.age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
> +
> +	if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
> +		conf.flags |= VXLAN_F_PROXY;
> +
> +	if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
> +		conf.flags |= VXLAN_F_RSC;
> +
> +	if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
> +		conf.flags |= VXLAN_F_L2MISS;
> +
> +	if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
> +		conf.flags |= VXLAN_F_L3MISS;
> +
> +	if (data[IFLA_VXLAN_LIMIT])
> +		conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
> +
> +	if (data[IFLA_VXLAN_COLLECT_METADATA] &&
> +	    nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA]))
> +		conf.flags |= VXLAN_F_COLLECT_METADATA;
> +
> +	if (data[IFLA_VXLAN_PORT_RANGE]) {
> +		const struct ifla_vxlan_port_range *p
> +			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
> +		conf.port_min = ntohs(p->low);
> +		conf.port_max = ntohs(p->high);
> +	}
> +
> +	if (data[IFLA_VXLAN_PORT])
> +		conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
> +
> +	if (data[IFLA_VXLAN_UDP_CSUM] &&
> +	    !nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
> +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
> +
> +	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
> +	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
> +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
> +
> +	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] &&
> +	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
> +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
> +
> +	if (data[IFLA_VXLAN_REMCSUM_TX] &&
> +	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
> +		conf.flags |= VXLAN_F_REMCSUM_TX;
> +
> +	if (data[IFLA_VXLAN_REMCSUM_RX] &&
> +	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
> +		conf.flags |= VXLAN_F_REMCSUM_RX;
> +
> +	if (data[IFLA_VXLAN_GBP])
> +		conf.flags |= VXLAN_F_GBP;
> +
> +	if (data[IFLA_VXLAN_GPE])
> +		conf.flags |= VXLAN_F_GPE;
> +
> +	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
> +		conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
> +
> +	if (tb[IFLA_MTU])
> +		conf.mtu = nla_get_u32(tb[IFLA_MTU]);
> +
> +	err = vxlan_dev_configure(src_net, dev, &conf);
> +	switch (err) {
> +	case -ENODEV:
> +		pr_info("ifindex %d does not exist\n", conf.remote_ifindex);
> +		break;
> +
> +	case -EPERM:
> +		pr_info("IPv6 is disabled via sysctl\n");
> +		break;
> +
> +	case -EEXIST:
> +		pr_info("duplicate VNI %u\n", be32_to_cpu(conf.vni));
> +		break;
> +
> +	case -EINVAL:
> +		pr_info("unsupported combination of extensions\n");
> +		break;
> +	}
> +
> +	return err;
>  }
>  
>  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
> @@ -2047,20 +2357,21 @@ static void vxlan_dellink(struct net_device *dev)
>  static size_t vxlan_get_size(const struct net_device *dev)
>  {
>  
> -	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
> +	return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
>  		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
> -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
> +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
>  		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
> -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
> -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
> +		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_COLLECT_METADATA */
> +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
> +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
>  		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
>  		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
>  		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
> @@ -2074,8 +2385,88 @@ static size_t vxlan_get_size(const struct net_device *dev)
>  static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
>  {
>  	const struct vxlan_dev *vxlan = netdev_priv(dev);
> +	const struct vxlan_rdst *dst = &vxlan->default_dst;
> +	struct ifla_vxlan_port_range ports = {
> +		.low =  htons(vxlan->cfg.port_min),
> +		.high = htons(vxlan->cfg.port_max),
> +	};
> +
> +	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
> +		goto nla_put_failure;
> +
> +	if (!vxlan_addr_any(&dst->remote_ip)) {
> +		if (dst->remote_ip.sa.sa_family == AF_INET) {
> +			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
> +					    dst->remote_ip.sin.sin_addr.s_addr))
> +				goto nla_put_failure;
> +#if IS_ENABLED(CONFIG_IPV6)
> +		} else {
> +			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
> +					     &dst->remote_ip.sin6.sin6_addr))
> +				goto nla_put_failure;
> +#endif
> +		}
> +	}
> +
> +	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
> +		goto nla_put_failure;
> +
> +	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
> +		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
> +			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
> +					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
> +				goto nla_put_failure;
> +#if IS_ENABLED(CONFIG_IPV6)
> +		} else {
> +			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
> +					     &vxlan->cfg.saddr.sin6.sin6_addr))
> +				goto nla_put_failure;
> +#endif
> +		}
> +	}
> +
> +	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
> +	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
> +			!!(vxlan->flags & VXLAN_F_LEARN)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
> +			!!(vxlan->flags & VXLAN_F_PROXY)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
> +			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
> +			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
> +		       !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) ||
> +	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
> +	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
> +	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
> +			!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
> +			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
> +			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
> +			!!(vxlan->flags & VXLAN_F_REMCSUM_TX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
> +			!!(vxlan->flags & VXLAN_F_REMCSUM_RX)))
> +		goto nla_put_failure;
> +
> +	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
> +		goto nla_put_failure;
> +
> +	if (vxlan->flags & VXLAN_F_GBP &&
> +	    nla_put_flag(skb, IFLA_VXLAN_GBP))
> +		goto nla_put_failure;
> +
> +	if (vxlan->flags & VXLAN_F_GPE &&
> +	    nla_put_flag(skb, IFLA_VXLAN_GPE))
> +		goto nla_put_failure;
>  
> -	if (nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port))
> +	if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL &&
> +	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
>  		goto nla_put_failure;
>  
>  	return 0;
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 1e88c13..2b07e54 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -988,6 +988,8 @@ netdev_geneve_destroy(const char *name)
>  #define IFLA_VXLAN_UDP_ZERO_CSUM6_RX 20
>  #define IFLA_VXLAN_GBP 23
>  #define IFLA_VXLAN_COLLECT_METADATA 25
> +#define IFLA_VXLAN_LABEL 26
> +#define IFLA_VXLAN_GPE 27
>  #endif
>  
>  #if IFLA_GRE_MAX < 18
> @@ -1037,6 +1039,9 @@ netdev_vxlan_create(struct netdev *netdev)
>              if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP)) {
>                  nl_msg_put_flag(&request, IFLA_VXLAN_GBP);
>              }
> +            else if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GPE)) {
> +                nl_msg_put_flag(&request, IFLA_VXLAN_GPE);
> +            }
>              nl_msg_put_be16(&request, IFLA_VXLAN_PORT, tnl_cfg->dst_port);
>          nl_msg_end_nested(&request, infodata_off);
>      nl_msg_end_nested(&request, linkinfo_off);
> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> index ec5c44e..fa56af5 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
> @@ -541,7 +541,9 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args)
>              while (ext) {
>                  if (!strcmp(type, "vxlan") && !strcmp(ext, "gbp")) {
>                      tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GBP);
> -                } else {
> +                } else if (!strcmp(type, "vxlan") && !strcmp(ext, "gpe")) {
> +                     tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GPE);
> +		} else {
>                      VLOG_WARN("%s: unknown extension '%s'", name, ext);
>                  }
>  
> -- 
> 1.9.3
>
Yang, Yi June 21, 2016, 1:39 a.m. UTC | #2
On Mon, Jun 20, 2016 at 10:11:14AM -0300, Thadeu Lima de Souza Cascardo wrote:
> On Mon, Jun 20, 2016 at 08:36:43PM +0800, Yi Yang wrote:
> > Current Linux kernel git tree has included VxLAN-gpe implementation
> > 
> > author  Jiri Benc <jbenc@redhat.com>
> > committer       David S. Miller <davem@davemloft.net>
> > commit  e1e5314de08ba6003b358125eafc9ad9e75a950c (patch)
> > tree    1e18cdabf1c9d9ef17e26c6480e629465447f77f /drivers/net/vxlan.c
> > parent  a6d5bbf34efa8330af7b0b1dba0f38148516ed97 (diff)
> > vxlan: implement GPE
> > 
> > This patch is to port it to ovs in order that people also can use VxLAN-gpe
> > even if they don't replace their kernels with latest Linux kernel.
> > 
> > Signed-off-by: Johnson Li <johnson.li@intel.com>
> > Signed-off-by: Yi Yang <yi.y.yang@intel.com>
> 
> 
> Hi, Yi Yang.
> 
> Before adding the OVS_VXLAN_EXT_GPE extension to the out-of-tree module, you
> should send it to the mainline kernel. Besides, you need a very good
> justification why you can't wait for my patchset to be accepted and have
> VXLAN-GPE enabled using rtnetlink.

I will add OVS_VXLAN_EXT_GPE to include/uapi/linux/openvswitch.h and send a
kernel patch; meanwhile, OVS and the net-next kernel can also work together.

This patch depends on your patch set and Simon's patch set, and we look
forward to seeing your patch set merged as soon as possible. The
intention of sending this patch now is to get your comments as early as
possible so that we can accelerate the NSH support merging process.
Users are very eager to see NSH support merged in OVS.

> 
> Also, I would split any changes to the datapath and userspace parts of the code
> into multiple commits.
> 
> Meanwhile, you could backport only the upstreamed portions of VXLAN-GPE and send
> that as a single commit, no userspace changes.

The VxLAN-gpe part depends on some new changes in the vxlan kernel
module, so we have to backport them into OVS; this patch is already
limited to VxLAN-gpe only. The current OVS vxlan code is much older than
the vxlan kernel module. Jesse mentioned that we should backport all the
new changes in the vxlan kernel module, but that effort would be very
large, and OVS vxlan needs to stay compatible with some old kernels,
which means the backporting must be done very carefully. Jesse also
mentioned that Pravin is backporting them. This patch is just to make
sure your patch set, Simon's patch set, and VxLAN-gpe can work correctly
as expected.

You can take the netlink part and merge it into your next patch set
version if needed. The changes are not big; our email system tends to
lose some patches when multiple patches are sent, so we prefer a single
patch containing all the changes.
> 
> Cascardo.
> 
> > ---
> >  datapath/linux/compat/include/linux/if_link.h     |   4 +
> >  datapath/linux/compat/include/linux/openvswitch.h |   1 +
> >  datapath/linux/compat/include/net/vxlan.h         |  73 ++++
> >  datapath/linux/compat/vxlan.c                     | 461 ++++++++++++++++++++--
> >  lib/dpif-netlink.c                                |   5 +
> >  lib/netdev-vport.c                                |   4 +-
> >  6 files changed, 512 insertions(+), 36 deletions(-)
> > 
> > diff --git a/datapath/linux/compat/include/linux/if_link.h b/datapath/linux/compat/include/linux/if_link.h
> > index 6209dcb..de87769 100644
> > --- a/datapath/linux/compat/include/linux/if_link.h
> > +++ b/datapath/linux/compat/include/linux/if_link.h
> > @@ -100,6 +100,10 @@ enum {
> >  	IFLA_VXLAN_REMCSUM_NOPARTIAL,
> >  #define IFLA_VXLAN_COLLECT_METADATA rpl_IFLA_VXLAN_COLLECT_METADATA
> >  	IFLA_VXLAN_COLLECT_METADATA,
> > +#define IFLA_VXLAN_LABEL rpl_IFLA_VXLAN_LABEL
> > +        IFLA_VXLAN_LABEL,
> > +#define IFLA_VXLAN_GPE rpl_IFLA_VXLAN_GPE
> > +        IFLA_VXLAN_GPE,
> >  #define __IFLA_VXLAN_MAX rpl___IFLA_VXLAN_MAX
> >  	__IFLA_VXLAN_MAX
> >  };
> > diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
> > index edfa7a1..761d9c6 100644
> > --- a/datapath/linux/compat/include/linux/openvswitch.h
> > +++ b/datapath/linux/compat/include/linux/openvswitch.h
> > @@ -287,6 +287,7 @@ enum ovs_vport_attr {
> >  enum {
> >  	OVS_VXLAN_EXT_UNSPEC,
> >  	OVS_VXLAN_EXT_GBP,      /* Flag or __u32 */
> > +	OVS_VXLAN_EXT_GPE,      /* Flag, Generic Protocol Extension */
> >  	__OVS_VXLAN_EXT_MAX,
> >  };
> >  
> > diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h
> > index 75a5a7a..b3f45c4 100644
> > --- a/datapath/linux/compat/include/net/vxlan.h
> > +++ b/datapath/linux/compat/include/net/vxlan.h
> > @@ -84,6 +84,66 @@ struct vxlanhdr_gbp {
> >  #define VXLAN_GBP_POLICY_APPLIED	(BIT(3) << 16)
> >  #define VXLAN_GBP_ID_MASK		(0xFFFF)
> >  
> > +/*
> > + * VXLAN Generic Protocol Extension (VXLAN_F_GPE):
> > + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> > + * |R|R|Ver|I|P|R|O|       Reserved                |Next Protocol  |
> > + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> > + * |                VXLAN Network Identifier (VNI) |   Reserved    |
> > + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> > + *
> > + * Ver = Version. Indicates VXLAN GPE protocol version.
> > + *
> > + * P = Next Protocol Bit. The P bit is set to indicate that the
> > + *     Next Protocol field is present.
> > + *
> > + * O = OAM Flag Bit. The O bit is set to indicate that the packet
> > + *     is an OAM packet.
> > + *
> > + * Next Protocol = This 8 bit field indicates the protocol header
> > + * immediately following the VXLAN GPE header.
> > + *
> > + * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01
> > + */
> > +
> > +struct vxlanhdr_gpe {
> > +#if defined(__LITTLE_ENDIAN_BITFIELD)
> > +       u8      oam_flag:1,
> > +               reserved_flags1:1,
> > +               np_applied:1,
> > +               instance_applied:1,
> > +               version:2,
> > +reserved_flags2:2;
> > +#elif defined(__BIG_ENDIAN_BITFIELD)
> > +       u8      reserved_flags2:2,
> > +               version:2,
> > +               instance_applied:1,
> > +               np_applied:1,
> > +               reserved_flags1:1,
> > +               oam_flag:1;
> > +#endif
> > +       u8      reserved_flags3;
> > +       u8      reserved_flags4;
> > +       u8      next_protocol;
> > +       __be32  vx_vni;
> > +};
> > +
> > +/* VXLAN-GPE header flags. */
> > +#define VXLAN_HF_VER   (BIT(29) | BIT(28))
> > +#define VXLAN_HF_NP    (BIT(26))
> > +#define VXLAN_HF_OAM   (BIT(24))
> > +#define VXLAN_HF_GPE   (BIT(26))
> > +
> > +#define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \
> > +                            (0xFF))
> > +
> > +/* VXLAN-GPE header Next Protocol. */
> > +#define VXLAN_GPE_NP_IPV4      0x01
> > +#define VXLAN_GPE_NP_IPV6      0x02
> > +#define VXLAN_GPE_NP_ETHERNET  0x03
> > +#define VXLAN_GPE_NP_NSH       0x04
> > +#define ETH_P_NSH              0x894f
> > +
> >  /* VXLAN protocol header:
> >   * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> >   * |G|R|R|R|I|R|R|C|               Reserved                        |
> > @@ -167,6 +227,7 @@ struct vxlan_config {
> >  	__u16			port_max;
> >  	__u8			tos;
> >  	__u8			ttl;
> > +	__be32                  label;
> >  	u32			flags;
> >  	unsigned long		age_interval;
> >  	unsigned int		addrmax;
> > @@ -205,15 +266,27 @@ struct vxlan_dev {
> >  #define VXLAN_F_GBP			0x800
> >  #define VXLAN_F_REMCSUM_NOPARTIAL	0x1000
> >  #define VXLAN_F_COLLECT_METADATA	0x2000
> > +#define VXLAN_F_GPE                     0x4000
> > +#define VXLAN_F_UDP_ZERO_CSUM_TX VXLAN_F_UDP_CSUM
> >  
> >  /* Flags that are used in the receive path. These flags must match in
> >   * order for a socket to be shareable
> >   */
> >  #define VXLAN_F_RCV_FLAGS		(VXLAN_F_GBP |			\
> > +                                         VXLAN_F_GPE |                  \
> >  					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
> >  					 VXLAN_F_REMCSUM_RX |		\
> >  					 VXLAN_F_REMCSUM_NOPARTIAL |	\
> >  					 VXLAN_F_COLLECT_METADATA)
> > +
> > +/* Flags that can be set together with VXLAN_F_GPE. */
> > +#define VXLAN_F_ALLOWED_GPE             (VXLAN_F_GPE |                  \
> > +                                         VXLAN_F_IPV6 |                 \
> > +                                         VXLAN_F_UDP_CSUM |     \
> > +                                         VXLAN_F_UDP_ZERO_CSUM6_TX |    \
> > +                                         VXLAN_F_UDP_ZERO_CSUM6_RX |    \
> > +                                         VXLAN_F_COLLECT_METADATA)
> > +
> >  #define vxlan_dev_create rpl_vxlan_dev_create
> >  struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name,
> >  				    u8 name_assign_type, struct vxlan_config *conf);
> > diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
> > index 4faa18f..570d2d9 100644
> > --- a/datapath/linux/compat/vxlan.c
> > +++ b/datapath/linux/compat/vxlan.c
> > @@ -812,6 +812,45 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
> >  }
> >  #endif
> >  
> > +static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
> > +			       __be32 *protocol,
> > +			       struct sk_buff *skb, u32 vxflags)
> > +{
> > +       struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;
> > +
> > +       /* Need to have Next Protocol set for interfaces in GPE mode. */
> > +       if (!gpe->np_applied)
> > +	       return false;
> > +       /* "The initial version is 0. If a receiver does not support the
> > +	* version indicated it MUST drop the packet.
> > +	*/
> > +       if (gpe->version != 0)
> > +	       return false;
> > +       /* "When the O bit is set to 1, the packet is an OAM packet and OAM
> > +	* processing MUST occur." However, we don't implement OAM
> > +	* processing, thus drop the packet.
> > +	*/
> > +       if (gpe->oam_flag)
> > +	       return false;
> > +
> > +       switch (gpe->next_protocol) {
> > +       case VXLAN_GPE_NP_IPV4:
> > +	       *protocol = htons(ETH_P_IP);
> > +	       break;
> > +       case VXLAN_GPE_NP_IPV6:
> > +	       *protocol = htons(ETH_P_IPV6);
> > +	       break;
> > +       case VXLAN_GPE_NP_ETHERNET:
> > +	       *protocol = htons(ETH_P_TEB);
> > +	       break;
> > +       default:
> > +	       return false;
> > +       }
> > +
> > +       unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
> > +       return true;
> > +}
> > +
> >  static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> >  		      struct vxlan_metadata *md, u32 vni,
> >  		      struct metadata_dst *tun_dst)
> > @@ -822,6 +861,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> >  	struct pcpu_sw_netstats *stats;
> >  	union vxlan_addr saddr;
> >  	int err = 0;
> > +	struct vxlanhdr unparsed;
> > +	__be32 protocol = htons(ETH_P_TEB);
> > +	bool raw_proto = false;
> >  
> >  	/* For flow based devices, map all packets to VNI 0 */
> >  	if (vs->flags & VXLAN_F_COLLECT_METADATA)
> > @@ -832,14 +874,35 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> >  	if (!vxlan)
> >  		goto drop;
> >  
> > -	skb_reset_mac_header(skb);
> > -	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
> > -	skb->protocol = eth_type_trans(skb, vxlan->dev);
> > -	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> > +	/* For backwards compatibility, only allow reserved fields to be
> > +	 * used by VXLAN extensions if explicitly requested.
> > +	 */
> > +	if (vs->flags & VXLAN_F_GPE) {
> > +		unparsed = *(struct vxlanhdr *)(udp_hdr(skb) + 1);
> > +		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
> > +			goto drop;
> > +		if (protocol != htons(ETH_P_TEB)) {
> > +		    raw_proto = true;
> > +		}
> > +	}
> >  
> > -	/* Ignore packet loops (and multicast echo) */
> > -	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
> > -		goto drop;
> > +	if (!raw_proto) {
> > +		skb_reset_mac_header(skb);
> > +		skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
> > +		skb->protocol = eth_type_trans(skb, vxlan->dev);
> > +		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> > +
> > +		/* Ignore packet loops (and multicast echo) */
> > +		if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
> > +			goto drop;
> > +
> > +		if ((vxlan->flags & VXLAN_F_LEARN) &&
> > +		    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
> > +			goto drop;
> > +	} else {
> > +		skb->dev = vxlan->dev;
> > +		skb->pkt_type = PACKET_HOST;
> > +	}
> >  
> >  	/* Get data from the outer IP header */
> >  	if (vxlan_get_sk_family(vs) == AF_INET) {
> > @@ -861,10 +924,6 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> >  		goto drop;
> >  	}
> >  
> > -	if ((vxlan->flags & VXLAN_F_LEARN) &&
> > -	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
> > -		goto drop;
> > -
> >  	skb_reset_network_header(skb);
> >  	/* In flow-based mode, GBP is carried in dst_metadata */
> >  	if (!(vs->flags & VXLAN_F_COLLECT_METADATA))
> > @@ -908,6 +967,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >  		struct metadata_dst dst;
> >  		char buf[sizeof(struct metadata_dst) + sizeof(*md)];
> >  	} buf;
> > +	struct vxlanhdr unparsed;
> > +	__be32 protocol = htons(ETH_P_TEB);
> >  
> >  	/* Need Vxlan and inner Ethernet header to be present */
> >  	if (!pskb_may_pull(skb, VXLAN_HLEN))
> > @@ -924,14 +985,25 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >  		goto bad_flags;
> >  	}
> >  
> > -	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
> > -		goto drop;
> > -	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> > -
> >  	vs = rcu_dereference_sk_user_data(sk);
> >  	if (!vs)
> >  		goto drop;
> >  
> > +	/* For backwards compatibility, only allow reserved fields to be
> > +	 * used by VXLAN extensions if explicitly requested.
> > +	 */
> > +	if (vs->flags & VXLAN_F_GPE) {
> > +		unparsed = *(struct vxlanhdr *)(udp_hdr(skb) + 1);
> > +		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
> > +			goto drop;
> > +		buf.dst.u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
> > +		flags &= ~VXLAN_GPE_USED_BITS;
> > +	}
> > +
> > +	if (iptunnel_pull_header(skb, VXLAN_HLEN, protocol))
> > +		goto drop;
> > +	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> > +
> >  #ifdef HAVE_VXLAN_HF_RCO
> >  	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
> >  		vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
> > @@ -1023,6 +1095,33 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
> >  	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
> >  }
> >  
> > +static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
> > +			       __be16 protocol)
> > +{
> > +	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
> > +
> > +	vxh->vx_flags |= htonl(VXLAN_HF_GPE);
> > +	gpe->np_applied = 1;
> > +	gpe->version = 0;
> > +	gpe->oam_flag = 0;
> > +
> > +	switch (protocol) {
> > +	case htons(ETH_P_IP):
> > +		gpe->next_protocol = VXLAN_GPE_NP_IPV4;
> > +		return 0;
> > +	case htons(ETH_P_IPV6):
> > +		gpe->next_protocol = VXLAN_GPE_NP_IPV6;
> > +		return 0;
> > +	case htons(ETH_P_TEB):
> > +		gpe->next_protocol = VXLAN_GPE_NP_ETHERNET;
> > +		return 0;
> > +	case htons(ETH_P_NSH):
> > +		gpe->next_protocol = VXLAN_GPE_NP_NSH;
> > +		return 0;
> > +	}
> > +	return -EPFNOSUPPORT;
> > +}
> > +
> >  #if IS_ENABLED(CONFIG_IPV6)
> >  static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
> >  			   struct sk_buff *skb,
> > @@ -1036,6 +1135,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
> >  	int err;
> >  	bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX);
> >  	int type = 0;
> > +	__be16 inner_protocol = htons(ETH_P_TEB);
> >  
> >  	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
> >  	    skb->ip_summed == CHECKSUM_PARTIAL) {
> > @@ -1106,8 +1206,14 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
> >  
> >  	if (vxflags & VXLAN_F_GBP)
> >  		vxlan_build_gbp_hdr(vxh, vxflags, md);
> > +	if (vxflags & VXLAN_F_GPE) {
> > +		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
> > +		if (err < 0)
> > +			goto err;
> > +		inner_protocol = skb->protocol;
> > +	}
> >  
> > -	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> > +	ovs_skb_set_inner_protocol(skb, inner_protocol);
> >  
> >  	udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio,
> >  			     ttl, src_port, dst_port,
> > @@ -1129,6 +1235,7 @@ static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk
> >  	int err;
> >  	bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM);
> >  	int type = 0;
> > +	__be16 inner_protocol = htons(ETH_P_TEB);
> >  
> >  	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
> >  	    skb->ip_summed == CHECKSUM_PARTIAL) {
> > @@ -1191,8 +1298,14 @@ static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk
> >  	}
> >  	if (vxflags & VXLAN_F_GBP)
> >  		vxlan_build_gbp_hdr(vxh, vxflags, md);
> > +	if (vxflags & VXLAN_F_GPE) {
> > +		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
> > +		if (err < 0)
> > +			return err;
> > +		inner_protocol = skb->protocol;
> > +	}
> >  
> > -	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> > +	ovs_skb_set_inner_protocol(skb, inner_protocol);
> >  
> >  	return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,
> >  				   ttl, df, src_port, dst_port, xnet,
> > @@ -1419,7 +1532,7 @@ tx_free:
> >   *
> >   * Outer IP header inherits ECN and DF from inner header.
> >   * Outer UDP destination is the VXLAN assigned port.
> > - *           source port is based on hash of flow
> > + *	   source port is based on hash of flow
> >   */
> >  netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
> >  {
> > @@ -1648,7 +1761,7 @@ static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev)
> >  	return NETDEV_TX_OK;
> >  }
> >  
> > -static const struct net_device_ops vxlan_netdev_ops = {
> > +static const struct net_device_ops vxlan_netdev_ether_ops = {
> >  	.ndo_init		= vxlan_init,
> >  	.ndo_uninit		= vxlan_uninit,
> >  	.ndo_get_stats64	= ip_tunnel_get_stats64,
> > @@ -1661,6 +1774,16 @@ static const struct net_device_ops vxlan_netdev_ops = {
> >  	.ndo_set_mac_address	= eth_mac_addr,
> >  };
> >  
> > +static const struct net_device_ops vxlan_netdev_raw_ops = {
> > +	.ndo_init		= vxlan_init,
> > +	.ndo_uninit		= vxlan_uninit,
> > +	.ndo_get_stats64	= ip_tunnel_get_stats64,
> > +	.ndo_open		= vxlan_open,
> > +	.ndo_stop		= vxlan_stop,
> > +	.ndo_start_xmit		= vxlan_dev_xmit,
> > +	.ndo_change_mtu		= vxlan_change_mtu,
> > +};
> > +
> >  /* Info for udev, that this is a virtual tunnel endpoint */
> >  static struct device_type vxlan_type = {
> >  	.name = "vxlan",
> > @@ -1675,7 +1798,7 @@ static void vxlan_setup(struct net_device *dev)
> >  	eth_hw_addr_random(dev);
> >  	ether_setup(dev);
> >  
> > -	dev->netdev_ops = &vxlan_netdev_ops;
> > +	dev->netdev_ops = &vxlan_netdev_ether_ops;
> >  	dev->destructor = free_netdev;
> >  	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
> >  
> > @@ -1712,8 +1835,51 @@ static void vxlan_setup(struct net_device *dev)
> >  		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
> >  }
> >  
> > +static void vxlan_ether_setup(struct net_device *dev)
> > +{
> > +	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
> > +	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
> > +	dev->netdev_ops = &vxlan_netdev_ether_ops;
> > +}
> > +
> > +static void vxlan_raw_setup(struct net_device *dev)
> > +{
> > +	dev->header_ops = NULL;
> > +	dev->type = ARPHRD_NONE;
> > +	dev->hard_header_len = 0;
> > +	dev->addr_len = 0;
> > +	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
> > +	dev->netdev_ops = &vxlan_netdev_raw_ops;
> > +}
> > +
> >  static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
> > -	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
> > +	[IFLA_VXLAN_ID]	 = { .type = NLA_U32 },
> > +	[IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
> > +	[IFLA_VXLAN_GROUP6]     = { .len = sizeof(struct in6_addr) },
> > +	[IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
> > +	[IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
> > +	[IFLA_VXLAN_LOCAL6]     = { .len = sizeof(struct in6_addr) },
> > +	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
> > +	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
> > +	[IFLA_VXLAN_LABEL]      = { .type = NLA_U32 },
> > +	[IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
> > +	[IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
> > +	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
> > +	[IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
> > +	[IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_COLLECT_METADATA]   = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_PORT]       = { .type = NLA_U16 },
> > +	[IFLA_VXLAN_UDP_CSUM]   = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]  = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]  = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
> > +	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
> > +	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
> > +	[IFLA_VXLAN_REMCSUM_NOPARTIAL]  = { .type = NLA_FLAG },
> >  };
> >  
> >  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
> > @@ -1897,6 +2063,21 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
> >  	__be16 default_port = vxlan->cfg.dst_port;
> >  	struct net_device *lowerdev = NULL;
> >  
> > +	if (conf->flags & VXLAN_F_GPE) {
> > +		if (conf->flags & ~VXLAN_F_ALLOWED_GPE)
> > +			return -EINVAL;
> > +		/* For now, allow GPE only together with COLLECT_METADATA.
> > +		 * This can be relaxed later; in such case, the other side
> > +		 * of the PtP link will have to be provided.
> > +		 */
> > +		if (!(conf->flags & VXLAN_F_COLLECT_METADATA))
> > +			return -EINVAL;
> > +
> > +		vxlan_raw_setup(dev);
> > +	} else {
> > +		vxlan_ether_setup(dev);
> > +	}
> > +
> >  	vxlan->net = src_net;
> >  
> >  	dst->remote_vni = conf->vni;
> > @@ -2023,7 +2204,136 @@ static int vxlan_newlink(struct net_device *dev,
> >  			 struct nlattr *tb[], struct nlattr *data[])
> >  #endif
> >  {
> > -	return -EINVAL;
> > +	struct vxlan_config conf;
> > +	int err;
> > +
> > +	memset(&conf, 0, sizeof(conf));
> > +
> > +	if (data[IFLA_VXLAN_ID])
> > +		conf.vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
> > +
> > +	if (data[IFLA_VXLAN_GROUP]) {
> > +		conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
> > +	} else if (data[IFLA_VXLAN_GROUP6]) {
> > +		if (!IS_ENABLED(CONFIG_IPV6))
> > +			return -EPFNOSUPPORT;
> > +
> > +		conf.remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
> > +		conf.remote_ip.sa.sa_family = AF_INET6;
> > +	}
> > +
> > +	if (data[IFLA_VXLAN_LOCAL]) {
> > +		conf.saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
> > +		conf.saddr.sa.sa_family = AF_INET;
> > +	} else if (data[IFLA_VXLAN_LOCAL6]) {
> > +		if (!IS_ENABLED(CONFIG_IPV6))
> > +			return -EPFNOSUPPORT;
> > +
> > +		/* TODO: respect scope id */
> > +		conf.saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
> > +		conf.saddr.sa.sa_family = AF_INET6;
> > +	}
> > +
> > +	if (data[IFLA_VXLAN_LINK])
> > +		conf.remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
> > +
> > +	if (data[IFLA_VXLAN_TOS])
> > +		conf.tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
> > +
> > +	if (data[IFLA_VXLAN_TTL])
> > +		conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
> > +
> > +	if (data[IFLA_VXLAN_LABEL])
> > +		conf.label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
> > +			     IPV6_FLOWLABEL_MASK;
> > +
> > +	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
> > +		conf.flags |= VXLAN_F_LEARN;
> > +
> > +	if (data[IFLA_VXLAN_AGEING])
> > +		conf.age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
> > +
> > +	if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
> > +		conf.flags |= VXLAN_F_PROXY;
> > +
> > +	if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
> > +		conf.flags |= VXLAN_F_RSC;
> > +
> > +	if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
> > +		conf.flags |= VXLAN_F_L2MISS;
> > +
> > +	if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
> > +		conf.flags |= VXLAN_F_L3MISS;
> > +
> > +	if (data[IFLA_VXLAN_LIMIT])
> > +		conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
> > +
> > +	if (data[IFLA_VXLAN_COLLECT_METADATA] &&
> > +	    nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA]))
> > +		conf.flags |= VXLAN_F_COLLECT_METADATA;
> > +
> > +	if (data[IFLA_VXLAN_PORT_RANGE]) {
> > +		const struct ifla_vxlan_port_range *p
> > +			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
> > +		conf.port_min = ntohs(p->low);
> > +		conf.port_max = ntohs(p->high);
> > +	}
> > +
> > +	if (data[IFLA_VXLAN_PORT])
> > +		conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
> > +
> > +	if (data[IFLA_VXLAN_UDP_CSUM] &&
> > +	    !nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
> > +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
> > +
> > +	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
> > +	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
> > +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
> > +
> > +	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] &&
> > +	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
> > +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
> > +
> > +	if (data[IFLA_VXLAN_REMCSUM_TX] &&
> > +	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
> > +		conf.flags |= VXLAN_F_REMCSUM_TX;
> > +
> > +	if (data[IFLA_VXLAN_REMCSUM_RX] &&
> > +	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
> > +		conf.flags |= VXLAN_F_REMCSUM_RX;
> > +
> > +	if (data[IFLA_VXLAN_GBP])
> > +		conf.flags |= VXLAN_F_GBP;
> > +
> > +	if (data[IFLA_VXLAN_GPE])
> > +		conf.flags |= VXLAN_F_GPE;
> > +
> > +	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
> > +		conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
> > +
> > +	if (tb[IFLA_MTU])
> > +		conf.mtu = nla_get_u32(tb[IFLA_MTU]);
> > +
> > +	err = vxlan_dev_configure(src_net, dev, &conf);
> > +	switch (err) {
> > +	case -ENODEV:
> > +		pr_info("ifindex %d does not exist\n", conf.remote_ifindex);
> > +		break;
> > +
> > +	case -EPERM:
> > +		pr_info("IPv6 is disabled via sysctl\n");
> > +		break;
> > +
> > +	case -EEXIST:
> > +		pr_info("duplicate VNI %u\n", be32_to_cpu(conf.vni));
> > +		break;
> > +
> > +	case -EINVAL:
> > +		pr_info("unsupported combination of extensions\n");
> > +		break;
> > +	}
> > +
> > +	return err;
> >  }
> >  
> >  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
> > @@ -2047,20 +2357,21 @@ static void vxlan_dellink(struct net_device *dev)
> >  static size_t vxlan_get_size(const struct net_device *dev)
> >  {
> >  
> > -	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
> > +	return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
> >  		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
> > -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
> > +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
> >  		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
> > -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
> > -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
> > -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
> > -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
> > -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
> > -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
> > -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
> > -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
> > -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
> > -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
> > +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
> > +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
> > +		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
> > +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
> > +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
> > +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
> > +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
> > +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
> > +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_COLLECT_METADATA */
> > +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
> > +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
> >  		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
> >  		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
> >  		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
> > @@ -2074,8 +2385,88 @@ static size_t vxlan_get_size(const struct net_device *dev)
> >  static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
> >  {
> >  	const struct vxlan_dev *vxlan = netdev_priv(dev);
> > +	const struct vxlan_rdst *dst = &vxlan->default_dst;
> > +	struct ifla_vxlan_port_range ports = {
> > +		.low =  htons(vxlan->cfg.port_min),
> > +		.high = htons(vxlan->cfg.port_max),
> > +	};
> > +
> > +	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
> > +		goto nla_put_failure;
> > +
> > +	if (!vxlan_addr_any(&dst->remote_ip)) {
> > +		if (dst->remote_ip.sa.sa_family == AF_INET) {
> > +			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
> > +					    dst->remote_ip.sin.sin_addr.s_addr))
> > +				goto nla_put_failure;
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +		} else {
> > +			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
> > +					     &dst->remote_ip.sin6.sin6_addr))
> > +				goto nla_put_failure;
> > +#endif
> > +		}
> > +	}
> > +
> > +	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
> > +		goto nla_put_failure;
> > +
> > +	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
> > +		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
> > +			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
> > +					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
> > +				goto nla_put_failure;
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +		} else {
> > +			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
> > +					     &vxlan->cfg.saddr.sin6.sin6_addr))
> > +				goto nla_put_failure;
> > +#endif
> > +		}
> > +	}
> > +
> > +	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
> > +	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
> > +			!!(vxlan->flags & VXLAN_F_LEARN)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
> > +			!!(vxlan->flags & VXLAN_F_PROXY)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
> > +			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
> > +			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
> > +		       !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) ||
> > +	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
> > +	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
> > +	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
> > +			!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
> > +			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
> > +			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
> > +			!!(vxlan->flags & VXLAN_F_REMCSUM_TX)) ||
> > +	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
> > +			!!(vxlan->flags & VXLAN_F_REMCSUM_RX)))
> > +		goto nla_put_failure;
> > +
> > +	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
> > +		goto nla_put_failure;
> > +
> > +	if (vxlan->flags & VXLAN_F_GBP &&
> > +	    nla_put_flag(skb, IFLA_VXLAN_GBP))
> > +		goto nla_put_failure;
> > +
> > +	if (vxlan->flags & VXLAN_F_GPE &&
> > +	    nla_put_flag(skb, IFLA_VXLAN_GPE))
> > +		goto nla_put_failure;
> >  
> > -	if (nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port))
> > +	if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL &&
> > +	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
> >  		goto nla_put_failure;
> >  
> >  	return 0;
> > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> > index 1e88c13..2b07e54 100644
> > --- a/lib/dpif-netlink.c
> > +++ b/lib/dpif-netlink.c
> > @@ -988,6 +988,8 @@ netdev_geneve_destroy(const char *name)
> >  #define IFLA_VXLAN_UDP_ZERO_CSUM6_RX 20
> >  #define IFLA_VXLAN_GBP 23
> >  #define IFLA_VXLAN_COLLECT_METADATA 25
> > +#define IFLA_VXLAN_LABEL 26
> > +#define IFLA_VXLAN_GPE 27
> >  #endif
> >  
> >  #if IFLA_GRE_MAX < 18
> > @@ -1037,6 +1039,9 @@ netdev_vxlan_create(struct netdev *netdev)
> >              if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP)) {
> >                  nl_msg_put_flag(&request, IFLA_VXLAN_GBP);
> >              }
> > +            else if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GPE)) {
> > +                nl_msg_put_flag(&request, IFLA_VXLAN_GPE);
> > +            }
> >              nl_msg_put_be16(&request, IFLA_VXLAN_PORT, tnl_cfg->dst_port);
> >          nl_msg_end_nested(&request, infodata_off);
> >      nl_msg_end_nested(&request, linkinfo_off);
> > diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> > index ec5c44e..fa56af5 100644
> > --- a/lib/netdev-vport.c
> > +++ b/lib/netdev-vport.c
> > @@ -541,7 +541,9 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args)
> >              while (ext) {
> >                  if (!strcmp(type, "vxlan") && !strcmp(ext, "gbp")) {
> >                      tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GBP);
> > -                } else {
> > +                } else if (!strcmp(type, "vxlan") && !strcmp(ext, "gpe")) {
> > +                     tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GPE);
> > +		} else {
> >                      VLOG_WARN("%s: unknown extension '%s'", name, ext);
> >                  }
> >  
> > -- 
> > 1.9.3
> >
Jesse Gross June 21, 2016, 2 a.m. UTC | #3
On Mon, Jun 20, 2016 at 6:39 PM, Yang, Yi <yi.y.yang@intel.com> wrote:
> On Mon, Jun 20, 2016 at 10:11:14AM -0300, Thadeu Lima de Souza Cascardo wrote:
>> On Mon, Jun 20, 2016 at 08:36:43PM +0800, Yi Yang wrote:
>> > Current Linux kernel git tree has included VxLAN-gpe implementation
>> >
>> > author  Jiri Benc <jbenc@redhat.com>
>> > committer       David S. Miller <davem@davemloft.net>
>> > commit  e1e5314de08ba6003b358125eafc9ad9e75a950c (patch)
>> > tree    1e18cdabf1c9d9ef17e26c6480e629465447f77f /drivers/net/vxlan.c
>> > parent  a6d5bbf34efa8330af7b0b1dba0f38148516ed97 (diff)
>> > vxlan: implement GPE
>> >
>> > This patch is to port it to ovs in order that people also can use VxLAN-gpe
>> > even if they don't replace their kernels with latest Linux kernel.
>> >
>> > Signed-off-by: Johnson Li <johnson.li@intel.com>
>> > Signed-off-by: Yi Yang <yi.y.yang@intel.com>
>>
>>
>> Hi, Yi Yang.
>>
>> Before adding the OVS_VXLAN_EXT_GPE extension to the out-of-tree module, you
>> should send it to the mainline kernel. Besides, you need a very good
>> justification why you can't wait for my patchset to be accepted and have
>> VXLAN-GPE enabled using rtnetlink.
>
> Will add OVS_VXLAN_EXT_GPE to include/uapi/linux/openvswitch.h and send a
> kernel patch, but ovs and the net-next kernel can also work together.

I think that you might have misunderstood the gist of the suggestion -
please do not add this extension. This is pure compatibility code for
existing, old features and should not be extended with new things
unless there is a very, very good reason (which would almost certainly
be related to fixing bugs, not new functionality) - especially since
there's already an existing mechanism to do this.

>> Also, I would split any changes to the datapath and userspace parts of the code
>> into multiple commits.
>>
>> Meanwhile, you could backport only the upstreamed portions of VXLAN-GPE and send
>> that as a single commit, no userspace changes.
>
> VxLAN-gpe part depends on some new changes in vxlan module, we have to
> backport them into ovs, this patch has been VxLAN-gpe only, current ovs
> vxlan is much older than vxlan kernel module, Jesse mentioned we should
> backport all the new changes in vxlan kernel module, but the effort will
> be very huge, ovs vxlan needs to be compatible with some the old
> kernels, that makes backporting must be very careful, Jesse also
> mentioned Pravin is backporting them. This patch is just to make sure
> your patch set, Simon's patch set and VxLAN-gpe can work correctly as
> expected.

If a patch is not ready to be merged, it's nice if you mark it as an RFC.

> You can take the part for netlink and merge it to your next patch set
> version if needed. Changes are not big, our email system will result in
> some patches loss if multiple patches are sent, so we prefer one patch
> including all the stuff.

One option to deal with mail problems is to put the patches up on
github (though please at least try to send a copy to the mailing list
as well).
Yang, Yi June 21, 2016, 2:30 a.m. UTC | #4
On Mon, Jun 20, 2016 at 07:00:56PM -0700, Jesse Gross wrote:
> >>
> >> Hi, Yi Yang.
> >>
> >> Before adding the OVS_VXLAN_EXT_GPE extension to the out-of-tree module, you
> >> should send it to the mainline kernel. Besides, you need a very good
> >> justification why you can't wait for my patchset to be accepted and have
> >> VXLAN-GPE enabled using rtnetlink.
> >
> > Will add VS_VXLAN_EXT_GPE to include/uapi/linux/openvswitch.h and send a
> > kernel patch, but ovs and net-next kernel also can work together.
> 
> I think that you might have misunderstood the gist of the suggestion -
> please do not add this extension. This is pure compatibility code for
> existing, old features and should not be extended with new things
> unless there is a very, very good reason (which would almost certainly
> be related to fixing bugs, not new functionality) - especially since
> there's already an existing mechanism to do this.

I'm confused: the kernel has the header file
include/uapi/linux/openvswitch.h, but ovs also has the header file
datapath/linux/compat/include/linux/openvswitch.h, and both of them include
this enumeration definition:

enum {
        OVS_VXLAN_EXT_UNSPEC,
        OVS_VXLAN_EXT_GBP,      /* Flag or __u32 */
        __OVS_VXLAN_EXT_MAX,
};

Don't we need to make sure they are consistent with the line below?

        OVS_VXLAN_EXT_GPE,      /* Flag, Generic Protocol Extension */
Jesse Gross June 21, 2016, 4:26 p.m. UTC | #5
On Mon, Jun 20, 2016 at 7:30 PM, Yang, Yi <yi.y.yang@intel.com> wrote:
> On Mon, Jun 20, 2016 at 07:00:56PM -0700, Jesse Gross wrote:
>> >>
>> >> Hi, Yi Yang.
>> >>
>> >> Before adding the OVS_VXLAN_EXT_GPE extension to the out-of-tree module, you
>> >> should send it to the mainline kernel. Besides, you need a very good
>> >> justification why you can't wait for my patchset to be accepted and have
>> >> VXLAN-GPE enabled using rtnetlink.
>> >
>> > Will add OVS_VXLAN_EXT_GPE to include/uapi/linux/openvswitch.h and send a
>> > kernel patch, but ovs and the net-next kernel can also work together.
>>
>> I think that you might have misunderstood the gist of the suggestion -
>> please do not add this extension. This is pure compatibility code for
>> existing, old features and should not be extended with new things
>> unless there is a very, very good reason (which would almost certainly
>> be related to fixing bugs, not new functionality) - especially since
>> there's already an existing mechanism to do this.
>
> I'm confused, kernel has this header file
> include/uapi/linux/openvswitch.h, but ovs also has this header file
> datapath/linux/compat/include/linux/openvswitch.h, both of them included
> this enumeration definition:
>
> enum {
>         OVS_VXLAN_EXT_UNSPEC,
>         OVS_VXLAN_EXT_GBP,      /* Flag or __u32 */
>         __OVS_VXLAN_EXT_MAX,
> };
>
> Don't we need to make sure they are consistent with the below line?
>
>         OVS_VXLAN_EXT_GPE,      /* Flag, Generic Protocol Extension */

You don't need to add this, either upstream or in the OVS tree. It's
already possible to configure VXLAN GPE through the VXLAN netlink
interface, so that's what you should be using.

When doing backports, please do a straight backport of what is
upstream without adding anything new. I would also like you to layer
this on top of the previous VXLAN code that needs to be backported,
rather than taking it out of order.
diff mbox

Patch

diff --git a/datapath/linux/compat/include/linux/if_link.h b/datapath/linux/compat/include/linux/if_link.h
index 6209dcb..de87769 100644
--- a/datapath/linux/compat/include/linux/if_link.h
+++ b/datapath/linux/compat/include/linux/if_link.h
@@ -100,6 +100,10 @@  enum {
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
 #define IFLA_VXLAN_COLLECT_METADATA rpl_IFLA_VXLAN_COLLECT_METADATA
 	IFLA_VXLAN_COLLECT_METADATA,
+#define IFLA_VXLAN_LABEL rpl_IFLA_VXLAN_LABEL
+        IFLA_VXLAN_LABEL,
+#define IFLA_VXLAN_GPE rpl_IFLA_VXLAN_GPE
+        IFLA_VXLAN_GPE,
 #define __IFLA_VXLAN_MAX rpl___IFLA_VXLAN_MAX
 	__IFLA_VXLAN_MAX
 };
diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
index edfa7a1..761d9c6 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -287,6 +287,7 @@  enum ovs_vport_attr {
 enum {
 	OVS_VXLAN_EXT_UNSPEC,
 	OVS_VXLAN_EXT_GBP,      /* Flag or __u32 */
+	OVS_VXLAN_EXT_GPE,      /* Flag, Generic Protocol Extension */
 	__OVS_VXLAN_EXT_MAX,
 };
 
diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h
index 75a5a7a..b3f45c4 100644
--- a/datapath/linux/compat/include/net/vxlan.h
+++ b/datapath/linux/compat/include/net/vxlan.h
@@ -84,6 +84,66 @@  struct vxlanhdr_gbp {
 #define VXLAN_GBP_POLICY_APPLIED	(BIT(3) << 16)
 #define VXLAN_GBP_ID_MASK		(0xFFFF)
 
+/*
+ * VXLAN Generic Protocol Extension (VXLAN_F_GPE):
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|Ver|I|P|R|O|       Reserved                |Next Protocol  |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                VXLAN Network Identifier (VNI) |   Reserved    |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Ver = Version. Indicates VXLAN GPE protocol version.
+ *
+ * P = Next Protocol Bit. The P bit is set to indicate that the
+ *     Next Protocol field is present.
+ *
+ * O = OAM Flag Bit. The O bit is set to indicate that the packet
+ *     is an OAM packet.
+ *
+ * Next Protocol = This 8 bit field indicates the protocol header
+ * immediately following the VXLAN GPE header.
+ *
+ * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01
+ */
+
+struct vxlanhdr_gpe {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       u8      oam_flag:1,
+               reserved_flags1:1,
+               np_applied:1,
+               instance_applied:1,
+               version:2,
+reserved_flags2:2;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       u8      reserved_flags2:2,
+               version:2,
+               instance_applied:1,
+               np_applied:1,
+               reserved_flags1:1,
+               oam_flag:1;
+#endif
+       u8      reserved_flags3;
+       u8      reserved_flags4;
+       u8      next_protocol;
+       __be32  vx_vni;
+};
+
+/* VXLAN-GPE header flags. */
+#define VXLAN_HF_VER   (BIT(29) | BIT(28))
+#define VXLAN_HF_NP    (BIT(26))
+#define VXLAN_HF_OAM   (BIT(24))
+#define VXLAN_HF_GPE   (BIT(26))
+
+#define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \
+                            (0xFF))
+
+/* VXLAN-GPE header Next Protocol. */
+#define VXLAN_GPE_NP_IPV4      0x01
+#define VXLAN_GPE_NP_IPV6      0x02
+#define VXLAN_GPE_NP_ETHERNET  0x03
+#define VXLAN_GPE_NP_NSH       0x04
+#define ETH_P_NSH              0x894f
+
 /* VXLAN protocol header:
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |G|R|R|R|I|R|R|C|               Reserved                        |
@@ -167,6 +227,7 @@  struct vxlan_config {
 	__u16			port_max;
 	__u8			tos;
 	__u8			ttl;
+	__be32                  label;
 	u32			flags;
 	unsigned long		age_interval;
 	unsigned int		addrmax;
@@ -205,15 +266,27 @@  struct vxlan_dev {
 #define VXLAN_F_GBP			0x800
 #define VXLAN_F_REMCSUM_NOPARTIAL	0x1000
 #define VXLAN_F_COLLECT_METADATA	0x2000
+#define VXLAN_F_GPE                     0x4000
+#define VXLAN_F_UDP_ZERO_CSUM_TX VXLAN_F_UDP_CSUM
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
  */
 #define VXLAN_F_RCV_FLAGS		(VXLAN_F_GBP |			\
+                                         VXLAN_F_GPE |                  \
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
 					 VXLAN_F_REMCSUM_RX |		\
 					 VXLAN_F_REMCSUM_NOPARTIAL |	\
 					 VXLAN_F_COLLECT_METADATA)
+
+/* Flags that can be set together with VXLAN_F_GPE. */
+#define VXLAN_F_ALLOWED_GPE             (VXLAN_F_GPE |                  \
+                                         VXLAN_F_IPV6 |                 \
+                                         VXLAN_F_UDP_CSUM |     \
+                                         VXLAN_F_UDP_ZERO_CSUM6_TX |    \
+                                         VXLAN_F_UDP_ZERO_CSUM6_RX |    \
+                                         VXLAN_F_COLLECT_METADATA)
+
 #define vxlan_dev_create rpl_vxlan_dev_create
 struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
index 4faa18f..570d2d9 100644
--- a/datapath/linux/compat/vxlan.c
+++ b/datapath/linux/compat/vxlan.c
@@ -812,6 +812,45 @@  static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 }
 #endif
 
+static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
+			       __be32 *protocol,
+			       struct sk_buff *skb, u32 vxflags)
+{
+       struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;
+
+       /* Need to have Next Protocol set for interfaces in GPE mode. */
+       if (!gpe->np_applied)
+	       return false;
+       /* "The initial version is 0. If a receiver does not support the
+	* version indicated it MUST drop the packet.
+	*/
+       if (gpe->version != 0)
+	       return false;
+       /* "When the O bit is set to 1, the packet is an OAM packet and OAM
+	* processing MUST occur." However, we don't implement OAM
+	* processing, thus drop the packet.
+	*/
+       if (gpe->oam_flag)
+	       return false;
+
+       switch (gpe->next_protocol) {
+       case VXLAN_GPE_NP_IPV4:
+	       *protocol = htons(ETH_P_IP);
+	       break;
+       case VXLAN_GPE_NP_IPV6:
+	       *protocol = htons(ETH_P_IPV6);
+	       break;
+       case VXLAN_GPE_NP_ETHERNET:
+	       *protocol = htons(ETH_P_TEB);
+	       break;
+       default:
+	       return false;
+       }
+
+       unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
+       return true;
+}
+
 static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 		      struct vxlan_metadata *md, u32 vni,
 		      struct metadata_dst *tun_dst)
@@ -822,6 +861,9 @@  static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	struct pcpu_sw_netstats *stats;
 	union vxlan_addr saddr;
 	int err = 0;
+	struct vxlanhdr unparsed;
+	__be32 protocol = htons(ETH_P_TEB);
+	bool raw_proto = false;
 
 	/* For flow based devices, map all packets to VNI 0 */
 	if (vs->flags & VXLAN_F_COLLECT_METADATA)
@@ -832,14 +874,35 @@  static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	if (!vxlan)
 		goto drop;
 
-	skb_reset_mac_header(skb);
-	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
-	skb->protocol = eth_type_trans(skb, vxlan->dev);
-	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+	/* For backwards compatibility, only allow reserved fields to be
+	 * used by VXLAN extensions if explicitly requested.
+	 */
+	if (vs->flags & VXLAN_F_GPE) {
+		unparsed = *(struct vxlanhdr *)(udp_hdr(skb) + 1);
+		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
+			goto drop;
+		if (protocol != htons(ETH_P_TEB)) {
+		    raw_proto = true;
+		}
+	}
 
-	/* Ignore packet loops (and multicast echo) */
-	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
-		goto drop;
+	if (!raw_proto) {
+		skb_reset_mac_header(skb);
+		skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
+		skb->protocol = eth_type_trans(skb, vxlan->dev);
+		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+		/* Ignore packet loops (and multicast echo) */
+		if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
+			goto drop;
+
+		if ((vxlan->flags & VXLAN_F_LEARN) &&
+		    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
+			goto drop;
+	} else {
+		skb->dev = vxlan->dev;
+		skb->pkt_type = PACKET_HOST;
+	}
 
 	/* Get data from the outer IP header */
 	if (vxlan_get_sk_family(vs) == AF_INET) {
@@ -861,10 +924,6 @@  static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 		goto drop;
 	}
 
-	if ((vxlan->flags & VXLAN_F_LEARN) &&
-	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
-		goto drop;
-
 	skb_reset_network_header(skb);
 	/* In flow-based mode, GBP is carried in dst_metadata */
 	if (!(vs->flags & VXLAN_F_COLLECT_METADATA))
@@ -908,6 +967,8 @@  static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		struct metadata_dst dst;
 		char buf[sizeof(struct metadata_dst) + sizeof(*md)];
 	} buf;
+	struct vxlanhdr unparsed;
+	__be32 protocol = htons(ETH_P_TEB);
 
 	/* Need Vxlan and inner Ethernet header to be present */
 	if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -924,14 +985,25 @@  static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		goto bad_flags;
 	}
 
-	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
-		goto drop;
-	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
-
 	vs = rcu_dereference_sk_user_data(sk);
 	if (!vs)
 		goto drop;
 
+	/* For backwards compatibility, only allow reserved fields to be
+	 * used by VXLAN extensions if explicitly requested.
+	 */
+	if (vs->flags & VXLAN_F_GPE) {
+		unparsed = *(struct vxlanhdr *)(udp_hdr(skb) + 1);
+		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
+			goto drop;
+		buf.dst.u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
+		flags &= ~VXLAN_GPE_USED_BITS;
+	}
+
+	if (iptunnel_pull_header(skb, VXLAN_HLEN, protocol))
+		goto drop;
+	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
+
 #ifdef HAVE_VXLAN_HF_RCO
 	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
 		vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
@@ -1023,6 +1095,33 @@  static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
 	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
 }
 
+static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
+			       __be16 protocol)
+{
+	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
+
+	vxh->vx_flags |= htonl(VXLAN_HF_GPE);
+	gpe->np_applied = 1;
+	gpe->version = 0;
+	gpe->oam_flag = 0;
+
+	switch (protocol) {
+	case htons(ETH_P_IP):
+		gpe->next_protocol = VXLAN_GPE_NP_IPV4;
+		return 0;
+	case htons(ETH_P_IPV6):
+		gpe->next_protocol = VXLAN_GPE_NP_IPV6;
+		return 0;
+	case htons(ETH_P_TEB):
+		gpe->next_protocol = VXLAN_GPE_NP_ETHERNET;
+		return 0;
+	case htons(ETH_P_NSH):
+		gpe->next_protocol = VXLAN_GPE_NP_NSH;
+		return 0;
+	}
+	return -EPFNOSUPPORT;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			   struct sk_buff *skb,
@@ -1036,6 +1135,7 @@  static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 	int err;
 	bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX);
 	int type = 0;
+	__be16 inner_protocol = htons(ETH_P_TEB);
 
 	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
 	    skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -1106,8 +1206,14 @@  static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 
 	if (vxflags & VXLAN_F_GBP)
 		vxlan_build_gbp_hdr(vxh, vxflags, md);
+	if (vxflags & VXLAN_F_GPE) {
+		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
+		if (err < 0)
+			goto err;
+		inner_protocol = skb->protocol;
+	}
 
-	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+	ovs_skb_set_inner_protocol(skb, inner_protocol);
 
 	udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio,
 			     ttl, src_port, dst_port,
@@ -1129,6 +1235,7 @@  static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk
 	int err;
 	bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM);
 	int type = 0;
+	__be16 inner_protocol = htons(ETH_P_TEB);
 
 	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
 	    skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -1191,8 +1298,14 @@  static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk
 	}
 	if (vxflags & VXLAN_F_GBP)
 		vxlan_build_gbp_hdr(vxh, vxflags, md);
+	if (vxflags & VXLAN_F_GPE) {
+		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
+		if (err < 0)
+			return err;
+		inner_protocol = skb->protocol;
+	}
 
-	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+	ovs_skb_set_inner_protocol(skb, inner_protocol);
 
 	return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,
 				   ttl, df, src_port, dst_port, xnet,
@@ -1419,7 +1532,7 @@  tx_free:
  *
  * Outer IP header inherits ECN and DF from inner header.
  * Outer UDP destination is the VXLAN assigned port.
- *           source port is based on hash of flow
+ *	   source port is based on hash of flow
  */
 netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
 {
@@ -1648,7 +1761,7 @@  static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static const struct net_device_ops vxlan_netdev_ops = {
+static const struct net_device_ops vxlan_netdev_ether_ops = {
 	.ndo_init		= vxlan_init,
 	.ndo_uninit		= vxlan_uninit,
 	.ndo_get_stats64	= ip_tunnel_get_stats64,
@@ -1661,6 +1774,16 @@  static const struct net_device_ops vxlan_netdev_ops = {
 	.ndo_set_mac_address	= eth_mac_addr,
 };
 
+static const struct net_device_ops vxlan_netdev_raw_ops = {
+	.ndo_init		= vxlan_init,
+	.ndo_uninit		= vxlan_uninit,
+	.ndo_get_stats64	= ip_tunnel_get_stats64,
+	.ndo_open		= vxlan_open,
+	.ndo_stop		= vxlan_stop,
+	.ndo_start_xmit		= vxlan_dev_xmit,
+	.ndo_change_mtu		= vxlan_change_mtu,
+};
+
 /* Info for udev, that this is a virtual tunnel endpoint */
 static struct device_type vxlan_type = {
 	.name = "vxlan",
@@ -1675,7 +1798,7 @@  static void vxlan_setup(struct net_device *dev)
 	eth_hw_addr_random(dev);
 	ether_setup(dev);
 
-	dev->netdev_ops = &vxlan_netdev_ops;
+	dev->netdev_ops = &vxlan_netdev_ether_ops;
 	dev->destructor = free_netdev;
 	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
 
@@ -1712,8 +1835,51 @@  static void vxlan_setup(struct net_device *dev)
 		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
 }
 
+static void vxlan_ether_setup(struct net_device *dev)
+{
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	dev->netdev_ops = &vxlan_netdev_ether_ops;
+}
+
+static void vxlan_raw_setup(struct net_device *dev)
+{
+	dev->header_ops = NULL;
+	dev->type = ARPHRD_NONE;
+	dev->hard_header_len = 0;
+	dev->addr_len = 0;
+	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+	dev->netdev_ops = &vxlan_netdev_raw_ops;
+}
+
 static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
-	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
+	[IFLA_VXLAN_ID]	 = { .type = NLA_U32 },
+	[IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_GROUP6]     = { .len = sizeof(struct in6_addr) },
+	[IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
+	[IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_LOCAL6]     = { .len = sizeof(struct in6_addr) },
+	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LABEL]      = { .type = NLA_U32 },
+	[IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
+	[IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
+	[IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
+	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
+	[IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
+	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
+	[IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
+	[IFLA_VXLAN_COLLECT_METADATA]   = { .type = NLA_U8 },
+	[IFLA_VXLAN_PORT]       = { .type = NLA_U16 },
+	[IFLA_VXLAN_UDP_CSUM]   = { .type = NLA_U8 },
+	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]  = { .type = NLA_U8 },
+	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]  = { .type = NLA_U8 },
+	[IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
+	[IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
+	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
+	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
+	[IFLA_VXLAN_REMCSUM_NOPARTIAL]  = { .type = NLA_FLAG },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1897,6 +2063,21 @@  static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
 	__be16 default_port = vxlan->cfg.dst_port;
 	struct net_device *lowerdev = NULL;
 
+	if (conf->flags & VXLAN_F_GPE) {
+		if (conf->flags & ~VXLAN_F_ALLOWED_GPE)
+			return -EINVAL;
+		/* For now, allow GPE only together with COLLECT_METADATA.
+		 * This can be relaxed later; in such case, the other side
+		 * of the PtP link will have to be provided.
+		 */
+		if (!(conf->flags & VXLAN_F_COLLECT_METADATA))
+			return -EINVAL;
+
+		vxlan_raw_setup(dev);
+	} else {
+		vxlan_ether_setup(dev);
+	}
+
 	vxlan->net = src_net;
 
 	dst->remote_vni = conf->vni;
@@ -2023,7 +2204,136 @@  static int vxlan_newlink(struct net_device *dev,
 			 struct nlattr *tb[], struct nlattr *data[])
 #endif
 {
-	return -EINVAL;
+	struct vxlan_config conf;
+	int err;
+
+	memset(&conf, 0, sizeof(conf));
+
+	if (data[IFLA_VXLAN_ID])
+		conf.vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
+	} else if (data[IFLA_VXLAN_GROUP6]) {
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EPFNOSUPPORT;
+
+		conf.remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
+		conf.remote_ip.sa.sa_family = AF_INET6;
+	}
+
+	if (data[IFLA_VXLAN_LOCAL]) {
+		conf.saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
+		conf.saddr.sa.sa_family = AF_INET;
+	} else if (data[IFLA_VXLAN_LOCAL6]) {
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EPFNOSUPPORT;
+
+		/* TODO: respect scope id */
+		conf.saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
+		conf.saddr.sa.sa_family = AF_INET6;
+	}
+
+	if (data[IFLA_VXLAN_LINK])
+		conf.remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
+	if (data[IFLA_VXLAN_TOS])
+		conf.tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
+
+	if (data[IFLA_VXLAN_TTL])
+		conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
+
+	if (data[IFLA_VXLAN_LABEL])
+		conf.label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
+			     IPV6_FLOWLABEL_MASK;
+
+	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
+		conf.flags |= VXLAN_F_LEARN;
+
+	if (data[IFLA_VXLAN_AGEING])
+		conf.age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
+
+	if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
+		conf.flags |= VXLAN_F_PROXY;
+
+	if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
+		conf.flags |= VXLAN_F_RSC;
+
+	if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
+		conf.flags |= VXLAN_F_L2MISS;
+
+	if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
+		conf.flags |= VXLAN_F_L3MISS;
+
+	if (data[IFLA_VXLAN_LIMIT])
+		conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
+
+	if (data[IFLA_VXLAN_COLLECT_METADATA] &&
+	    nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA]))
+		conf.flags |= VXLAN_F_COLLECT_METADATA;
+
+	if (data[IFLA_VXLAN_PORT_RANGE]) {
+		const struct ifla_vxlan_port_range *p
+			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
+		conf.port_min = ntohs(p->low);
+		conf.port_max = ntohs(p->high);
+	}
+
+	if (data[IFLA_VXLAN_PORT])
+		conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
+
+	if (data[IFLA_VXLAN_UDP_CSUM] &&
+	    !nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
+		conf.flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
+
+	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
+	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
+		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
+
+	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] &&
+	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
+		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
+
+	if (data[IFLA_VXLAN_REMCSUM_TX] &&
+	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
+		conf.flags |= VXLAN_F_REMCSUM_TX;
+
+	if (data[IFLA_VXLAN_REMCSUM_RX] &&
+	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
+		conf.flags |= VXLAN_F_REMCSUM_RX;
+
+	if (data[IFLA_VXLAN_GBP])
+		conf.flags |= VXLAN_F_GBP;
+
+	if (data[IFLA_VXLAN_GPE])
+		conf.flags |= VXLAN_F_GPE;
+
+	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
+		conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
+
+	if (tb[IFLA_MTU])
+		conf.mtu = nla_get_u32(tb[IFLA_MTU]);
+
+	err = vxlan_dev_configure(src_net, dev, &conf);
+	switch (err) {
+	case -ENODEV:
+		pr_info("ifindex %d does not exist\n", conf.remote_ifindex);
+		break;
+
+	case -EPERM:
+		pr_info("IPv6 is disabled via sysctl\n");
+		break;
+
+	case -EEXIST:
+		pr_info("duplicate VNI %u\n", be32_to_cpu(conf.vni));
+		break;
+
+	case -EINVAL:
+		pr_info("unsupported combination of extensions\n");
+		break;
+	}
+
+	return err;
 }
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
@@ -2047,20 +2357,21 @@  static void vxlan_dellink(struct net_device *dev)
 static size_t vxlan_get_size(const struct net_device *dev)
 {
 
-	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
+	return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
-		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
+		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
-		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
-		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_COLLECT_METADATA */
+		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
+		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
 		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
 		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
@@ -2074,8 +2385,88 @@  static size_t vxlan_get_size(const struct net_device *dev)
 static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	const struct vxlan_dev *vxlan = netdev_priv(dev);
+	const struct vxlan_rdst *dst = &vxlan->default_dst;
+	struct ifla_vxlan_port_range ports = {
+		.low =  htons(vxlan->cfg.port_min),
+		.high = htons(vxlan->cfg.port_max),
+	};
+
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
+		goto nla_put_failure;
+
+	if (!vxlan_addr_any(&dst->remote_ip)) {
+		if (dst->remote_ip.sa.sa_family == AF_INET) {
+			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
+					    dst->remote_ip.sin.sin_addr.s_addr))
+				goto nla_put_failure;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
+					     &dst->remote_ip.sin6.sin6_addr))
+				goto nla_put_failure;
+#endif
+		}
+	}
+
+	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
+		goto nla_put_failure;
+
+	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
+		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
+			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
+					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
+				goto nla_put_failure;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
+					     &vxlan->cfg.saddr.sin6.sin6_addr))
+				goto nla_put_failure;
+#endif
+		}
+	}
+
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
+			!!(vxlan->flags & VXLAN_F_LEARN)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
+			!!(vxlan->flags & VXLAN_F_PROXY)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
+			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
+			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
+		       !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) ||
+	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
+	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
+	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
+	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
+			!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
+			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
+			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
+			!!(vxlan->flags & VXLAN_F_REMCSUM_TX)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
+			!!(vxlan->flags & VXLAN_F_REMCSUM_RX)))
+		goto nla_put_failure;
+
+	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
+		goto nla_put_failure;
+
+	if (vxlan->flags & VXLAN_F_GBP &&
+	    nla_put_flag(skb, IFLA_VXLAN_GBP))
+		goto nla_put_failure;
+
+	if (vxlan->flags & VXLAN_F_GPE &&
+	    nla_put_flag(skb, IFLA_VXLAN_GPE))
+		goto nla_put_failure;
 
-	if (nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port))
+	if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL &&
+	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
 		goto nla_put_failure;
 
 	return 0;
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 1e88c13..2b07e54 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -988,6 +988,8 @@  netdev_geneve_destroy(const char *name)
 #define IFLA_VXLAN_UDP_ZERO_CSUM6_RX 20
 #define IFLA_VXLAN_GBP 23
 #define IFLA_VXLAN_COLLECT_METADATA 25
+#define IFLA_VXLAN_LABEL 26
+#define IFLA_VXLAN_GPE 27
 #endif
 
 #if IFLA_GRE_MAX < 18
@@ -1037,6 +1039,9 @@  netdev_vxlan_create(struct netdev *netdev)
             if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP)) {
                 nl_msg_put_flag(&request, IFLA_VXLAN_GBP);
             }
+            else if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GPE)) {
+                nl_msg_put_flag(&request, IFLA_VXLAN_GPE);
+            }
             nl_msg_put_be16(&request, IFLA_VXLAN_PORT, tnl_cfg->dst_port);
         nl_msg_end_nested(&request, infodata_off);
     nl_msg_end_nested(&request, linkinfo_off);
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index ec5c44e..fa56af5 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -541,7 +541,9 @@  set_tunnel_config(struct netdev *dev_, const struct smap *args)
             while (ext) {
                 if (!strcmp(type, "vxlan") && !strcmp(ext, "gbp")) {
                     tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GBP);
-                } else {
+                } else if (!strcmp(type, "vxlan") && !strcmp(ext, "gpe")) {
+                    tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GPE);
+                } else {
                     VLOG_WARN("%s: unknown extension '%s'", name, ext);
                 }