[ovs-dev,v3,16/28] datapath: compat: Update Geneve and VxLAN modules.

Message ID	1467421181-121681-7-git-send-email-pshelar@ovn.org
State	Superseded
Headers	show Return-Path: <dev-bounces@openvswitch.org> Received-SPF: pass (mx3-pf1.cudamail.com: SPF record at ovn.org designates 217.70.183.198 as permitted sender) sender: pshelar@ovn.org) by relay6-d.mail.gandi.net (Postfix) with ESMTPSA id 7CF7DFB8A4; Sat, 2 Jul 2016 03:00:11 +0200 (CEST) From: Pravin B Shelar <pshelar@ovn.org> To: dev@openvswitch.org Date: Fri, 1 Jul 2016 17:59:38 -0700 datapath: compat: Update Geneve and VxLAN modules. Message-Id: <1467421181-121681-7-git-send-email-pshelar@ovn.org> In-Reply-To: <1467421181-121681-1-git-send-email-pshelar@ovn.org> References: <1467421181-121681-1-git-send-email-pshelar@ovn.org> Subject: [ovs-dev] [PATCH v3 16/28] datapath: compat: Update Geneve and VxLAN modules. Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: dev-bounces@openvswitch.org Sender: "dev" <dev-bounces@openvswitch.org>

diff --git a/acinclude.m4 b/acinclude.m4 index 874eff3..4b50985 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -392,6 +392,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/linux/etherdevice.h], [eth_hw_addr_random]) OVS_GREP_IFELSE([$KSRC/include/linux/etherdevice.h], [ether_addr_copy]) + OVS_GREP_IFELSE([$KSRC/nclude/linux/if_ether.h], [inner_eth_hdr]) OVS_GREP_IFELSE([$KSRC/include/uapi/linux/if_link.h], [IFLA_GENEVE_TOS]) OVS_GREP_IFELSE([$KSRC/include/uapi/linux/if_link.h], [rtnl_link_stats64]) @@ -432,11 +433,18 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/net/inetpeer.h], [vif], [OVS_DEFINE([HAVE_INETPEER_VIF_SUPPORT])]) + dnl Check for dst_cache and ipv6 label to use backported tunnel infrastructure. + dnl OVS does not really need ipv6 label field, but its presence signifies that + dnl the stack has all required ipv6 support. + dnl OVS also does not need dst_cache, but this dependency allows us to write + dnl much cleaner code. + OVS_FIND_FIELD_IFELSE([$KSRC/include/net/ip_tunnels.h], [ip_tunnel_key], [label], [OVS_GREP_IFELSE([$KSRC/include/net/ip_tunnels.h], [iptunnel_pull_offloads], - [OVS_DEFINE([USE_UPSTREAM_TUNNEL])])]) + [OVS_GREP_IFELSE([$KSRC/include/net/dst_cache.h], [dst_cache], + [OVS_DEFINE([USE_UPSTREAM_TUNNEL])])])]) OVS_GREP_IFELSE([$KSRC/include/linux/net.h], [sock_create_kern.*net], [OVS_DEFINE([HAVE_SOCK_CREATE_KERN_NET])]) @@ -448,6 +456,8 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [__skb_gso_segment]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [can_checksum_protocol]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [ndo_get_iflink]) + OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [ndo_add_vxlan_port]) + OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [ndo_add_geneve_port]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [netdev_features_t]) dnl Ubuntu kernel 3.13 has defined this struct but not used for netdev->tstats. 
dnl So check type of tstats. @@ -511,6 +521,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ [OVS_DEFINE([HAVE_PROTO_DATA_VALID])]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_checksum_start_offset]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_protocol]) + OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_protocol_type]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_mac_header]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_network_header]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [kfree_skb_list]) @@ -607,7 +618,6 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/net/vxlan.h], [struct vxlan_metadata], [OVS_DEFINE([HAVE_VXLAN_METADATA])]) - OVS_GREP_IFELSE([$KSRC/include/net/vxlan.h], [VXLAN_HF_RCO]) OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [udp_flow_src_port], [OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [inet_get_local_port_range(net], [OVS_DEFINE([HAVE_UDP_FLOW_SRC_PORT])])]) diff --git a/datapath/linux/compat/geneve.c b/datapath/linux/compat/geneve.c index 127b90e..2ed9ad1 100644 --- a/datapath/linux/compat/geneve.c +++ b/datapath/linux/compat/geneve.c @@ -18,18 +18,29 @@ #include <linux/if_link.h> #include <linux/if_vlan.h> +#include <net/addrconf.h> #include <net/dst_metadata.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/geneve.h> #include <net/protocol.h> +#include <net/udp_tunnel.h> +#include <net/ip6_route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/ip6_tunnel.h> +#include <net/ip6_checksum.h> +#endif + #include "gso.h" #include "vport-netdev.h" #include "compat.h" #ifndef USE_UPSTREAM_TUNNEL + #define GENEVE_NETDEV_VER "0.6" #define GENEVE_UDP_PORT 6081 @@ -51,31 +62,50 @@ struct geneve_net { static int geneve_net_id; +union geneve_addr { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr sa; +}; + +static union geneve_addr geneve_remote_unspec = { 
.sa.sa_family = AF_UNSPEC, }; + /* Pseudo network device */ struct geneve_dev { struct hlist_node hlist; /* vni hash table */ struct net *net; /* netns for packet i/o */ struct net_device *dev; /* netdev for geneve tunnel */ - struct geneve_sock *sock; /* socket used for geneve tunnel */ + struct geneve_sock *sock4; /* IPv4 socket used for geneve tunnel */ +#if IS_ENABLED(CONFIG_IPV6) + struct geneve_sock *sock6; /* IPv6 socket used for geneve tunnel */ +#endif u8 vni[3]; /* virtual network ID for tunnel */ u8 ttl; /* TTL override */ u8 tos; /* TOS override */ - struct sockaddr_in remote; /* IPv4 address for link partner */ + union geneve_addr remote; /* IP address for link partner */ struct list_head next; /* geneve's per namespace list */ + __be32 label; /* IPv6 flowlabel override */ __be16 dst_port; bool collect_md; + u32 flags; }; +/* Geneve device flags */ +#define GENEVE_F_UDP_ZERO_CSUM_TX BIT(0) +#define GENEVE_F_UDP_ZERO_CSUM6_TX BIT(1) +#define GENEVE_F_UDP_ZERO_CSUM6_RX BIT(2) + struct geneve_sock { bool collect_md; struct list_head list; struct socket *sock; struct rcu_head rcu; int refcnt; + struct hlist_head vni_list[VNI_HASH_SIZE]; + u32 flags; #ifdef HAVE_UDP_OFFLOAD struct udp_offload udp_offloads; #endif - struct hlist_head vni_list[VNI_HASH_SIZE]; }; static inline __u32 geneve_net_vni_hash(u8 vni[3]) @@ -97,6 +127,11 @@ static __be64 vni_to_tunnel_id(const __u8 *vni) #endif } +static sa_family_t geneve_get_sk_family(struct geneve_sock *gs) +{ + return gs->sock->sk->sk_family; +} + static struct geneve_dev *geneve_lookup(struct geneve_sock *gs, __be32 addr, u8 vni[]) { @@ -109,48 +144,95 @@ static struct geneve_dev *geneve_lookup(struct geneve_sock *gs, vni_list_head = &gs->vni_list[hash]; hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) { if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && - addr == geneve->remote.sin_addr.s_addr) + addr == geneve->remote.sin.sin_addr.s_addr) + return geneve; + } + return NULL; +} + +#if 
IS_ENABLED(CONFIG_IPV6) +static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs, + struct in6_addr addr6, u8 vni[]) +{ + struct hlist_head *vni_list_head; + struct geneve_dev *geneve; + __u32 hash; + + /* Find the device for this VNI */ + hash = geneve_net_vni_hash(vni); + vni_list_head = &gs->vni_list[hash]; + hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) { + if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && + ipv6_addr_equal(&addr6, &geneve->remote.sin6.sin6_addr)) return geneve; } return NULL; } +#endif static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) { return (struct genevehdr *)(udp_hdr(skb) + 1); } -/* geneve receive/decap routine */ -static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) +static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs, + struct sk_buff *skb) { - struct genevehdr *gnvh = geneve_hdr(skb); - struct metadata_dst *tun_dst; - struct geneve_dev *geneve = NULL; - struct pcpu_sw_netstats *stats; - struct iphdr *iph; u8 *vni; __be32 addr; - int err; - union { - struct metadata_dst dst; - char buf[sizeof(struct metadata_dst) + 256]; - } buf; + static u8 zero_vni[3]; +#if IS_ENABLED(CONFIG_IPV6) + static struct in6_addr zero_addr6; +#endif - iph = ip_hdr(skb); /* outer IP header... */ + if (geneve_get_sk_family(gs) == AF_INET) { + struct iphdr *iph; - if (gs->collect_md) { - static u8 zero_vni[3]; + iph = ip_hdr(skb); /* outer IP header... */ - vni = zero_vni; - addr = 0; - } else { - vni = gnvh->vni; - addr = iph->saddr; + if (gs->collect_md) { + vni = zero_vni; + addr = 0; + } else { + vni = geneve_hdr(skb)->vni; + addr = iph->saddr; + } + + return geneve_lookup(gs, addr, vni); +#if IS_ENABLED(CONFIG_IPV6) + } else if (geneve_get_sk_family(gs) == AF_INET6) { + struct ipv6hdr *ip6h; + struct in6_addr addr6; + + ip6h = ipv6_hdr(skb); /* outer IPv6 header... 
*/ + + if (gs->collect_md) { + vni = zero_vni; + addr6 = zero_addr6; + } else { + vni = geneve_hdr(skb)->vni; + addr6 = ip6h->saddr; + } + + return geneve6_lookup(gs, addr6, vni); +#endif } + return NULL; +} - geneve = geneve_lookup(gs, addr, vni); - if (!geneve) - goto drop; +/* geneve receive/decap routine */ +static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, + struct sk_buff *skb) +{ + struct genevehdr *gnvh = geneve_hdr(skb); + struct metadata_dst *tun_dst = NULL; + struct pcpu_sw_netstats *stats; + int err = 0; + void *oiph; + union { + struct metadata_dst dst; + char buf[sizeof(struct metadata_dst) + 256]; + } buf; if (ip_tunnel_collect_metadata() || gs->collect_md) { __be16 flags; @@ -160,8 +242,12 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) (gnvh->critical ? TUNNEL_CRIT_OPT : 0); tun_dst = &buf.dst; - ovs_udp_tun_rx_dst(&tun_dst->u.tun_info, skb, AF_INET, flags, - vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); + ovs_udp_tun_rx_dst(&tun_dst->u.tun_info, + skb, geneve_get_sk_family(gs), flags, + vni_to_tunnel_id(gnvh->vni), + gnvh->opt_len * 4); + if (!tun_dst) + goto drop; /* Update tunnel dst according to Geneve options. */ ip_tunnel_info_opts_set(&tun_dst->u.tun_info, gnvh->options, gnvh->opt_len * 4); @@ -169,7 +255,6 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) /* Drop packets w/ critical options, * since we don't support any... 
*/ - tun_dst = NULL; if (gnvh->critical) goto drop; } @@ -180,30 +265,33 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) if (tun_dst) ovs_skb_dst_set(skb, &tun_dst->dst); - else - goto drop; + /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) goto drop; + oiph = skb_network_header(skb); skb_reset_network_header(skb); - err = IP_ECN_decapsulate(iph, skb); - - if (unlikely(err)) { - if (err > 1) { - ++geneve->dev->stats.rx_frame_errors; - ++geneve->dev->stats.rx_errors; - goto drop; - } + if (geneve_get_sk_family(gs) == AF_INET) + err = IP_ECN_decapsulate(oiph, skb); +#if IS_ENABLED(CONFIG_IPV6) + else + err = IP6_ECN_decapsulate(oiph, skb); +#endif + if (unlikely(err > 1)) { + ++geneve->dev->stats.rx_frame_errors; + ++geneve->dev->stats.rx_errors; + goto drop; } - stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)geneve->dev->tstats); + stats = this_cpu_ptr(geneve->dev->tstats); u64_stats_update_begin(&stats->syncp); stats->rx_packets++; stats->rx_bytes += skb->len; u64_stats_update_end(&stats->syncp); - netdev_port_receive(skb, &tun_dst->u.tun_info); + + netdev_port_receive(skb, skb_tunnel_info(skb)); return; drop: /* Consume bad packet */ @@ -213,7 +301,7 @@ drop: /* Setup stats when device is created */ static int geneve_init(struct net_device *dev) { - dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; @@ -229,6 +317,7 @@ static void geneve_uninit(struct net_device *dev) static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct genevehdr *geneveh; + struct geneve_dev *geneve; struct geneve_sock *gs; int opts_len; @@ -244,16 +333,21 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) goto error; - opts_len = geneveh->opt_len * 4; - if 
(iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB), false)) - goto drop; - gs = rcu_dereference_sk_user_data(sk); if (!gs) goto drop; - geneve_rx(gs, skb); + geneve = geneve_lookup_skb(gs, skb); + if (!geneve) + goto drop; + + opts_len = geneveh->opt_len * 4; + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, + htons(ETH_P_TEB), + !net_eq(geneve->net, dev_net(geneve->dev)))) + goto drop; + + geneve_rx(geneve, gs, skb); return 0; drop: @@ -267,7 +361,7 @@ error: } static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port) + __be16 port, u32 flags) { struct socket *sock; struct udp_port_cfg udp_conf; @@ -277,6 +371,9 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6, if (ipv6) { udp_conf.family = AF_INET6; + udp_conf.ipv6_v6only = 1; + udp_conf.use_udp6_rx_checksums = + !(flags & GENEVE_F_UDP_ZERO_CSUM6_RX); } else { udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = htonl(INADDR_ANY); @@ -292,9 +389,24 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6, return sock; } -#ifdef HAVE_UDP_OFFLOAD static void geneve_notify_add_rx_port(struct geneve_sock *gs) { +#ifdef HAVE_NDO_ADD_GENEVE_PORT + struct net_device *dev; + struct sock *sk = gs->sock->sk; + struct net *net = sock_net(sk); + sa_family_t sa_family = geneve_get_sk_family(gs); + __be16 port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->netdev_ops->ndo_add_geneve_port) + dev->netdev_ops->ndo_add_geneve_port(dev, sa_family, + port); + } + rcu_read_unlock(); +#else +#ifdef HAVE_UDP_OFFLOAD struct sock *sk = gs->sock->sk; sa_family_t sa_family = sk->sk_family; int err; @@ -305,8 +417,43 @@ static void geneve_notify_add_rx_port(struct geneve_sock *gs) pr_warn("geneve: udp_add_offload failed with status %d\n", err); } +#endif +#endif + +} + +static void geneve_notify_del_rx_port(struct geneve_sock *gs) +{ +#ifdef HAVE_NDO_ADD_GENEVE_PORT + struct net_device *dev; + 
struct sock *sk = gs->sock->sk; + struct net *net = sock_net(sk); + sa_family_t sa_family = geneve_get_sk_family(gs); + __be16 port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->netdev_ops->ndo_del_geneve_port) + dev->netdev_ops->ndo_del_geneve_port(dev, sa_family, + port); + } + + rcu_read_unlock(); +#else +#ifdef HAVE_UDP_OFFLOAD + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + + if (sa_family == AF_INET) + udp_del_offload(&gs->udp_offloads); +#endif + +#endif } +#if defined(HAVE_UDP_OFFLOAD) || \ + defined(HAVE_UDP_TUNNEL_SOCK_CFG_GRO_RECEIVE) + static int geneve_hlen(struct genevehdr *gh) { return sizeof(*gh) + gh->opt_len * 4; @@ -348,8 +495,6 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head, goto out; } - flush = 0; - for (p = *head; p; p = p->next) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -366,14 +511,13 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head, rcu_read_lock(); ptype = gro_find_receive_by_type(type); - if (!ptype) { - flush = 1; + if (!ptype) goto out_unlock; - } skb_gro_pull(skb, gh_len); skb_gro_postpull_rcsum(skb, gh, gh_len); pp = ptype->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); @@ -414,7 +558,7 @@ static int geneve_gro_complete(struct sk_buff *skb, int nhoff, /* Create new listen socket if needed */ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - bool ipv6) + bool ipv6, u32 flags) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; @@ -426,7 +570,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, if (!gs) return ERR_PTR(-ENOMEM); - sock = geneve_create_sock(net, ipv6, port); + sock = geneve_create_sock(net, ipv6, port, flags); if (IS_ERR(sock)) { kfree(gs); return ERR_CAST(sock); @@ -442,11 +586,17 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, gs->udp_offloads.port 
= port; gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; - geneve_notify_add_rx_port(gs); #endif + + geneve_notify_add_rx_port(gs); /* Mark socket as an encapsulation socket */ + memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = gs; tunnel_cfg.encap_type = 1; +#ifdef HAVE_UDP_TUNNEL_SOCK_CFG_GRO_RECEIVE + tunnel_cfg.gro_receive = geneve_gro_receive; + tunnel_cfg.gro_complete = geneve_gro_complete; +#endif tunnel_cfg.encap_rcv = geneve_udp_encap_recv; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, sock, &tunnel_cfg); @@ -454,135 +604,190 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, return gs; } -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ -#ifdef HAVE_UDP_OFFLOAD - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - - if (sa_family == AF_INET) - udp_del_offload(&gs->udp_offloads); -#endif -} - -static void free_gs_rcu(struct rcu_head *rcu) -{ - struct geneve_sock *gs = container_of(rcu, struct geneve_sock, rcu); - - kfree(gs); -} - -static void geneve_sock_release(struct geneve_sock *gs) +static void __geneve_sock_release(struct geneve_sock *gs) { - if (--gs->refcnt) + if (!gs || --gs->refcnt) return; list_del(&gs->list); geneve_notify_del_rx_port(gs); udp_tunnel_sock_release(gs->sock); - call_rcu(&gs->rcu, free_gs_rcu); + kfree_rcu(gs, rcu); +} + +static void geneve_sock_release(struct geneve_dev *geneve) +{ + __geneve_sock_release(geneve->sock4); +#if IS_ENABLED(CONFIG_IPV6) + __geneve_sock_release(geneve->sock6); +#endif } static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, + sa_family_t family, __be16 dst_port) { struct geneve_sock *gs; list_for_each_entry(gs, &gn->sock_list, list) { if (inet_sk(gs->sock->sk)->inet_sport == dst_port && - inet_sk(gs->sock->sk)->sk.sk_family == AF_INET) { + geneve_get_sk_family(gs) == family) { return gs; } } return NULL; } -static 
int geneve_open(struct net_device *dev) +static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) { - struct geneve_dev *geneve = netdev_priv(dev); struct net *net = geneve->net; struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; __u32 hash; - gs = geneve_find_sock(gn, geneve->dst_port); + gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->dst_port); if (gs) { gs->refcnt++; goto out; } - gs = geneve_socket_create(net, geneve->dst_port, false); + gs = geneve_socket_create(net, geneve->dst_port, ipv6, geneve->flags); if (IS_ERR(gs)) return PTR_ERR(gs); out: gs->collect_md = geneve->collect_md; - geneve->sock = gs; + gs->flags = geneve->flags; +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6) + geneve->sock6 = gs; + else +#endif + geneve->sock4 = gs; hash = geneve_net_vni_hash(geneve->vni); hlist_add_head_rcu(&geneve->hlist, &gs->vni_list[hash]); return 0; } +static int geneve_open(struct net_device *dev) +{ + struct geneve_dev *geneve = netdev_priv(dev); + bool ipv6 = geneve->remote.sa.sa_family == AF_INET6; + bool metadata = geneve->collect_md; + int ret = 0; + + geneve->sock4 = NULL; +#if IS_ENABLED(CONFIG_IPV6) + geneve->sock6 = NULL; + if (ipv6 || metadata) + ret = geneve_sock_add(geneve, true); +#endif + + if (!ret && (!ipv6 || metadata)) + ret = geneve_sock_add(geneve, false); + if (ret < 0) + geneve_sock_release(geneve); + + return ret; +} + static int geneve_stop(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); - struct geneve_sock *gs = geneve->sock; if (!hlist_unhashed(&geneve->hlist)) hlist_del_rcu(&geneve->hlist); - geneve_sock_release(gs); + geneve_sock_release(geneve); return 0; } +static void geneve_build_header(struct genevehdr *geneveh, + __be16 tun_flags, u8 vni[3], + u8 options_len, u8 *options) +{ + geneveh->ver = GENEVE_VER; + geneveh->opt_len = options_len / 4; + geneveh->oam = !!(tun_flags & TUNNEL_OAM); + geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); + geneveh->rsvd1 = 0; + 
memcpy(geneveh->vni, vni, 3); + geneveh->proto_type = htons(ETH_P_TEB); + geneveh->rsvd2 = 0; + + memcpy(geneveh->options, options, options_len); +} + static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb, __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum) + u32 flags, bool xnet) { struct genevehdr *gnvh; int min_headroom; int err; + bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM_TX); + + skb_scrub_packet(skb, xnet); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_rt; - skb = vlan_hwaccel_push_inside(skb); - if (!skb) { - err = -ENOMEM; - goto free_rt; - } - - err = udp_tunnel_handle_offloads(skb, csum, false); + err = udp_tunnel_handle_offloads(skb, udp_sum, false); if (err) goto free_rt; + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - gnvh->ver = GENEVE_VER; - gnvh->opt_len = opt_len / 4; - gnvh->oam = !!(tun_flags & TUNNEL_OAM); - gnvh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - gnvh->rsvd1 = 0; - memcpy(gnvh->vni, vni, 3); - gnvh->proto_type = htons(ETH_P_TEB); - gnvh->rsvd2 = 0; - memcpy(gnvh->options, opt, opt_len); + geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); return 0; free_rt: - kfree_skb(skb); ip_rt_put(rt); return err; } -static struct rtable *geneve_get_rt(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl4, - struct ip_tunnel_info *info) +#if IS_ENABLED(CONFIG_IPV6) +static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + u32 flags, bool xnet) +{ + struct genevehdr *gnvh; + int min_headroom; + int err; + bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM6_TX); + + skb_scrub_packet(skb, xnet); + + min_headroom = 
LL_RESERVED_SPACE(dst->dev) + dst->header_len + + GENEVE_BASE_HLEN + opt_len + sizeof(struct ipv6hdr); + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + goto free_dst; + + err = udp_tunnel_handle_offloads(skb, udp_sum, false); + if (err) + goto free_dst; + + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); + geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + + ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + return 0; + +free_dst: + dst_release(dst); + return err; +} +#endif + +static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, + struct net_device *dev, + struct flowi4 *fl4, + struct ip_tunnel_info *info) { struct geneve_dev *geneve = netdev_priv(dev); struct rtable *rt = NULL; @@ -605,7 +810,7 @@ static struct rtable *geneve_get_rt(struct sk_buff *skb, } fl4->flowi4_tos = RT_TOS(tos); - fl4->daddr = geneve->remote.sin_addr.s_addr; + fl4->daddr = geneve->remote.sin.sin_addr.s_addr; } rt = ip_route_output_key(geneve->net, fl4); @@ -621,6 +826,61 @@ static struct rtable *geneve_get_rt(struct sk_buff *skb, return rt; } +#if IS_ENABLED(CONFIG_IPV6) +static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, + struct ip_tunnel_info *info) +{ + struct geneve_dev *geneve = netdev_priv(dev); + struct geneve_sock *gs6 = geneve->sock6; + struct dst_entry *dst = NULL; + __u8 prio; + + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_mark = skb->mark; + fl6->flowi6_proto = IPPROTO_UDP; + + if (info) { + fl6->daddr = info->key.u.ipv6.dst; + fl6->saddr = info->key.u.ipv6.src; + fl6->flowlabel = ip6_make_flowinfo(RT_TOS(info->key.tos), + info->key.label); + } else { + prio = geneve->tos; + if (prio == 1) { + const struct iphdr *iip = ip_hdr(skb); + + prio = ip_tunnel_get_dsfield(iip, skb); + } + + fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio), + geneve->label); + fl6->daddr = geneve->remote.sin6.sin6_addr; + } + +#ifdef HAVE_IPV6_DST_LOOKUP_NET + if 
(ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) { +#else +#ifdef HAVE_IPV6_STUB + if (ipv6_stub->ipv6_dst_lookup(gs6->sock->sk, &dst, fl6)) { +#else + if (ip6_dst_lookup(gs6->sock->sk, &dst, fl6)) { +#endif +#endif + netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr); + return ERR_PTR(-ENETUNREACH); + } + if (dst->dev == dev) { /* is this necessary? */ + netdev_dbg(dev, "circular route to %pI6\n", &fl6->daddr); + dst_release(dst); + return ERR_PTR(-ELOOP); + } + + return dst; +} +#endif + /* Convert 64 bit tunnel ID to 24 bit VNI. */ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) { @@ -635,24 +895,23 @@ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) #endif } -netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb) +static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, + struct ip_tunnel_info *info) { - struct net_device *dev = skb->dev; struct geneve_dev *geneve = netdev_priv(dev); - struct geneve_sock *gs = geneve->sock; - struct ip_tunnel_info *info = NULL; + struct geneve_sock *gs4 = geneve->sock4; struct rtable *rt = NULL; const struct iphdr *iip; /* interior IP header */ int err = -EINVAL; struct flowi4 fl4; __u8 tos, ttl; __be16 sport; - bool udp_csum; __be16 df; + bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); + u32 flags = geneve->flags; if (geneve->collect_md) { - info = skb_tunnel_info(skb); - if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX))) { + if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { netdev_dbg(dev, "no tunnel metadata\n"); goto tx_error; } @@ -660,9 +919,9 @@ netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb) goto tx_error; } - rt = geneve_get_rt(skb, dev, &fl4, info); + rt = geneve_get_v4_rt(skb, dev, &fl4, info); if (IS_ERR(rt)) { - netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); + err = PTR_ERR(rt); goto tx_error; } @@ -677,24 +936,27 @@ netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb) u8 vni[3]; tunnel_id_to_vni(key->tun_id, vni); - if (key->tun_flags & 
TUNNEL_GENEVE_OPT) + if (info->options_len) opts = ip_tunnel_info_opts(info); - udp_csum = !!(key->tun_flags & TUNNEL_CSUM); + if (key->tun_flags & TUNNEL_CSUM) + flags &= ~GENEVE_F_UDP_ZERO_CSUM_TX; + else + flags |= GENEVE_F_UDP_ZERO_CSUM_TX; + err = geneve_build_skb(rt, skb, key->tun_flags, vni, - info->options_len, opts, udp_csum); + info->options_len, opts, flags, xnet); if (unlikely(err)) - goto err; + goto tx_error; tos = ip_tunnel_ecn_encap(key->tos, iip, skb); ttl = key->ttl; df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; } else { - udp_csum = false; err = geneve_build_skb(rt, skb, 0, geneve->vni, - 0, NULL, udp_csum); + 0, NULL, flags, xnet); if (unlikely(err)) - goto err; + goto tx_error; tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb); ttl = geneve->ttl; @@ -703,14 +965,16 @@ netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb) ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); df = 0; } - udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, fl4.saddr, fl4.daddr, + udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr, tos, ttl, df, sport, geneve->dst_port, - !net_eq(geneve->net, dev_net(geneve->dev)), !udp_csum); + !net_eq(geneve->net, dev_net(geneve->dev)), + !!(flags & GENEVE_F_UDP_ZERO_CSUM_TX)); + return NETDEV_TX_OK; tx_error: dev_kfree_skb(skb); -err: + if (err == -ELOOP) dev->stats.collisions++; else if (err == -ENETUNREACH) @@ -719,7 +983,114 @@ err: dev->stats.tx_errors++; return NETDEV_TX_OK; } -EXPORT_SYMBOL(rpl_geneve_xmit); + +#if IS_ENABLED(CONFIG_IPV6) +static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, + struct ip_tunnel_info *info) +{ + struct geneve_dev *geneve = netdev_priv(dev); + struct geneve_sock *gs6 = geneve->sock6; + struct dst_entry *dst = NULL; + const struct iphdr *iip; /* interior IP header */ + int err = -EINVAL; + struct flowi6 fl6; + __u8 prio, ttl; + __be16 sport; + __be32 label; + bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); + u32 flags = geneve->flags; + + if 
(geneve->collect_md) { + if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { + netdev_dbg(dev, "no tunnel metadata\n"); + goto tx_error; + } + } + + dst = geneve_get_v6_dst(skb, dev, &fl6, info); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto tx_error; + } + + sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); + skb_reset_mac_header(skb); + + iip = ip_hdr(skb); + + if (info) { + const struct ip_tunnel_key *key = &info->key; + u8 *opts = NULL; + u8 vni[3]; + + tunnel_id_to_vni(key->tun_id, vni); + if (info->options_len) + opts = ip_tunnel_info_opts(info); + + if (key->tun_flags & TUNNEL_CSUM) + flags &= ~GENEVE_F_UDP_ZERO_CSUM6_TX; + else + flags |= GENEVE_F_UDP_ZERO_CSUM6_TX; + + err = geneve6_build_skb(dst, skb, key->tun_flags, vni, + info->options_len, opts, + flags, xnet); + if (unlikely(err)) + goto tx_error; + + prio = ip_tunnel_ecn_encap(key->tos, iip, skb); + ttl = key->ttl; + label = info->key.label; + } else { + err = geneve6_build_skb(dst, skb, 0, geneve->vni, + 0, NULL, flags, xnet); + if (unlikely(err)) + goto tx_error; + + prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel), + iip, skb); + ttl = geneve->ttl; + if (!ttl && ipv6_addr_is_multicast(&fl6.daddr)) + ttl = 1; + ttl = ttl ? 
: ip6_dst_hoplimit(dst); + label = geneve->label; + } + udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev, + &fl6.saddr, &fl6.daddr, prio, ttl, label, + sport, geneve->dst_port, + !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX)); + return NETDEV_TX_OK; + +tx_error: + dev_kfree_skb(skb); + + if (err == -ELOOP) + dev->stats.collisions++; + else if (err == -ENETUNREACH) + dev->stats.tx_carrier_errors++; + else + dev->stats.tx_errors++; + return NETDEV_TX_OK; +} +#endif + +netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct geneve_dev *geneve = netdev_priv(dev); + struct ip_tunnel_info *info = NULL; + + if (geneve->collect_md) + info = skb_tunnel_info(skb); + +#if IS_ENABLED(CONFIG_IPV6) + if ((info && ip_tunnel_info_af(info) == AF_INET6) || + (!info && geneve->remote.sa.sa_family == AF_INET6)) + return geneve6_xmit_skb(skb, dev, info); +#endif + return geneve_xmit_skb(skb, dev, info); +} +EXPORT_SYMBOL_GPL(rpl_geneve_xmit); static netdev_tx_t geneve_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -739,7 +1110,7 @@ static int __geneve_change_mtu(struct net_device *dev, int new_mtu, bool strict) * configurations. 
*/ int max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - sizeof(struct iphdr) - - dev->hard_header_len; + - dev->hard_header_len; if (new_mtu < 68) return -EINVAL; @@ -770,7 +1141,7 @@ int ovs_geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; - rt = geneve_get_rt(skb, dev, &fl4, info); + rt = geneve_get_v4_rt(skb, dev, &fl4, info); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -786,15 +1157,15 @@ EXPORT_SYMBOL_GPL(ovs_geneve_fill_metadata_dst); static const struct net_device_ops geneve_netdev_ops = { .ndo_init = geneve_init, .ndo_uninit = geneve_uninit, - .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_open = geneve_open, .ndo_stop = geneve_stop, .ndo_start_xmit = geneve_dev_xmit, + .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_change_mtu = geneve_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = geneve_fill_metadata_dst, + .ndo_fill_metadata_dst = geneve_fill_metadata_dst, #endif }; @@ -815,6 +1186,34 @@ static struct device_type geneve_type = { .name = "geneve", }; +/* Calls the ndo_add_geneve_port of the caller in order to + * supply the listening GENEVE udp ports. Callers are expected + * to implement the ndo_add_geneve_port. + */ +static void geneve_push_rx_ports(struct net_device *dev) +{ +#ifdef HAVE_NDO_ADD_GENEVE_PORT + struct net *net = dev_net(dev); + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + sa_family_t sa_family; + struct sock *sk; + __be16 port; + + if (!dev->netdev_ops->ndo_add_geneve_port) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(gs, &gn->sock_list, list) { + sk = gs->sock->sk; + sa_family = sk->sk_family; + port = inet_sk(sk)->inet_sport; + dev->netdev_ops->ndo_add_geneve_port(dev, sa_family, port); + } + rcu_read_unlock(); +#endif +} + /* Initialize the device structure. 
*/ static void geneve_setup(struct net_device *dev) { @@ -831,14 +1230,13 @@ static void geneve_setup(struct net_device *dev) dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; -#endif + #if 0 - /* Not required */ netif_keep_dst(dev); #endif + dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; eth_hw_addr_random(dev); } @@ -846,10 +1244,15 @@ static void geneve_setup(struct net_device *dev) static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_ID] = { .type = NLA_U32 }, [IFLA_GENEVE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_GENEVE_REMOTE6] = { .len = sizeof(struct in6_addr) }, [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, + [IFLA_GENEVE_LABEL] = { .type = NLA_U32 }, [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, + [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, + [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -877,7 +1280,7 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, __be16 dst_port, - __be32 rem_addr, + union geneve_addr *remote, u8 vni[], bool *tun_on_same_port, bool *tun_collect_md) @@ -893,7 +1296,7 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, *tun_on_same_port = true; } if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && - rem_addr == geneve->remote.sin_addr.s_addr && + !memcmp(remote, &geneve->remote, sizeof(geneve->remote)) && dst_port == geneve->dst_port) t = geneve; } @@ -901,18 +1304,20 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net 
*gn, } static int geneve_configure(struct net *net, struct net_device *dev, - __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos, - __be16 dst_port, bool metadata) + union geneve_addr *remote, + __u32 vni, __u8 ttl, __u8 tos, __be32 label, + __be16 dst_port, bool metadata, u32 flags) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); bool tun_collect_md, tun_on_same_port; - int err; + int err, encap_len; - if (metadata) { - if (rem_addr || vni || tos || ttl) - return -EINVAL; - } + if (!remote) + return -EINVAL; + if (metadata && + (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl || label)) + return -EINVAL; geneve->net = net; geneve->dev = dev; @@ -921,20 +1326,36 @@ static int geneve_configure(struct net *net, struct net_device *dev, geneve->vni[1] = (vni & 0x0000ff00) >> 8; geneve->vni[2] = vni & 0x000000ff; - geneve->remote.sin_addr.s_addr = rem_addr; - if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr))) + if ((remote->sa.sa_family == AF_INET && + IN_MULTICAST(ntohl(remote->sin.sin_addr.s_addr))) || + (remote->sa.sa_family == AF_INET6 && + ipv6_addr_is_multicast(&remote->sin6.sin6_addr))) + return -EINVAL; + if (label && remote->sa.sa_family != AF_INET6) return -EINVAL; + geneve->remote = *remote; + geneve->ttl = ttl; geneve->tos = tos; + geneve->label = label; geneve->dst_port = dst_port; geneve->collect_md = metadata; + geneve->flags = flags; - t = geneve_find_dev(gn, dst_port, rem_addr, geneve->vni, + t = geneve_find_dev(gn, dst_port, remote, geneve->vni, &tun_on_same_port, &tun_collect_md); if (t) return -EBUSY; + /* make enough headroom for basic scenario */ + encap_len = GENEVE_BASE_HLEN + ETH_HLEN; + if (remote->sa.sa_family == AF_INET) + encap_len += sizeof(struct iphdr); + else + encap_len += sizeof(struct ipv6hdr); + dev->needed_headroom = encap_len + ETH_HLEN; + if (metadata) { if (tun_on_same_port) return -EPERM; @@ -951,27 +1372,43 @@ static int geneve_configure(struct net *net, 
struct net_device *dev, return 0; } -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) -static int geneve_newlink(struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -{ - struct net *net = &init_net; -#else static int geneve_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { -#endif __be16 dst_port = htons(GENEVE_UDP_PORT); __u8 ttl = 0, tos = 0; bool metadata = false; - __be32 rem_addr; - __u32 vni; + union geneve_addr remote = geneve_remote_unspec; + __be32 label = 0; + __u32 vni = 0; + u32 flags = 0; - if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE]) + if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) return -EINVAL; - vni = nla_get_u32(data[IFLA_GENEVE_ID]); - rem_addr = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); + if (data[IFLA_GENEVE_REMOTE]) { + remote.sa.sa_family = AF_INET; + remote.sin.sin_addr.s_addr = + nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); + } + + if (data[IFLA_GENEVE_REMOTE6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + remote.sa.sa_family = AF_INET6; + remote.sin6.sin6_addr = + nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); + + if (ipv6_addr_type(&remote.sin6.sin6_addr) & + IPV6_ADDR_LINKLOCAL) { + netdev_dbg(dev, "link-local remote is unsupported\n"); + return -EINVAL; + } + } + + if (data[IFLA_GENEVE_ID]) + vni = nla_get_u32(data[IFLA_GENEVE_ID]); if (data[IFLA_GENEVE_TTL]) ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); @@ -979,21 +1416,33 @@ static int geneve_newlink(struct net *net, struct net_device *dev, if (data[IFLA_GENEVE_TOS]) tos = nla_get_u8(data[IFLA_GENEVE_TOS]); + if (data[IFLA_GENEVE_LABEL]) + label = nla_get_be32(data[IFLA_GENEVE_LABEL]) & + IPV6_FLOWLABEL_MASK; + if (data[IFLA_GENEVE_PORT]) dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]); if (data[IFLA_GENEVE_COLLECT_METADATA]) metadata = true; - return geneve_configure(net, dev, rem_addr, vni, - ttl, tos, dst_port, metadata); + if (data[IFLA_GENEVE_UDP_CSUM] && + 
!nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) + flags |= GENEVE_F_UDP_ZERO_CSUM_TX; + + if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] && + nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX])) + flags |= GENEVE_F_UDP_ZERO_CSUM6_TX; + + if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] && + nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX])) + flags |= GENEVE_F_UDP_ZERO_CSUM6_RX; + + return geneve_configure(net, dev, &remote, vni, ttl, tos, label, + dst_port, metadata, flags); } -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) -static void geneve_dellink(struct net_device *dev) -#else static void geneve_dellink(struct net_device *dev, struct list_head *head) -#endif { struct geneve_dev *geneve = netdev_priv(dev); @@ -1004,11 +1453,15 @@ static void geneve_dellink(struct net_device *dev, struct list_head *head) static size_t geneve_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */ - nla_total_size(sizeof(struct in_addr)) + /* IFLA_GENEVE_REMOTE */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ + nla_total_size(sizeof(__be32)) + /* IFLA_GENEVE_LABEL */ nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ 0; } @@ -1021,12 +1474,21 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u32(skb, IFLA_GENEVE_ID, vni)) goto nla_put_failure; - if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, - geneve->remote.sin_addr.s_addr)) - goto nla_put_failure; + if (geneve->remote.sa.sa_family == AF_INET) { + if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, + geneve->remote.sin.sin_addr.s_addr)) + goto nla_put_failure; +#if IS_ENABLED(CONFIG_IPV6) + } 
else { + if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6, + &geneve->remote.sin6.sin6_addr)) + goto nla_put_failure; +#endif + } if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) || - nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos)) + nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos) || + nla_put_be32(skb, IFLA_GENEVE_LABEL, geneve->label)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port)) @@ -1037,6 +1499,14 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } + if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM, + !(geneve->flags & GENEVE_F_UDP_ZERO_CSUM_TX)) || + nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_TX)) || + nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_RX))) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1044,7 +1514,7 @@ nla_put_failure: } static struct rtnl_link_ops geneve_link_ops __read_mostly = { - .kind = "ovs_geneve", + .kind = "geneve", .maxtype = IFLA_GENEVE_MAX, .policy = geneve_policy, .priv_size = sizeof(struct geneve_dev), @@ -1057,19 +1527,21 @@ static struct rtnl_link_ops geneve_link_ops __read_mostly = { }; struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name, - u8 name_assign_type, u16 dst_port) + u8 name_assign_type, u16 dst_port) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; int err; memset(tb, 0, sizeof(tb)); - dev = rtnl_create_link(net, (char *) name, name_assign_type, + dev = rtnl_create_link(net, name, name_assign_type, &geneve_link_ops, tb); if (IS_ERR(dev)) return dev; - err = geneve_configure(net, dev, 0, 0, 0, 0, htons(dst_port), true); + err = geneve_configure(net, dev, &geneve_remote_unspec, + 0, 0, 0, 0, htons(dst_port), true, + GENEVE_F_UDP_ZERO_CSUM6_RX); if (err) goto err; @@ -1082,12 +1554,27 @@ struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name, return dev; -err: + err: free_netdev(dev); 
return ERR_PTR(err); } EXPORT_SYMBOL_GPL(rpl_geneve_dev_create_fb); +static int geneve_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (event == NETDEV_OFFLOAD_PUSH_GENEVE) + geneve_push_rx_ports(dev); + + return NOTIFY_DONE; +} + +static struct notifier_block geneve_notifier_block __read_mostly = { + .notifier_call = geneve_netdevice_event, +}; + static __net_init int geneve_init_net(struct net *net) { struct geneve_net *gn = net_generic(net, geneve_net_id); @@ -1140,12 +1627,19 @@ int rpl_geneve_init_module(void) if (rc) goto out1; - rc = rtnl_link_register(&geneve_link_ops); + rc = register_netdevice_notifier(&geneve_notifier_block); if (rc) goto out2; + rc = rtnl_link_register(&geneve_link_ops); + if (rc) + goto out3; + pr_info("Geneve tunneling driver\n"); return 0; + +out3: + unregister_netdevice_notifier(&geneve_notifier_block); out2: unregister_pernet_subsys(&geneve_net_ops); out1: @@ -1155,6 +1649,8 @@ out1: void rpl_geneve_cleanup_module(void) { rtnl_link_unregister(&geneve_link_ops); + unregister_netdevice_notifier(&geneve_notifier_block); unregister_pernet_subsys(&geneve_net_ops); } + #endif diff --git a/datapath/linux/compat/include/linux/if_ether.h b/datapath/linux/compat/include/linux/if_ether.h index 25f63ca..b2cb56d 100644 --- a/datapath/linux/compat/include/linux/if_ether.h +++ b/datapath/linux/compat/include/linux/if_ether.h @@ -11,4 +11,10 @@ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ #endif +#ifndef HAVE_INNER_ETH_HDR +static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) +{ + return (struct ethhdr *)skb_inner_mac_header(skb); +} +#endif #endif diff --git a/datapath/linux/compat/include/linux/if_link.h b/datapath/linux/compat/include/linux/if_link.h index 6209dcb..bd77e33 100644 --- a/datapath/linux/compat/include/linux/if_link.h +++ b/datapath/linux/compat/include/linux/if_link.h @@ -26,6 +26,21 @@ enum { #define 
IFLA_GENEVE_COLLECT_METADATA rpl_IFLA_GENEVE_COLLECT_METADATA IFLA_GENEVE_COLLECT_METADATA, +#define IFLA_GENEVE_REMOTE6 rpl_IFLA_GENEVE_REMOTE6 + IFLA_GENEVE_REMOTE6, + +#define IFLA_GENEVE_UDP_CSUM rpl_IFLA_GENEVE_UDP_CSUM + IFLA_GENEVE_UDP_CSUM, + +#define IFLA_GENEVE_UDP_ZERO_CSUM6_TX rpl_IFLA_GENEVE_UDP_ZERO_CSUM6_TX + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + +#define IFLA_GENEVE_UDP_ZERO_CSUM6_RX rpl_IFLA_GENEVE_UDP_ZERO_CSUM6_RX + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + +#define IFLA_GENEVE_LABEL rpl_IFLA_GENEVE_LABEL + IFLA_GENEVE_LABEL, + #define __IFLA_GENEVE_MAX rpl__IFLA_GENEVE_MAX __IFLA_GENEVE_MAX }; @@ -100,6 +115,11 @@ enum { IFLA_VXLAN_REMCSUM_NOPARTIAL, #define IFLA_VXLAN_COLLECT_METADATA rpl_IFLA_VXLAN_COLLECT_METADATA IFLA_VXLAN_COLLECT_METADATA, +#define IFLA_VXLAN_LABEL rpl_IFLA_VXLAN_LABEL + IFLA_VXLAN_LABEL, +#define IFLA_VXLAN_GPE rpl_IFLA_VXLAN_GPE + IFLA_VXLAN_GPE, + #define __IFLA_VXLAN_MAX rpl___IFLA_VXLAN_MAX __IFLA_VXLAN_MAX }; diff --git a/datapath/linux/compat/include/linux/netdev_features.h b/datapath/linux/compat/include/linux/netdev_features.h index e4a3107..a39bd4a 100644 --- a/datapath/linux/compat/include/linux/netdev_features.h +++ b/datapath/linux/compat/include/linux/netdev_features.h @@ -32,6 +32,10 @@ #define NETIF_F_GSO_SIT 0 #endif +#ifndef NETIF_F_CSUM_MASK +#define NETIF_F_CSUM_MASK 0 +#endif + #ifndef NETIF_F_GSO_UDP_TUNNEL #define NETIF_F_GSO_UDP_TUNNEL 0 #else diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h index ac612ef..8bb5947 100644 --- a/datapath/linux/compat/include/linux/netdevice.h +++ b/datapath/linux/compat/include/linux/netdevice.h @@ -253,4 +253,13 @@ do { \ #define dev_fill_metadata_dst ovs_dev_fill_metadata_dst int ovs_dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); #endif + +#ifndef NETDEV_OFFLOAD_PUSH_VXLAN +#define NETDEV_OFFLOAD_PUSH_VXLAN 0x001C +#endif + +#ifndef NETDEV_OFFLOAD_PUSH_GENEVE +#define NETDEV_OFFLOAD_PUSH_GENEVE 
0x001D +#endif + #endif /* __LINUX_NETDEVICE_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h index d79d498..7fe6a04 100644 --- a/datapath/linux/compat/include/net/ip_tunnels.h +++ b/datapath/linux/compat/include/net/ip_tunnels.h @@ -13,6 +13,7 @@ #include <linux/if_tunnel.h> #include <linux/types.h> #include <net/dsfield.h> +#include <net/dst_cache.h> #include <net/flow.h> #include <net/inet_ecn.h> #include <net/ip.h> @@ -137,6 +138,7 @@ struct ip_tunnel_key { struct ip_tunnel_info { struct ip_tunnel_key key; + struct dst_cache dst_cache; u8 options_len; u8 mode; }; @@ -195,7 +197,6 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, #define ip_tunnel_collect_metadata() true - #define ip_tunnel rpl_ip_tunnel struct ip_tunnel { diff --git a/datapath/linux/compat/include/net/rtnetlink.h b/datapath/linux/compat/include/net/rtnetlink.h index 6db4a76..74d6a27 100644 --- a/datapath/linux/compat/include/net/rtnetlink.h +++ b/datapath/linux/compat/include/net/rtnetlink.h @@ -24,7 +24,17 @@ static inline struct net_device *rpl_rtnl_create_link(struct net *net, const cha return rtnl_create_link(net, (char *)ifname, ops, tb); } #endif -#define rtnl_create_link rpl_rtnl_create_link +#else +/* This function is only defined to avoid a warning related to ifname. Some backported + * functions did not change the name to const type. 
*/ +static inline struct net_device *rpl_rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[]) +{ + return rtnl_create_link(net, (char *) ifname, name_assign_type, ops, tb); +} #endif +#define rtnl_create_link rpl_rtnl_create_link #endif diff --git a/datapath/linux/compat/include/net/udp_tunnel.h b/datapath/linux/compat/include/net/udp_tunnel.h index bdf1469..983041a 100644 --- a/datapath/linux/compat/include/net/udp_tunnel.h +++ b/datapath/linux/compat/include/net/udp_tunnel.h @@ -13,6 +13,7 @@ #else +#include <net/addrconf.h> #include <net/ip_tunnels.h> #include <net/udp.h> diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h index 77d260c..a6a5f30 100644 --- a/datapath/linux/compat/include/net/vxlan.h +++ b/datapath/linux/compat/include/net/vxlan.h @@ -5,7 +5,6 @@ #include <net/udp_tunnel.h> #endif - #ifdef USE_UPSTREAM_TUNNEL #include_next <net/vxlan.h> @@ -18,7 +17,7 @@ static inline void rpl_vxlan_cleanup_module(void) #define vxlan_xmit dev_queue_xmit -#else +#else /* USE_UPSTREAM_TUNNEL */ #include <linux/ip.h> #include <linux/ipv6.h> @@ -31,17 +30,71 @@ static inline void rpl_vxlan_cleanup_module(void) #include "compat.h" #include "gso.h" +/* VXLAN protocol (RFC 7348) header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |R|R|R|R|I|R|R|R| Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VXLAN Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * I = VXLAN Network Identifier (VNI) present. + */ +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +}; + +/* VXLAN header flags. 
*/ +#define VXLAN_HF_VNI cpu_to_be32(BIT(27)) + +#define VXLAN_N_VID (1u << 24) +#define VXLAN_VID_MASK (VXLAN_N_VID - 1) +#define VXLAN_VNI_MASK cpu_to_be32(VXLAN_VID_MASK << 8) +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) + #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) +#define FDB_HASH_BITS 8 +#define FDB_HASH_SIZE (1<<FDB_HASH_BITS) + +/* Remote checksum offload for VXLAN (VXLAN_F_REMCSUM_[RT]X): + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |R|R|R|R|I|R|R|R|R|R|C| Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VXLAN Network Identifier (VNI) |O| Csum start | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * C = Remote checksum offload bit. When set indicates that the + * remote checksum offload data is present. + * + * O = Offset bit. Indicates the checksum offset relative to + * checksum start. + * + * Csum start = Checksum start divided by two. + * + * http://tools.ietf.org/html/draft-herbert-vxlan-rco + */ + +/* VXLAN-RCO header flags. 
*/ +#define VXLAN_HF_RCO cpu_to_be32(BIT(21)) + +/* Remote checksum offload header option */ +#define VXLAN_RCO_MASK cpu_to_be32(0x7f) /* Last byte of vni field */ +#define VXLAN_RCO_UDP cpu_to_be32(0x80) /* Indicate UDP RCO (TCP when not set *) */ +#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ +#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) +#define VXLAN_MAX_REMCSUM_START (0x7f << VXLAN_RCO_SHIFT) /* - * VXLAN Group Based Policy Extension: + * VXLAN Group Based Policy Extension (VXLAN_F_GBP): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |1|-|-|-|1|-|-|-|R|D|R|R|A|R|R|R| Group Policy ID | + * |G|R|R|R|I|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * + * G = Group Policy ID present. + * * D = Don't Learn bit. When set, this bit indicates that the egress * VTEP MUST NOT learn the source address of the encapsulated frame. * @@ -49,18 +102,18 @@ static inline void rpl_vxlan_cleanup_module(void) * this packet. Policies MUST NOT be applied by devices when the * A bit is set. * - * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy + * https://tools.ietf.org/html/draft-smith-vxlan-group-policy */ struct vxlanhdr_gbp { - __u8 vx_flags; + u8 vx_flags; #ifdef __LITTLE_ENDIAN_BITFIELD - __u8 reserved_flags1:3, + u8 reserved_flags1:3, policy_applied:1, reserved_flags2:2, dont_learn:1, reserved_flags3:1; #elif defined(__BIG_ENDIAN_BITFIELD) - __u8 reserved_flags1:1, + u8 reserved_flags1:1, dont_learn:1, reserved_flags2:2, policy_applied:1, @@ -72,7 +125,10 @@ struct vxlanhdr_gbp { __be32 vx_vni; }; -#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | 0xFFFFFF) +/* VXLAN-GBP header flags. 
*/ +#define VXLAN_HF_GBP cpu_to_be32(BIT(31)) + +#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | cpu_to_be32(0xFFFFFF)) /* skb->mark mapping * @@ -84,61 +140,78 @@ struct vxlanhdr_gbp { #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) -/* VXLAN protocol header: +/* + * VXLAN Generic Protocol Extension (VXLAN_F_GPE): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |G|R|R|R|I|R|R|C| Reserved | + * |R|R|Ver|I|P|R|O| Reserved |Next Protocol | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * - * G = 1 Group Policy (VXLAN-GBP) - * I = 1 VXLAN Network Identifier (VNI) present - * C = 1 Remote checksum offload (RCO) + * Ver = Version. Indicates VXLAN GPE protocol version. + * + * P = Next Protocol Bit. The P bit is set to indicate that the + * Next Protocol field is present. + * + * O = OAM Flag Bit. The O bit is set to indicate that the packet + * is an OAM packet. + * + * Next Protocol = This 8 bit field indicates the protocol header + * immediately following the VXLAN GPE header. + * + * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01 */ -struct vxlanhdr { - __be32 vx_flags; - __be32 vx_vni; + +struct vxlanhdr_gpe { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 oam_flag:1, + reserved_flags1:1, + np_applied:1, + instance_applied:1, + version:2, +reserved_flags2:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved_flags2:2, + version:2, + instance_applied:1, + np_applied:1, + reserved_flags1:1, + oam_flag:1; +#endif + u8 reserved_flags3; + u8 reserved_flags4; + u8 next_protocol; + __be32 vx_vni; }; -/* VXLAN header flags. */ -#define VXLAN_HF_RCO BIT(21) -#define VXLAN_HF_VNI BIT(27) -#define VXLAN_HF_GBP BIT(31) +/* VXLAN-GPE header flags. 
*/ +#define VXLAN_HF_VER cpu_to_be32(BIT(29) | BIT(28)) +#define VXLAN_HF_NP cpu_to_be32(BIT(26)) +#define VXLAN_HF_OAM cpu_to_be32(BIT(24)) -/* Remote checksum offload header option */ -#define VXLAN_RCO_MASK 0x7f /* Last byte of vni field */ -#define VXLAN_RCO_UDP 0x80 /* Indicate UDP RCO (TCP when not set *) */ -#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ -#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) -#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT) +#define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \ + cpu_to_be32(0xff)) -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) +/* VXLAN-GPE header Next Protocol. */ +#define VXLAN_GPE_NP_IPV4 0x01 +#define VXLAN_GPE_NP_IPV6 0x02 +#define VXLAN_GPE_NP_ETHERNET 0x03 +#define VXLAN_GPE_NP_NSH 0x04 struct vxlan_metadata { - __be32 vni; - u32 gbp; + u32 gbp; }; -#define VNI_HASH_BITS 10 -#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) -#define FDB_HASH_BITS 8 -#define FDB_HASH_SIZE (1<<FDB_HASH_BITS) - /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; - struct work_struct del_work; struct socket *sock; - struct rcu_head rcu; struct hlist_head vni_list[VNI_HASH_SIZE]; atomic_t refcnt; + u32 flags; #ifdef HAVE_UDP_OFFLOAD struct udp_offload udp_offloads; #endif - u32 flags; }; union vxlan_addr { @@ -150,7 +223,7 @@ union vxlan_addr { struct vxlan_rdst { union vxlan_addr remote_ip; __be16 remote_port; - u32 remote_vni; + __be32 remote_vni; u32 remote_ifindex; struct list_head list; struct rcu_head rcu; @@ -159,14 +232,14 @@ struct vxlan_rdst { struct vxlan_config { union vxlan_addr remote_ip; union vxlan_addr saddr; - u32 vni; + __be32 vni; int remote_ifindex; int mtu; __be16 dst_port; - __u16 port_min; - __u16 port_max; - __u8 tos; - __u8 ttl; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; __be32 
label; u32 flags; unsigned long age_interval; @@ -178,7 +251,10 @@ struct vxlan_config { struct vxlan_dev { struct hlist_node hlist; /* vni hash table */ struct list_head next; /* vxlan's per namespace list */ - struct vxlan_sock *vn_sock; /* listening socket */ + struct vxlan_sock *vn4_sock; /* listening socket for IPv4 */ +#if IS_ENABLED(CONFIG_IPV6) + struct vxlan_sock *vn6_sock; /* listening socket for IPv6 */ +#endif struct net_device *dev; struct net *net; /* netns for packet i/o */ struct vxlan_rdst default_dst; /* default destination */ @@ -189,6 +265,7 @@ struct vxlan_dev { unsigned int addrcnt; struct vxlan_config cfg; + struct hlist_head fdb_head[FDB_HASH_SIZE]; }; @@ -198,7 +275,7 @@ struct vxlan_dev { #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 #define VXLAN_F_IPV6 0x20 -#define VXLAN_F_UDP_CSUM 0x40 +#define VXLAN_F_UDP_ZERO_CSUM_TX 0x40 #define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 #define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 #define VXLAN_F_REMCSUM_TX 0x200 @@ -206,22 +283,38 @@ struct vxlan_dev { #define VXLAN_F_GBP 0x800 #define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 #define VXLAN_F_COLLECT_METADATA 0x2000 +#define VXLAN_F_GPE 0x4000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable */ #define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \ + VXLAN_F_GPE | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ VXLAN_F_COLLECT_METADATA) + +/* Flags that can be set together with VXLAN_F_GPE. 
*/ +#define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ + VXLAN_F_IPV6 | \ + VXLAN_F_UDP_ZERO_CSUM_TX | \ + VXLAN_F_UDP_ZERO_CSUM6_TX | \ + VXLAN_F_UDP_ZERO_CSUM6_RX | \ + VXLAN_F_COLLECT_METADATA) + #define vxlan_dev_create rpl_vxlan_dev_create struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); -static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan) +static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan, + unsigned short family) { - return inet_sk(vxlan->vn_sock->sock->sk)->inet_sport; +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6) + return inet_sk(vxlan->vn6_sock->sock->sk)->inet_sport; +#endif + return inet_sk(vxlan->vn4_sock->sock->sk)->inet_sport; } static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, @@ -240,17 +333,21 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, l4_hdr = ipv6_hdr(skb)->nexthdr; break; default: - return features; + return features;; } if ((l4_hdr == IPPROTO_UDP) && ( -#ifdef ENCAP_TYPE_ETHER +#ifdef HAVE_INNER_PROTOCOL_TYPE skb->inner_protocol_type != ENCAP_TYPE_ETHER || #endif - ovs_skb_get_inner_protocol(skb) != htons(ETH_P_TEB) || +#ifdef HAVE_INNER_PROTOCOL + skb->inner_protocol != htons(ETH_P_TEB) || +#endif (skb_inner_mac_header(skb) - skb_transport_header(skb) != - sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) - return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + sizeof(struct udphdr) + sizeof(struct vxlanhdr)) || + (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, inner_eth_hdr(skb)->h_proto)))) + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } @@ -260,6 +357,74 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, /* IPv6 header + UDP + VXLAN + Ethernet header */ #define VXLAN6_HEADROOM (40 + 8 + 8 + 14) +static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) +{ + return (struct vxlanhdr 
*)(udp_hdr(skb) + 1); +} + +static inline __be32 vxlan_vni(__be32 vni_field) +{ +#if defined(__BIG_ENDIAN) + return (__force __be32)((__force u32)vni_field >> 8); +#else + return (__force __be32)((__force u32)(vni_field & VXLAN_VNI_MASK) << 8); +#endif +} + +static inline __be32 vxlan_vni_field(__be32 vni) +{ +#if defined(__BIG_ENDIAN) + return (__force __be32)((__force u32)vni << 8); +#else + return (__force __be32)((__force u32)vni >> 8); +#endif +} + +static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) +{ +#if defined(__BIG_ENDIAN) + return (__force __be32)tun_id; +#else + return (__force __be32)((__force u64)tun_id >> 32); +#endif +} + +static inline __be64 vxlan_vni_to_tun_id(__be32 vni) +{ +#if defined(__BIG_ENDIAN) + return (__force __be64)vni; +#else + return (__force __be64)((u64)(__force u32)vni << 32); +#endif +} + +static inline size_t vxlan_rco_start(__be32 vni_field) +{ + return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; +} + +static inline size_t vxlan_rco_offset(__be32 vni_field) +{ + return (vni_field & VXLAN_RCO_UDP) ? 
+ offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check); +} + +static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) +{ + __be32 vni_field = cpu_to_be32(start >> VXLAN_RCO_SHIFT); + + if (offset == offsetof(struct udphdr, check)) + vni_field |= VXLAN_RCO_UDP; + return vni_field; +} + +static inline void vxlan_get_rx_port(struct net_device *netdev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_VXLAN, netdev); +} + static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) { return vs->sock->sk->sk_family; @@ -268,13 +433,15 @@ static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) int rpl_vxlan_init_module(void); void rpl_vxlan_cleanup_module(void); +#define vxlan_fill_metadata_dst ovs_vxlan_fill_metadata_dst +int ovs_vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); + #define vxlan_xmit rpl_vxlan_xmit netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb); -#endif + +#endif /* USE_UPSTREAM_TUNNEL */ #define vxlan_init_module rpl_vxlan_init_module #define vxlan_cleanup_module rpl_vxlan_cleanup_module -#define vxlan_fill_metadata_dst ovs_vxlan_fill_metadata_dst -int ovs_vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); #endif diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c index 12aefaf..a12b172 100644 --- a/datapath/linux/compat/vxlan.c +++ b/datapath/linux/compat/vxlan.c @@ -18,6 +18,7 @@ #include <linux/skbuff.h> #include <linux/rculist.h> #include <linux/netdevice.h> +#include <linux/netdev_features.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/udp.h> @@ -27,8 +28,8 @@ #include <linux/if_vlan.h> #include <linux/hash.h> #include <linux/ethtool.h> -#include <linux/netdev_features.h> #include <net/arp.h> +#include <net/dst_metadata.h> #include <net/ndisc.h> #include <net/ip.h> #include <net/ip_tunnels.h> @@ -43,20 +44,20 @@ #include <net/netns/generic.h> #include <net/vxlan.h> #include <net/protocol.h> 
-#include <net/udp_tunnel.h> -#include <net/ip6_route.h> + #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> +#include <net/ip6_route.h> #endif -#include <net/dst_metadata.h> -#ifndef USE_UPSTREAM_TUNNEL #include "gso.h" #include "vport-netdev.h" +#include "compat.h" +#ifndef USE_UPSTREAM_TUNNEL #define VXLAN_VERSION "0.1" #define PORT_HASH_BITS 8 @@ -64,10 +65,6 @@ #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ -#ifndef NTF_SELF -#define NTF_SELF 0x02 -#endif - /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 * for compatibility with early adopters. @@ -79,10 +76,9 @@ MODULE_PARM_DESC(udp_port, "Destination UDP port"); static int vxlan_net_id; static struct rtnl_link_ops vxlan_link_ops; -static const u8 all_zeros_mac[ETH_ALEN]; +static const u8 all_zeros_mac[ETH_ALEN + 2]; -static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - bool no_share, u32 flags); +static int vxlan_sock_add(struct vxlan_dev *vxlan); /* per-network namespace private data for this module */ struct vxlan_net { @@ -105,7 +101,6 @@ struct vxlan_fdb { /* salt for hash table */ static u32 vxlan_salt __read_mostly; -static struct workqueue_struct *vxlan_wq; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { @@ -158,13 +153,12 @@ static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) { return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); } - #endif /* Virtual Network hash table head */ -static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) +static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni) { - return &vs->vni_list[hash_32(id, VNI_HASH_BITS)]; + return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)]; } /* Socket hash table head */ @@ -175,19 +169,6 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) 
return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; } -/* First remote destination for a forwarding entry. - * Guaranteed to be non-NULL because remotes are never deleted. - */ -static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) -{ - return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); -} - -static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) -{ - return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); -} - /* Find VXLAN socket based on network namespace, address family and UDP port * and enabled unshareable flags. */ @@ -207,12 +188,16 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, return NULL; } -static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id) +static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, __be32 vni) { struct vxlan_dev *vxlan; - hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { - if (vxlan->default_dst.remote_vni == id) + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_COLLECT_METADATA) + vni = 0; + + hlist_for_each_entry_rcu(vxlan, vni_head(vs, vni), hlist) { + if (vxlan->default_dst.remote_vni == vni) return vxlan; } @@ -220,7 +205,7 @@ static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id) } /* Look up VNI in a per net namespace table */ -static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, +static struct vxlan_dev *vxlan_find_vni(struct net *net, __be32 vni, sa_family_t family, __be16 port, u32 flags) { @@ -230,18 +215,23 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, if (!vs) return NULL; - return vxlan_vs_find_vni(vs, id); + return vxlan_vs_find_vni(vs, vni); } -/* Fill in neighbour message in skbuff. 
*/ -static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, - const struct vxlan_fdb *fdb, - u32 portid, u32 seq, int type, unsigned int flags, - const struct vxlan_rdst *rdst) +static int vxlan_fdb_create(struct vxlan_dev *vxlan, + const u8 *mac, union vxlan_addr *ip, + __u16 state, __u16 flags, + __be16 port, __u32 vni, __u32 ifindex, + __u8 ndm_flags) { return -EINVAL; } +static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) +{ + +} + static inline size_t vxlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) @@ -254,149 +244,14 @@ static inline size_t vxlan_nlmsg_size(void) + nla_total_size(sizeof(struct nda_cacheinfo)); } -static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, - struct vxlan_rdst *rd, int type) -{ - struct net *net = dev_net(vxlan->dev); - struct sk_buff *skb; - int err = -ENOBUFS; - - skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); - if (skb == NULL) - goto errout; - - err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd); - if (err < 0) { - /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; - } - - rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; -errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); -} - -/* Hash Ethernet address */ -static u32 eth_hash(const unsigned char *addr) -{ - u64 value = get_unaligned((u64 *)addr); - - /* only want 6 bytes */ -#ifdef __BIG_ENDIAN - value >>= 16; -#else - value <<= 16; -#endif - return hash_64(value, FDB_HASH_BITS); -} - -/* Hash chain to use given mac address */ -static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, - const u8 *mac) -{ - return &vxlan->fdb_head[eth_hash(mac)]; -} - -/* Look up Ethernet address in forwarding table */ -static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac) -{ - struct hlist_head *head = vxlan_fdb_head(vxlan, mac); - struct vxlan_fdb *f; - - 
hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->eth_addr)) - return f; - } - - return NULL; -} - -static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac) -{ - struct vxlan_fdb *f; - - f = __vxlan_find_mac(vxlan, mac); - if (f) - f->used = jiffies; - - return f; -} - -/* caller should hold vxlan->hash_lock */ -static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, - union vxlan_addr *ip, __be16 port, - __u32 vni, __u32 ifindex) -{ - struct vxlan_rdst *rd; - - list_for_each_entry(rd, &f->remotes, list) { - if (vxlan_addr_equal(&rd->remote_ip, ip) && - rd->remote_port == port && - rd->remote_vni == vni && - rd->remote_ifindex == ifindex) - return rd; - } - - return NULL; -} - -/* Replace destination of unicast mac */ -static int vxlan_fdb_replace(struct vxlan_fdb *f, - union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) -{ - struct vxlan_rdst *rd; - - rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); - if (rd) - return 0; - - rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); - if (!rd) - return 0; - rd->remote_ip = *ip; - rd->remote_port = port; - rd->remote_vni = vni; - rd->remote_ifindex = ifindex; - return 1; -} - -/* Add/update destinations for multicast */ -static int vxlan_fdb_append(struct vxlan_fdb *f, - union vxlan_addr *ip, __be16 port, __u32 vni, - __u32 ifindex, struct vxlan_rdst **rdp) -{ - struct vxlan_rdst *rd; - - rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); - if (rd) - return 0; - - rd = kmalloc(sizeof(*rd), GFP_ATOMIC); - if (rd == NULL) - return -ENOBUFS; - rd->remote_ip = *ip; - rd->remote_port = port; - rd->remote_vni = vni; - rd->remote_ifindex = ifindex; - - list_add_tail_rcu(&rd->list, &f->remotes); - - *rdp = rd; - return 1; -} - #ifdef HAVE_UDP_OFFLOAD #ifdef HAVE_NETIF_F_GSO_TUNNEL_REMCSUM + static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, unsigned int off, struct vxlanhdr *vh, size_t hdrlen, - u32 data, struct gro_remcsum 
*grc, + __be32 vni_field, + struct gro_remcsum *grc, bool nopartial) { size_t start, offset; @@ -407,10 +262,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; - start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; - offset = start + ((data & VXLAN_RCO_UDP) ? - offsetof(struct udphdr, check) : - offsetof(struct tcphdr, check)); + start = vxlan_rco_start(vni_field); + offset = start + vxlan_rco_offset(vni_field); vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, start, offset, grc, nopartial); @@ -421,10 +274,10 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, } #else static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, - unsigned int off, - struct vxlanhdr *vh, size_t hdrlen, - u32 data, struct gro_remcsum *grc, - bool nopartial) + unsigned int off, + struct vxlanhdr *vh, size_t hdrlen, + u32 data, struct gro_remcsum *grc, + bool nopartial) { return NULL; } @@ -432,16 +285,16 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, #ifndef HAVE_UDP_OFFLOAD_ARG_UOFF static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb) #else static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) + struct sk_buff *skb, + struct udp_offload *uoff) #endif { #ifdef HAVE_UDP_OFFLOAD_ARG_UOFF struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, - udp_offloads); + udp_offloads); #else struct vxlan_sock *vs = NULL; #endif @@ -449,7 +302,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; int flush = 1; - u32 flags; + __be32 flags; struct gro_remcsum grc; skb_gro_remcsum_init(&grc); @@ -465,12 +318,11 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); - flags = ntohl(vh->vx_flags); - - if ((flags & VXLAN_HF_RCO) && 
vs && (vs->flags & VXLAN_F_REMCSUM_RX)) { + flags = vh->vx_flags; + if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), - ntohl(vh->vx_vni), &grc, + vh->vx_vni, &grc, !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); @@ -480,8 +332,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ - flush = 0; - for (p = *head; p; p = p->next) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -495,6 +345,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, } pp = eth_gro_receive(head, skb); + flush = 0; out: skb_gro_remcsum_cleanup(skb, &grc); @@ -507,29 +358,24 @@ out: static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) #else static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) + struct udp_offload *uoff) #endif { udp_tunnel_gro_complete(skb, nhoff); return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } +#endif /* Notify netdevs that UDP port started listening */ static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) { +#ifdef HAVE_NDO_ADD_VXLAN_PORT struct net_device *dev; struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); sa_family_t sa_family = vxlan_get_sk_family(vs); __be16 port = inet_sk(sk)->inet_sport; - int err; - - if (sa_family == AF_INET) { - err = udp_add_offload(&vs->udp_offloads); - if (err) - pr_warn("vxlan: udp_add_offload failed with status %d\n", err); - } rcu_read_lock(); for_each_netdev_rcu(net, dev) { @@ -538,11 +384,29 @@ static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) port); } rcu_read_unlock(); +#else + +#ifdef HAVE_UDP_OFFLOAD + struct net_device *dev; + struct sock *sk = vs->sock->sk; + sa_family_t sa_family = vxlan_get_sk_family(vs); + + if (sa_family == AF_INET) { + int err; + + err = udp_add_offload(&vs->udp_offloads); + if (err) + pr_warn("vxlan: udp_add_offload failed with status %d\n", 
err); + } + +#endif +#endif } /* Notify netdevs that UDP port is no more listening */ static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) { +#ifdef HAVE_NDO_ADD_VXLAN_PORT struct net_device *dev; struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); @@ -556,186 +420,45 @@ static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) port); } rcu_read_unlock(); +#else +#ifdef HAVE_UDP_OFFLOAD + struct sock *sk = vs->sock->sk; + sa_family_t sa_family = vxlan_get_sk_family(vs); - if (sa_family == AF_INET) + if (sa_family == AF_INET) { udp_del_offload(&vs->udp_offloads); -} #endif - -/* Add new entry to forwarding table -- assumes lock held */ -static int vxlan_fdb_create(struct vxlan_dev *vxlan, - const u8 *mac, union vxlan_addr *ip, - __u16 state, __u16 flags, - __be16 port, __u32 vni, __u32 ifindex, - __u8 ndm_flags) -{ - struct vxlan_rdst *rd = NULL; - struct vxlan_fdb *f; - int notify = 0; - - f = __vxlan_find_mac(vxlan, mac); - if (f) { - if (flags & NLM_F_EXCL) { - netdev_dbg(vxlan->dev, - "lost race to create %pM\n", mac); - return -EEXIST; - } - if (f->state != state) { - f->state = state; - f->updated = jiffies; - notify = 1; - } - if (f->flags != ndm_flags) { - f->flags = ndm_flags; - f->updated = jiffies; - notify = 1; - } - if ((flags & NLM_F_REPLACE)) { - /* Only change unicasts */ - if (!(is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { - notify |= vxlan_fdb_replace(f, ip, port, vni, - ifindex); - } else - return -EOPNOTSUPP; - } - if ((flags & NLM_F_APPEND) && - (is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { - int rc = vxlan_fdb_append(f, ip, port, vni, ifindex, - &rd); - - if (rc < 0) - return rc; - notify |= rc; - } - } else { - if (!(flags & NLM_F_CREATE)) - return -ENOENT; - - if (vxlan->cfg.addrmax && - vxlan->addrcnt >= vxlan->cfg.addrmax) - return -ENOSPC; - - /* Disallow replace to add a multicast entry */ - if ((flags & NLM_F_REPLACE) && - (is_multicast_ether_addr(mac) 
|| is_zero_ether_addr(mac))) - return -EOPNOTSUPP; - - netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); - f = kmalloc(sizeof(*f), GFP_ATOMIC); - if (!f) - return -ENOMEM; - - notify = 1; - f->state = state; - f->flags = ndm_flags; - f->updated = f->used = jiffies; - INIT_LIST_HEAD(&f->remotes); - memcpy(f->eth_addr, mac, ETH_ALEN); - - vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); - - ++vxlan->addrcnt; - hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac)); - } - - if (notify) { - if (rd == NULL) - rd = first_remote_rtnl(f); - vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH); - } - - return 0; -} - -static void vxlan_fdb_free(struct rcu_head *head) -{ - struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); - struct vxlan_rdst *rd, *nd; - - list_for_each_entry_safe(rd, nd, &f->remotes, list) - kfree(rd); - kfree(f); -} - -static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) -{ - netdev_dbg(vxlan->dev, - "delete %pM\n", f->eth_addr); - - --vxlan->addrcnt; - vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH); - - hlist_del_rcu(&f->hlist); - call_rcu(&f->rcu, vxlan_fdb_free); -} - -/* Watch incoming packets to learn mapping between Ethernet address - * and Tunnel endpoint. - * Return true if packet is bogus and should be dropped. 
- */ -static bool vxlan_snoop(struct net_device *dev, - union vxlan_addr *src_ip, const u8 *src_mac) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_fdb *f; - - f = vxlan_find_mac(vxlan, src_mac); - if (likely(f)) { - struct vxlan_rdst *rdst = first_remote_rcu(f); - - if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip))) - return false; - - /* Don't migrate static entries, drop packets */ - if (f->state & NUD_NOARP) - return true; - - if (net_ratelimit()) - netdev_info(dev, - "%pM migrated from %pIS to %pIS\n", - src_mac, &rdst->remote_ip.sa, &src_ip->sa); - - rdst->remote_ip = *src_ip; - f->updated = jiffies; - vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH); - } else { - /* learned new entry */ - spin_lock(&vxlan->hash_lock); - - /* close off race between vxlan_flush and incoming packets */ - if (netif_running(dev)) - vxlan_fdb_create(vxlan, src_mac, src_ip, - NUD_REACHABLE, - NLM_F_EXCL|NLM_F_CREATE, - vxlan->cfg.dst_port, - vxlan->default_dst.remote_vni, - 0, NTF_SELF); - spin_unlock(&vxlan->hash_lock); - } - - return false; +#endif } /* See if multicast group is already in use by other ID */ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) { struct vxlan_dev *vxlan; + unsigned short family = dev->default_dst.remote_ip.sa.sa_family; /* The vxlan_sock is only used by dev, leaving group has * no effect on other vxlan devices. 
*/ - if (atomic_read(&dev->vn_sock->refcnt) == 1) + if (family == AF_INET && dev->vn4_sock && + atomic_read(&dev->vn4_sock->refcnt) == 1) + return false; +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6 && dev->vn6_sock && + atomic_read(&dev->vn6_sock->refcnt) == 1) return false; +#endif list_for_each_entry(vxlan, &vn->vxlan_list, next) { if (!netif_running(vxlan->dev) || vxlan == dev) continue; - if (vxlan->vn_sock != dev->vn_sock) + if (family == AF_INET && vxlan->vn4_sock != dev->vn4_sock) + continue; +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6 && vxlan->vn6_sock != dev->vn6_sock) continue; +#endif if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, &dev->default_dst.remote_ip)) @@ -751,23 +474,44 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) return false; } -static void vxlan_sock_release(struct vxlan_sock *vs) +static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) { - struct sock *sk = vs->sock->sk; - struct net *net = sock_net(sk); - struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_net *vn; + if (!vs) + return false; if (!atomic_dec_and_test(&vs->refcnt)) - return; + return false; + vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); -#ifdef HAVE_UDP_OFFLOAD vxlan_notify_del_rx_port(vs); -#endif spin_unlock(&vn->sock_lock); - queue_work(vxlan_wq, &vs->del_work); + return true; +} + +static void vxlan_sock_release(struct vxlan_dev *vxlan) +{ + bool ipv4 = __vxlan_sock_release_prep(vxlan->vn4_sock); +#if IS_ENABLED(CONFIG_IPV6) + bool ipv6 = __vxlan_sock_release_prep(vxlan->vn6_sock); +#endif + + synchronize_net(); + + if (ipv4) { + udp_tunnel_sock_release(vxlan->vn4_sock->sock); + kfree(vxlan->vn4_sock); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6) { + udp_tunnel_sock_release(vxlan->vn6_sock->sock); + kfree(vxlan->vn6_sock); + } +#endif } /* Update multicast group membership when first VNI on @@ -784,195 +528,207 @@ static int 
vxlan_igmp_leave(struct vxlan_dev *vxlan) return -EINVAL; } -#ifdef HAVE_VXLAN_HF_RCO -static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, - size_t hdrlen, u32 data, bool nopartial) +static bool vxlan_remcsum(struct vxlanhdr *unparsed, + struct sk_buff *skb, u32 vxflags) { - size_t start, offset, plen; - - if (skb->remcsum_offload) - return vh; - - start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; - offset = start + ((data & VXLAN_RCO_UDP) ? - offsetof(struct udphdr, check) : - offsetof(struct tcphdr, check)); - - plen = hdrlen + offset + sizeof(u16); +#ifndef USE_UPSTREAM_TUNNEL + return false; +#else + size_t start, offset; - if (!pskb_may_pull(skb, plen)) - return NULL; + if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) + goto out; - vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); + start = vxlan_rco_start(unparsed->vx_vni); + offset = start + vxlan_rco_offset(unparsed->vx_vni); - skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset, - nopartial); + if (!pskb_may_pull(skb, offset + sizeof(u16))) + return false; - return vh; -} + skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, + !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); +out: + unparsed->vx_flags &= ~VXLAN_HF_RCO; + unparsed->vx_vni &= VXLAN_VNI_MASK; + return true; #endif +} -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md, u32 vni, - struct metadata_dst *tun_dst) +static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, + struct sk_buff *skb, u32 vxflags, + struct vxlan_metadata *md) { - struct iphdr *oip = NULL; - struct ipv6hdr *oip6 = NULL; - struct vxlan_dev *vxlan; - struct pcpu_sw_netstats *stats; - union vxlan_addr saddr; - int err = 0; + struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; + struct metadata_dst *tun_dst; - /* For flow based devices, map all packets to VNI 0 */ - if (vs->flags & VXLAN_F_COLLECT_METADATA) - vni = 0; - - /* Is this VNI defined? 
*/ - vxlan = vxlan_vs_find_vni(vs, vni); - if (!vxlan) - goto drop; - - skb_reset_mac_header(skb); - skb->protocol = eth_type_trans(skb, vxlan->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) - goto drop; + if (!(unparsed->vx_flags & VXLAN_HF_GBP)) + goto out; - /* Get data from the outer IP header */ - if (vxlan_get_sk_family(vs) == AF_INET) { - oip = ip_hdr(skb); - saddr.sin.sin_addr.s_addr = oip->saddr; - saddr.sa.sa_family = AF_INET; -#if IS_ENABLED(CONFIG_IPV6) - } else { - oip6 = ipv6_hdr(skb); - saddr.sin6.sin6_addr = oip6->saddr; - saddr.sa.sa_family = AF_INET6; -#endif - } + md->gbp = ntohs(gbp->policy_id); + tun_dst = (struct metadata_dst *)skb_dst(skb); if (tun_dst) { - ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst); - tun_dst = NULL; - } else { - goto drop; + tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; + tun_dst->u.tun_info.options_len = sizeof(*md); } + if (gbp->dont_learn) + md->gbp |= VXLAN_GBP_DONT_LEARN; - if ((vxlan->flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) - goto drop; + if (gbp->policy_applied) + md->gbp |= VXLAN_GBP_POLICY_APPLIED; - skb_reset_network_header(skb); /* In flow-based mode, GBP is carried in dst_metadata */ - if (!(vs->flags & VXLAN_F_COLLECT_METADATA)) + if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; +out: + unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; +} - if (oip6) - err = IP6_ECN_decapsulate(oip6, skb); - if (oip) - err = IP_ECN_decapsulate(oip, skb); +static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed, + __be16 *protocol, + struct sk_buff *skb, u32 vxflags) +{ + struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed; - if (unlikely(err)) { - if (err > 1) { - ++vxlan->dev->stats.rx_frame_errors; - ++vxlan->dev->stats.rx_errors; - goto drop; - } + /* Need to have Next Protocol set for interfaces in GPE mode. 
*/ + if (!gpe->np_applied) + return false; + /* "The initial version is 0. If a receiver does not support the + * version indicated it MUST drop the packet. + */ + if (gpe->version != 0) + return false; + /* "When the O bit is set to 1, the packet is an OAM packet and OAM + * processing MUST occur." However, we don't implement OAM + * processing, thus drop the packet. + */ + if (gpe->oam_flag) + return false; + + switch (gpe->next_protocol) { + case VXLAN_GPE_NP_IPV4: + *protocol = htons(ETH_P_IP); + break; + case VXLAN_GPE_NP_IPV6: + *protocol = htons(ETH_P_IPV6); + break; + case VXLAN_GPE_NP_ETHERNET: + *protocol = htons(ETH_P_TEB); + break; + default: + return false; } - stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)vxlan->dev->tstats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); - netdev_port_receive(skb, skb_tunnel_info(skb)); - return; -drop: + unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS; + return true; +} - /* Consume bad packet */ - kfree_skb(skb); +static bool vxlan_set_mac(struct vxlan_dev *vxlan, + struct vxlan_sock *vs, + struct sk_buff *skb) +{ + return true; +} + +static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, + struct sk_buff *skb) +{ + int err = 0; + + if (vxlan_get_sk_family(vs) == AF_INET) + err = IP_ECN_decapsulate(oiph, skb); +#if IS_ENABLED(CONFIG_IPV6) + else + err = IP6_ECN_decapsulate(oiph, skb); +#endif + return err <= 1; } /* Callback from net/ipv4/udp.c to receive packets */ -static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { - struct vxlan_sock *vs; - struct vxlanhdr *vxh; - u32 flags, vni; - struct vxlan_metadata _md; - struct vxlan_metadata *md = &_md; union { struct metadata_dst dst; - char buf[sizeof(struct metadata_dst) + sizeof(*md)]; + char buf[sizeof(struct metadata_dst) + sizeof(struct vxlan_metadata)]; } buf; - /* Need Vxlan and 
inner Ethernet header to be present */ - if (!pskb_may_pull(skb, VXLAN_HLEN)) - goto error; - - vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); - flags = ntohl(vxh->vx_flags); - vni = ntohl(vxh->vx_vni); + struct pcpu_sw_netstats *stats; + struct vxlan_dev *vxlan; + struct vxlan_sock *vs; + struct vxlanhdr unparsed; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; + __be16 protocol = htons(ETH_P_TEB); + bool raw_proto = false; + void *oiph; - if (flags & VXLAN_HF_VNI) { - flags &= ~VXLAN_HF_VNI; - } else { - /* VNI flag always required to be set */ - goto bad_flags; + /* Need UDP and VXLAN header to be present */ + if (!pskb_may_pull(skb, VXLAN_HLEN)) + return 1; + + unparsed = *vxlan_hdr(skb); + /* VNI flag always required to be set */ + if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { + netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", + ntohl(vxlan_hdr(skb)->vx_flags), + ntohl(vxlan_hdr(skb)->vx_vni)); + /* Return non vxlan pkt */ + return 1; } - - if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB), false)) - goto drop; - vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); + unparsed.vx_flags &= ~VXLAN_HF_VNI; + unparsed.vx_vni &= ~VXLAN_VNI_MASK; vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; -#ifdef HAVE_VXLAN_HF_RCO - if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { - vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni, - !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); - if (!vxh) - goto drop; - - flags &= ~VXLAN_HF_RCO; - vni &= VXLAN_VNI_MASK; - } -#endif - - if (vxlan_collect_metadata(vs)) { - ovs_udp_tun_rx_dst(&buf.dst.u.tun_info, skb, AF_INET, TUNNEL_KEY, - cpu_to_be64(vni >> 8), sizeof(*md)); - - md = ip_tunnel_info_opts(&buf.dst.u.tun_info); - } else { - memset(md, 0, sizeof(*md)); - } + vxlan = vxlan_vs_find_vni(vs, vxlan_vni(vxlan_hdr(skb)->vx_vni)); + if (!vxlan) + goto drop; /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. 
*/ - if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) { - struct vxlanhdr_gbp *gbp; + if (vs->flags & VXLAN_F_GPE) { + if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags)) + goto drop; + raw_proto = true; + } - gbp = (struct vxlanhdr_gbp *)vxh; - md->gbp = ntohs(gbp->policy_id); + if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, + !net_eq(vxlan->net, dev_net(vxlan->dev)))) + goto drop; - buf.dst.u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; + if (vxlan_collect_metadata(vs)) { + __be32 vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); + struct metadata_dst *tun_dst; - if (gbp->dont_learn) - md->gbp |= VXLAN_GBP_DONT_LEARN; + tun_dst = &buf.dst; + ovs_udp_tun_rx_dst(&tun_dst->u.tun_info, skb, + vxlan_get_sk_family(vs), TUNNEL_KEY, + vxlan_vni_to_tun_id(vni), sizeof(*md)); - if (gbp->policy_applied) - md->gbp |= VXLAN_GBP_POLICY_APPLIED; + if (!tun_dst) + goto drop; + + md = ip_tunnel_info_opts(&tun_dst->u.tun_info); - flags &= ~VXLAN_GBP_USED_BITS; + ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst); + } else { + memset(md, 0, sizeof(*md)); } - if (flags || vni & ~VXLAN_VNI_MASK) { + if (vs->flags & VXLAN_F_REMCSUM_RX) + if (!vxlan_remcsum(&unparsed, skb, vs->flags)) + goto drop; + + if (vs->flags & VXLAN_F_GBP) + vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); + /* Note that GBP and GPE can never be active together. This is + * ensured in vxlan_dev_configure. + */ + + if (unparsed.vx_flags || unparsed.vx_vni) { /* If there are any unprocessed flags remaining treat * this as a malformed packet. This behavior diverges from * VXLAN RFC (RFC7348) which stipulates that bits in reserved @@ -981,25 +737,43 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) * is more robust and provides a little more security in * adding extensions to VXLAN. 
*/ + goto drop; + } + + if (!raw_proto) { + if (!vxlan_set_mac(vxlan, vs, skb)) + goto drop; + skb_reset_mac_header(skb); + skb->protocol = eth_type_trans(skb, vxlan->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } else { + skb_reset_mac_header(skb); + skb->dev = vxlan->dev; + skb->pkt_type = PACKET_HOST; + } - goto bad_flags; + oiph = skb_network_header(skb); + skb_reset_network_header(skb); + + if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { + ++vxlan->dev->stats.rx_frame_errors; + ++vxlan->dev->stats.rx_errors; + goto drop; } - vxlan_rcv(vs, skb, md, vni >> 8, &buf.dst); + stats = this_cpu_ptr(vxlan->dev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); + + netdev_port_receive(skb, skb_tunnel_info(skb)); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; - -bad_flags: - netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); - -error: - /* Return non vxlan pkt */ - return 1; } static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, @@ -1011,7 +785,7 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, return; gbp = (struct vxlanhdr_gbp *)vxh; - vxh->vx_flags |= htonl(VXLAN_HF_GBP); + vxh->vx_flags |= VXLAN_HF_GBP; if (md->gbp & VXLAN_GBP_DONT_LEARN) gbp->dont_learn = 1; @@ -1022,116 +796,38 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); } -#if IS_ENABLED(CONFIG_IPV6) -static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, struct in6_addr *saddr, - struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, __be32 vni, - struct vxlan_metadata *md, bool xnet, u32 vxflags) +static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags, + __be16 protocol) { - void (*fix_segment)(struct sk_buff *); - 
struct vxlanhdr *vxh; - int min_headroom; - int err; - bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX); - int type = 0; - - if ((vxflags & VXLAN_F_REMCSUM_TX) && - skb->ip_summed == CHECKSUM_PARTIAL) { - int csum_start = skb_checksum_start_offset(skb); + struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; - if (csum_start <= VXLAN_MAX_REMCSUM_START && - !(csum_start & VXLAN_RCO_SHIFT_MASK) && - (skb->csum_offset == offsetof(struct udphdr, check) || - skb->csum_offset == offsetof(struct tcphdr, check))) { - udp_sum = false; - type |= SKB_GSO_TUNNEL_REMCSUM; - /* Add support for remote csum. */ - if (!SKB_GSO_TUNNEL_REMCSUM) { - kfree_skb(skb); - err = -EOPNOTSUPP; - goto err; - } - } - } + gpe->np_applied = 1; - skb_scrub_packet(skb, xnet); - - min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len - + VXLAN_HLEN + sizeof(struct ipv6hdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - /* Need space for new headers (invalidates iph ptr) */ - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - goto err; - } - - skb = vlan_hwaccel_push_inside(skb); - if (WARN_ON(!skb)) { - err = -ENOMEM; - goto err; - } - - type |= udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - fix_segment = udp_sum ? 
ovs_udp_gso : ovs_udp_csum_gso; - err = ovs_iptunnel_handle_offloads(skb, udp_sum, type, fix_segment); - if (err) { - kfree_skb(skb); - goto err; - } - - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = vni; - - if (type & SKB_GSO_TUNNEL_REMCSUM) { - u16 hdrlen = sizeof(struct vxlanhdr); - u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> - VXLAN_RCO_SHIFT; - - if (skb->csum_offset == offsetof(struct udphdr, check)) - data |= VXLAN_RCO_UDP; - - vxh->vx_vni |= htonl(data); - vxh->vx_flags |= htonl(VXLAN_HF_RCO); - - if (!skb_is_gso(skb)) { - skb->ip_summed = CHECKSUM_NONE; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) - skb->encapsulation = 0; -#endif - } + switch (protocol) { + case htons(ETH_P_IP): + gpe->next_protocol = VXLAN_GPE_NP_IPV4; + return 0; + case htons(ETH_P_IPV6): + gpe->next_protocol = VXLAN_GPE_NP_IPV6; + return 0; + case htons(ETH_P_TEB): + gpe->next_protocol = VXLAN_GPE_NP_ETHERNET; + return 0; } - - if (vxflags & VXLAN_F_GBP) - vxlan_build_gbp_hdr(vxh, vxflags, md); - - ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - - udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio, - ttl, label, src_port, dst_port, - !!(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX)); - return 0; -err: - dst_release(dst); - return err; + return -EPFNOSUPPORT; } -#endif -static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, __be32 vni, - struct vxlan_metadata *md, bool xnet, u32 vxflags) +static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, + int iphdr_len, __be32 vni, + struct vxlan_metadata *md, u32 vxflags, + bool udp_sum) { void (*fix_segment)(struct sk_buff *); struct vxlanhdr *vxh; int min_headroom; int err; - bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM); - int type = 0; + int type = udp_sum ? 
SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + __be16 inner_protocol = htons(ETH_P_TEB); if ((vxflags & VXLAN_F_REMCSUM_TX) && skb->ip_summed == CHECKSUM_PARTIAL) { @@ -1140,27 +836,18 @@ static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk if (csum_start <= VXLAN_MAX_REMCSUM_START && !(csum_start & VXLAN_RCO_SHIFT_MASK) && (skb->csum_offset == offsetof(struct udphdr, check) || - skb->csum_offset == offsetof(struct tcphdr, check))) { - udp_sum = false; + skb->csum_offset == offsetof(struct tcphdr, check))) type |= SKB_GSO_TUNNEL_REMCSUM; - - if (!SKB_GSO_TUNNEL_REMCSUM) { - kfree_skb(skb); - return -EOPNOTSUPP; - } - } } - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + VXLAN_HLEN + sizeof(struct iphdr) + min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + + VXLAN_HLEN + iphdr_len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } + if (unlikely(err)) + goto out_free; skb = vlan_hwaccel_push_inside(skb); if (WARN_ON(!skb)) @@ -1169,41 +856,112 @@ static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk type |= udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; fix_segment = udp_sum ? 
ovs_udp_gso : ovs_udp_csum_gso; err = ovs_iptunnel_handle_offloads(skb, udp_sum, type, fix_segment); - if (err) { - kfree_skb(skb); - return err; - } + if (err) + goto out_free; + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = vni; + vxh->vx_flags = VXLAN_HF_VNI; + vxh->vx_vni = vxlan_vni_field(vni); if (type & SKB_GSO_TUNNEL_REMCSUM) { - u16 hdrlen = sizeof(struct vxlanhdr); - u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> - VXLAN_RCO_SHIFT; - - if (skb->csum_offset == offsetof(struct udphdr, check)) - data |= VXLAN_RCO_UDP; + unsigned int start; - vxh->vx_vni |= htonl(data); - vxh->vx_flags |= htonl(VXLAN_HF_RCO); + start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr); + vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset); + vxh->vx_flags |= VXLAN_HF_RCO; if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) skb->encapsulation = 0; -#endif } } + if (vxflags & VXLAN_F_GBP) vxlan_build_gbp_hdr(vxh, vxflags, md); + if (vxflags & VXLAN_F_GPE) { + err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol); + if (err < 0) + goto out_free; + inner_protocol = skb->protocol; + } - ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - - udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos, - ttl, df, src_port, dst_port, xnet, - !(vxflags & VXLAN_F_UDP_CSUM)); + ovs_skb_set_inner_protocol(skb, inner_protocol); return 0; + +out_free: + kfree_skb(skb); + return err; +} + +static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, + struct sk_buff *skb, int oif, u8 tos, + __be32 daddr, __be32 *saddr, + const struct ip_tunnel_info *info) +{ + struct rtable *rt = NULL; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_oif = oif; + fl4.flowi4_tos = RT_TOS(tos); + fl4.flowi4_mark = skb->mark; + fl4.flowi4_proto = IPPROTO_UDP; + fl4.daddr = daddr; + fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr; + + rt = 
ip_route_output_key(vxlan->net, &fl4); + if (!IS_ERR(rt)) { + *saddr = fl4.saddr; + } + return rt; +} + +#if IS_ENABLED(CONFIG_IPV6) +static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, + struct sk_buff *skb, int oif, u8 tos, + __be32 label, + const struct in6_addr *daddr, + struct in6_addr *saddr, + const struct ip_tunnel_info *info) +{ + struct dst_entry *ndst; + struct flowi6 fl6; + int err; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; + fl6.daddr = *daddr; + fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr; + fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = IPPROTO_UDP; + +#ifdef HAVE_IPV6_DST_LOOKUP_NET + err = ipv6_stub->ipv6_dst_lookup(vxlan->net, + vxlan->vn6_sock->sock->sk, + &ndst, &fl6); +#else +#ifdef HAVE_IPV6_STUB + err = ipv6_stub->ipv6_dst_lookup(vxlan->vn6_sock->sock->sk, + &ndst, &fl6); +#else + err = ip6_dst_lookup(vxlan->vn6_sock->sock->sk, &ndst, &fl6); +#endif +#endif + if (err < 0) + return ERR_PTR(err); + + *saddr = fl6.saddr; + return ndst; +} +#endif + +/* Bypass encapsulation if the destination is local */ +static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, + struct vxlan_dev *dst_vxlan) +{ + skb->dev->stats.rx_dropped++; + kfree_skb(skb); } static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, @@ -1211,21 +969,21 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, { struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); - struct sock *sk = vxlan->vn_sock->sock->sk; - unsigned short family = vxlan_get_sk_family(vxlan->vn_sock); + struct sock *sk; struct rtable *rt = NULL; const struct iphdr *old_iph; - struct flowi4 fl4; union vxlan_addr *dst; union vxlan_addr remote_ip; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 src_port = 0, dst_port; - u32 vni, label; + __be32 vni, label; __be16 df = 0; __u8 tos, ttl; int err; u32 flags = vxlan->flags; + bool 
udp_sum = false; + bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); info = skb_tunnel_info(skb); @@ -1239,13 +997,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dev->name); goto drop; } - if (family != ip_tunnel_info_af(info)) - goto drop; - dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; - vni = be64_to_cpu(info->key.tun_id); - remote_ip.sa.sa_family = family; - if (family == AF_INET) + vni = vxlan_tun_id_to_vni(info->key.tun_id); + remote_ip.sa.sa_family = ip_tunnel_info_af(info); + if (remote_ip.sa.sa_family == AF_INET) remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; else remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; @@ -1255,8 +1010,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (vxlan_addr_any(dst)) { if (did_rsc) { /* short-circuited back to local bridge */ - WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", - dev->name); + vxlan_encap_bypass(skb, vxlan, vxlan); + return; } goto drop; } @@ -1276,14 +1031,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan->cfg.port_max, true); if (info) { - if (info->key.tun_flags & TUNNEL_CSUM) - flags |= VXLAN_F_UDP_CSUM; - else - flags &= ~VXLAN_F_UDP_CSUM; - ttl = info->key.ttl; tos = info->key.tos; label = info->key.label; + udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); if (info->options_len) md = ip_tunnel_info_opts(info); @@ -1292,18 +1043,16 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } if (dst->sa.sa_family == AF_INET) { - if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)) - df = htons(IP_DF); + __be32 saddr; - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst ? 
rdst->remote_ifindex : 0; - fl4.flowi4_tos = RT_TOS(tos); - fl4.flowi4_mark = skb->mark; - fl4.flowi4_proto = IPPROTO_UDP; - fl4.daddr = dst->sin.sin_addr.s_addr; - fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr; + if (!vxlan->vn4_sock) + goto drop; + sk = vxlan->vn4_sock->sock->sk; - rt = ip_route_output_key(vxlan->net, &fl4); + rt = vxlan_get_route(vxlan, skb, + rdst ? rdst->remote_ifindex : 0, tos, + dst->sin.sin_addr.s_addr, &saddr, + info); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &dst->sin.sin_addr.s_addr); @@ -1329,41 +1078,40 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan->flags); if (!dst_vxlan) goto tx_error; - WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", - dev->name); - goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); + return; } + if (!info) + udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); + else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - vxlan_xmit_skb(rt, sk, skb, fl4.saddr, - dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, htonl(vni << 8), md, - !net_eq(vxlan->net, dev_net(vxlan->dev)), flags); + err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr), + vni, md, flags, udp_sum); + if (err < 0) + goto xmit_tx_error; + + udp_tunnel_xmit_skb(rt, sk, skb, saddr, + dst->sin.sin_addr.s_addr, tos, ttl, df, + src_port, dst_port, xnet, !udp_sum); #if IS_ENABLED(CONFIG_IPV6) } else { struct dst_entry *ndst; - struct flowi6 fl6; + struct in6_addr saddr; u32 rt6i_flags; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = rdst ? 
rdst->remote_ifindex : 0; - fl6.daddr = dst->sin6.sin6_addr; - fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr; - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = IPPROTO_UDP; - fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label); + if (!vxlan->vn6_sock) + goto drop; + sk = vxlan->vn6_sock->sock->sk; -#ifdef HAVE_IPV6_DST_LOOKUP_NET - if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) { -#else -#ifdef HAVE_IPV6_STUB - if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { -#else - ndst = ip6_route_output(vxlan->net, sk, &fl6); - if (ndst->error) { -#endif -#endif + ndst = vxlan6_get_route(vxlan, skb, + rdst ? rdst->remote_ifindex : 0, tos, + label, &dst->sin6.sin6_addr, &saddr, + info); + if (IS_ERR(ndst)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); dev->stats.tx_carrier_errors++; @@ -1390,16 +1138,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan->flags); if (!dst_vxlan) goto tx_error; - WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", - dev->name); - goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); + return; } + if (!info) + udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); + + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); - err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, - 0, ttl, label, src_port, dst_port, htonl(vni << 8), md, - !net_eq(vxlan->net, dev_net(vxlan->dev)), - flags); + skb_scrub_packet(skb, xnet); + err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), + vni, md, flags, udp_sum); + if (err < 0) { + dst_release(ndst); + return; + } + udp_tunnel6_xmit_skb(ndst, sk, skb, dev, + &saddr, &dst->sin6.sin6_addr, tos, ttl, + label, src_port, dst_port, !udp_sum); #endif } @@ -1409,6 +1166,9 @@ drop: dev->stats.tx_dropped++; goto tx_free; +xmit_tx_error: + /* skb is already freed. 
*/ + skb = NULL; rt_tx_error: ip_rt_put(rt); tx_error: @@ -1430,23 +1190,19 @@ netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb) const struct ip_tunnel_info *info; info = skb_tunnel_info(skb); - skb_reset_mac_header(skb); - - if ((vxlan->flags & VXLAN_F_PROXY)) - goto out; - - if (vxlan->flags & VXLAN_F_COLLECT_METADATA && - info && info->mode & IP_TUNNEL_INFO_TX) { - vxlan_xmit_one(skb, dev, NULL, false); - return NETDEV_TX_OK; + if (vxlan->flags & VXLAN_F_COLLECT_METADATA) { + if (info && info->mode & IP_TUNNEL_INFO_TX) { + vxlan_xmit_one(skb, dev, NULL, false); + return NETDEV_TX_OK; + } } -out: - pr_warn("vxlan: unsupported flag set %x", vxlan->flags); + + dev->stats.tx_dropped++; kfree_skb(skb); return NETDEV_TX_OK; } -EXPORT_SYMBOL(rpl_vxlan_xmit); +EXPORT_SYMBOL_GPL(rpl_vxlan_xmit); /* Walk the forwarding table and purge stale entries */ static void vxlan_cleanup(unsigned long arg) @@ -1489,9 +1245,8 @@ static void vxlan_cleanup(unsigned long arg) static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - __u32 vni = vxlan->default_dst.remote_vni; + __be32 vni = vxlan->default_dst.remote_vni; - vxlan->vn_sock = vs; spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); spin_unlock(&vn->sock_lock); @@ -1500,7 +1255,7 @@ static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { - dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; @@ -1509,13 +1264,6 @@ static int vxlan_init(struct net_device *dev) static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) { - struct vxlan_fdb *f; - - spin_lock_bh(&vxlan->hash_lock); - f = __vxlan_find_mac(vxlan, all_zeros_mac); - if (f) - vxlan_fdb_destroy(vxlan, f); - 
spin_unlock_bh(&vxlan->hash_lock); } static void vxlan_uninit(struct net_device *dev) @@ -1531,22 +1279,18 @@ static void vxlan_uninit(struct net_device *dev) static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_sock *vs; - int ret = 0; - - vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, - vxlan->cfg.no_share, vxlan->flags); - if (IS_ERR(vs)) - return PTR_ERR(vs); + int ret; - vxlan_vs_add_dev(vs, vxlan); + ret = vxlan_sock_add(vxlan); + if (ret < 0) + return ret; if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { ret = vxlan_igmp_join(vxlan); if (ret == -EADDRINUSE) ret = 0; if (ret) { - vxlan_sock_release(vs); + vxlan_sock_release(vxlan); return ret; } } @@ -1565,7 +1309,6 @@ static void vxlan_flush(struct vxlan_dev *vxlan) spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); @@ -1582,7 +1325,6 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - struct vxlan_sock *vs = vxlan->vn_sock; int ret = 0; if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && @@ -1592,7 +1334,7 @@ static int vxlan_stop(struct net_device *dev) del_timer_sync(&vxlan->age_timer); vxlan_flush(vxlan); - vxlan_sock_release(vs); + vxlan_sock_release(vxlan); return ret; } @@ -1639,71 +1381,86 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu) return __vxlan_change_mtu(dev, lowerdev, dst, new_mtu, true); } -static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev) -{ - /* Drop All packets coming from networking stack. OVS-CB is - * not initialized for these packets. 
- */ - - dev_kfree_skb(skb); - dev->stats.tx_dropped++; - return NETDEV_TX_OK; -} - -static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb, - struct ip_tunnel_info *info, - __be16 sport, __be16 dport) +int ovs_vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct rtable *rt; - struct flowi4 fl4; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + __be16 sport, dport; - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_tos = RT_TOS(info->key.tos); - fl4.flowi4_mark = skb->mark; - fl4.flowi4_proto = IPPROTO_UDP; - fl4.daddr = info->key.u.ipv4.dst; + sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, + vxlan->cfg.port_max, true); + dport = info->key.tp_dst ? : vxlan->cfg.dst_port; - rt = ip_route_output_key(vxlan->net, &fl4); - if (IS_ERR(rt)) - return PTR_ERR(rt); - ip_rt_put(rt); + if (ip_tunnel_info_af(info) == AF_INET) { + struct rtable *rt; + + if (!vxlan->vn4_sock) + return -EINVAL; + rt = vxlan_get_route(vxlan, skb, 0, info->key.tos, + info->key.u.ipv4.dst, + &info->key.u.ipv4.src, info); + if (IS_ERR(rt)) + return PTR_ERR(rt); + ip_rt_put(rt); + } else { +#if IS_ENABLED(CONFIG_IPV6) + struct dst_entry *ndst; - info->key.u.ipv4.src = fl4.saddr; + if (!vxlan->vn6_sock) + return -EINVAL; + ndst = vxlan6_get_route(vxlan, skb, 0, info->key.tos, + info->key.label, &info->key.u.ipv6.dst, + &info->key.u.ipv6.src, info); + if (IS_ERR(ndst)) + return PTR_ERR(ndst); + dst_release(ndst); +#else /* !CONFIG_IPV6 */ + return -EPFNOSUPPORT; +#endif + } info->key.tp_src = sport; info->key.tp_dst = dport; return 0; } +EXPORT_SYMBOL_GPL(ovs_vxlan_fill_metadata_dst); -int ovs_vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev) { - struct vxlan_dev *vxlan = netdev_priv(dev); - struct ip_tunnel_info *info = skb_tunnel_info(skb); - __be16 sport, dport; - - sport = 
udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, - vxlan->cfg.port_max, true); - dport = info->key.tp_dst ? : vxlan->cfg.dst_port; + /* Drop All packets coming from networking stack. OVS-CB is + * not initialized for these packets. + */ - if (ip_tunnel_info_af(info) == AF_INET) - return egress_ipv4_tun_info(dev, skb, info, sport, dport); - return -EINVAL; + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; } -EXPORT_SYMBOL_GPL(ovs_vxlan_fill_metadata_dst); -static const struct net_device_ops vxlan_netdev_ops = { +static const struct net_device_ops vxlan_netdev_ether_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, - .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_dev_xmit, + .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_set_rx_mode = vxlan_set_multicast_list, .ndo_change_mtu = vxlan_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, + .ndo_fill_metadata_dst = ovs_vxlan_fill_metadata_dst, +#endif +}; + +static const struct net_device_ops vxlan_netdev_raw_ops = { + .ndo_init = vxlan_init, + .ndo_uninit = vxlan_uninit, + .ndo_open = vxlan_open, + .ndo_stop = vxlan_stop, + .ndo_start_xmit = vxlan_dev_xmit, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_change_mtu = vxlan_change_mtu, +#ifdef HAVE_NDO_FILL_METADATA_DST + .ndo_fill_metadata_dst = ovs_vxlan_fill_metadata_dst, #endif }; @@ -1712,6 +1469,36 @@ static struct device_type vxlan_type = { .name = "vxlan", }; +/* Calls the ndo_add_vxlan_port of the caller in order to + * supply the listening VXLAN udp ports. Callers are expected + * to implement the ndo_add_vxlan_port. 
+ */ +static void vxlan_push_rx_ports(struct net_device *dev) +{ +#ifdef HAVE_NDO_ADD_VXLAN_PORT + struct vxlan_sock *vs; + struct net *net = dev_net(dev); + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + sa_family_t sa_family; + __be16 port; + unsigned int i; + + if (!dev->netdev_ops->ndo_add_vxlan_port) + return; + + spin_lock(&vn->sock_lock); + for (i = 0; i < PORT_HASH_SIZE; ++i) { + hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { + port = inet_sk(vs->sock->sk)->inet_sport; + sa_family = vxlan_get_sk_family(vs); + dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, + port); + } + } + spin_unlock(&vn->sock_lock); +#endif +} + /* Initialize the device structure. */ static void vxlan_setup(struct net_device *dev) { @@ -1721,7 +1508,6 @@ static void vxlan_setup(struct net_device *dev) eth_hw_addr_random(dev); ether_setup(dev); - dev->netdev_ops = &vxlan_netdev_ops; dev->destructor = free_netdev; SET_NETDEV_DEVTYPE(dev, &vxlan_type); @@ -1732,16 +1518,13 @@ static void vxlan_setup(struct net_device *dev) dev->vlan_features = dev->features; dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; -#endif - #if 0 netif_keep_dst(dev); #endif - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + dev->priv_flags |= IFF_NO_QUEUE; INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); @@ -1758,8 +1541,51 @@ static void vxlan_setup(struct net_device *dev) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); } +static void vxlan_ether_setup(struct net_device *dev) +{ + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->netdev_ops = &vxlan_netdev_ether_ops; +} + +static void vxlan_raw_setup(struct net_device *dev) +{ + dev->header_ops = NULL; + dev->type = ARPHRD_NONE; + 
dev->hard_header_len = 0; + dev->addr_len = 0; + dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; + dev->netdev_ops = &vxlan_netdev_raw_ops; +} + static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { + [IFLA_VXLAN_ID] = { .type = NLA_U32 }, + [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, + [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, + [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, + [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, + [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, + [IFLA_VXLAN_LABEL] = { .type = NLA_U32 }, + [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, + [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, + [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, + [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, + [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, + [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, + [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, + [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, + [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, + [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, + [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, + [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, + [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, + [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, + [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, + [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, + [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1811,21 +1637,6 @@ static const struct ethtool_ops vxlan_ethtool_ops = { .get_link = ethtool_op_get_link, }; -static void free_vs_rcu(struct rcu_head *rcu) -{ - struct vxlan_sock *vs = container_of(rcu, struct vxlan_sock, rcu); - - kfree(vs); -} - -static void vxlan_del_work(struct work_struct *work) -{ - struct vxlan_sock *vs = container_of(work, 
struct vxlan_sock, del_work); - udp_tunnel_sock_release(vs->sock); - - call_rcu(&vs->rcu, free_vs_rcu); -} - static struct socket *vxlan_create_sock(struct net *net, bool ipv6, __be16 port, u32 flags) { @@ -1855,14 +1666,13 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, } /* Create new listen socket if needed */ -static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - u32 flags) +static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, + __be16 port, u32 flags) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; struct socket *sock; unsigned int h; - bool ipv6 = !!(flags & VXLAN_F_IPV6); struct udp_tunnel_sock_cfg tunnel_cfg; vs = kzalloc(sizeof(*vs), GFP_KERNEL); @@ -1872,8 +1682,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vs->vni_list[h]); - INIT_WORK(&vs->del_work, vxlan_del_work); - sock = vxlan_create_sock(net, ipv6, port, flags); if (IS_ERR(sock)) { pr_info("Cannot bind port %d, err=%ld\n", ntohs(port), @@ -1886,63 +1694,108 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, atomic_set(&vs->refcnt, 1); vs->flags = (flags & VXLAN_F_RCV_FLAGS); - /* Initialize the vxlan udp offloads structure */ #ifdef HAVE_UDP_OFFLOAD vs->udp_offloads.port = port; vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive; vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete; - vxlan_notify_add_rx_port(vs); #endif spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); + vxlan_notify_add_rx_port(vs); spin_unlock(&vn->sock_lock); /* Mark socket as an encapsulation socket. 
*/ + memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = vs; tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = vxlan_udp_encap_recv; + tunnel_cfg.encap_rcv = vxlan_rcv; tunnel_cfg.encap_destroy = NULL; - +#ifdef HAVE_UDP_TUNNEL_SOCK_CFG_GRO_RECEIVE + tunnel_cfg.gro_receive = vxlan_gro_receive; + tunnel_cfg.gro_complete = vxlan_gro_complete; +#endif setup_udp_tunnel_sock(net, sock, &tunnel_cfg); return vs; } -static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - bool no_share, u32 flags) +static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct vxlan_sock *vs; - bool ipv6 = flags & VXLAN_F_IPV6; + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_sock *vs = NULL; - if (!no_share) { + if (!vxlan->cfg.no_share) { spin_lock(&vn->sock_lock); - vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, - flags); - if (vs) { - if (!atomic_add_unless(&vs->refcnt, 1, 0)) - vs = ERR_PTR(-EBUSY); + vs = vxlan_find_sock(vxlan->net, ipv6 ? 
AF_INET6 : AF_INET, + vxlan->cfg.dst_port, vxlan->flags); + if (vs && !atomic_add_unless(&vs->refcnt, 1, 0)) { spin_unlock(&vn->sock_lock); - return vs; + return -EBUSY; } spin_unlock(&vn->sock_lock); } + if (!vs) + vs = vxlan_socket_create(vxlan->net, ipv6, + vxlan->cfg.dst_port, vxlan->flags); + if (IS_ERR(vs)) + return PTR_ERR(vs); +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6) + vxlan->vn6_sock = vs; + else +#endif + vxlan->vn4_sock = vs; + vxlan_vs_add_dev(vs, vxlan); + return 0; +} - return vxlan_socket_create(net, port, flags); +static int vxlan_sock_add(struct vxlan_dev *vxlan) +{ + bool ipv6 = vxlan->flags & VXLAN_F_IPV6; + bool metadata = vxlan->flags & VXLAN_F_COLLECT_METADATA; + int ret = 0; + + vxlan->vn4_sock = NULL; +#if IS_ENABLED(CONFIG_IPV6) + vxlan->vn6_sock = NULL; + if (ipv6 || metadata) + ret = __vxlan_sock_add(vxlan, true); +#endif + if (!ret && (!ipv6 || metadata)) + ret = __vxlan_sock_add(vxlan, false); + if (ret < 0) + vxlan_sock_release(vxlan); + return ret; } static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, struct vxlan_config *conf) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); - struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_dev *vxlan = netdev_priv(dev), *tmp; struct vxlan_rdst *dst = &vxlan->default_dst; + unsigned short needed_headroom = ETH_HLEN; int err; bool use_ipv6 = false; __be16 default_port = vxlan->cfg.dst_port; struct net_device *lowerdev = NULL; + if (conf->flags & VXLAN_F_GPE) { + if (conf->flags & ~VXLAN_F_ALLOWED_GPE) + return -EINVAL; + /* For now, allow GPE only together with COLLECT_METADATA. + * This can be relaxed later; in such case, the other side + * of the PtP link will have to be provided. 
+ */ + if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) + return -EINVAL; + + vxlan_raw_setup(dev); + } else { + vxlan_ether_setup(dev); + } + vxlan->net = src_net; dst->remote_vni = conf->vni; @@ -1958,6 +1811,12 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, if (!IS_ENABLED(CONFIG_IPV6)) return -EPFNOSUPPORT; use_ipv6 = true; + vxlan->flags |= VXLAN_F_IPV6; + } + + if (conf->label && !use_ipv6) { + pr_info("label only supported in use with IPv6\n"); + return -EINVAL; } if (conf->remote_ifindex) { @@ -1976,20 +1835,13 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, pr_info("IPv6 is disabled via sysctl\n"); return -EPERM; } - vxlan->flags |= VXLAN_F_IPV6; } #endif if (!conf->mtu) dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - dev->needed_headroom = lowerdev->hard_header_len + - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - } else if (use_ipv6) { - vxlan->flags |= VXLAN_F_IPV6; - dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; - } else { - dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; + needed_headroom = lowerdev->hard_header_len; } if (conf->mtu) { @@ -1998,17 +1850,33 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, return err; } + if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) + needed_headroom += VXLAN6_HEADROOM; + else + needed_headroom += VXLAN_HEADROOM; + dev->needed_headroom = needed_headroom; + memcpy(&vxlan->cfg, conf, sizeof(*conf)); - if (!vxlan->cfg.dst_port) - vxlan->cfg.dst_port = default_port; + if (!vxlan->cfg.dst_port) { + if (conf->flags & VXLAN_F_GPE) + vxlan->cfg.dst_port = 4790; /* IANA assigned VXLAN-GPE port */ + else + vxlan->cfg.dst_port = default_port; + } vxlan->flags |= conf->flags; if (!vxlan->cfg.age_interval) vxlan->cfg.age_interval = FDB_AGE_DEFAULT; - if (vxlan_find_vni(src_net, conf->vni, use_ipv6 ? 
AF_INET6 : AF_INET, - vxlan->cfg.dst_port, vxlan->flags)) + list_for_each_entry(tmp, &vn->vxlan_list, next) { + if (tmp->cfg.vni == conf->vni && + (tmp->default_dst.remote_ip.sa.sa_family == AF_INET6 || + tmp->cfg.saddr.sa.sa_family == AF_INET6) == use_ipv6 && + tmp->cfg.dst_port == vxlan->cfg.dst_port && + (tmp->flags & VXLAN_F_RCV_FLAGS) == + (vxlan->flags & VXLAN_F_RCV_FLAGS)) return -EEXIST; + } dev->ethtool_ops = &vxlan_ethtool_ops; @@ -2038,7 +1906,7 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, } struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name, - u8 name_assign_type, struct vxlan_config *conf) + u8 name_assign_type, struct vxlan_config *conf) { struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev; @@ -2046,7 +1914,7 @@ struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name, memset(&tb, 0, sizeof(tb)); - dev = rtnl_create_link(net, (char *)name, name_assign_type, + dev = rtnl_create_link(net, name, name_assign_type, &vxlan_link_ops, tb); if (IS_ERR(dev)) return dev; @@ -2056,27 +1924,18 @@ struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name, free_netdev(dev); return ERR_PTR(err); } - return dev; } EXPORT_SYMBOL_GPL(rpl_vxlan_dev_create); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) static int vxlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) -#else -static int vxlan_newlink(struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif { + pr_info("unsupported operation\n"); return -EINVAL; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) static void vxlan_dellink(struct net_device *dev, struct list_head *head) -#else -static void vxlan_dellink(struct net_device *dev) -#endif { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); @@ -2099,6 +1958,7 @@ static size_t vxlan_get_size(const struct net_device *dev) 
nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ + nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ @@ -2120,8 +1980,88 @@ static size_t vxlan_get_size(const struct net_device *dev) static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { const struct vxlan_dev *vxlan = netdev_priv(dev); + const struct vxlan_rdst *dst = &vxlan->default_dst; + struct ifla_vxlan_port_range ports = { + .low = htons(vxlan->cfg.port_min), + .high = htons(vxlan->cfg.port_max), + }; + + if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni))) + goto nla_put_failure; + + if (!vxlan_addr_any(&dst->remote_ip)) { + if (dst->remote_ip.sa.sa_family == AF_INET) { + if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP, + dst->remote_ip.sin.sin_addr.s_addr)) + goto nla_put_failure; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6, + &dst->remote_ip.sin6.sin6_addr)) + goto nla_put_failure; +#endif + } + } + + if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) + goto nla_put_failure; + + if (!vxlan_addr_any(&vxlan->cfg.saddr)) { + if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { + if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, + vxlan->cfg.saddr.sin.sin_addr.s_addr)) + goto nla_put_failure; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, + &vxlan->cfg.saddr.sin6.sin6_addr)) + goto nla_put_failure; +#endif + } + } + + if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || + nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || + nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || + nla_put_u8(skb, IFLA_VXLAN_LEARNING, + !!(vxlan->flags & VXLAN_F_LEARN)) || + nla_put_u8(skb, 
IFLA_VXLAN_PROXY, + !!(vxlan->flags & VXLAN_F_PROXY)) || + nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) || + nla_put_u8(skb, IFLA_VXLAN_L2MISS, + !!(vxlan->flags & VXLAN_F_L2MISS)) || + nla_put_u8(skb, IFLA_VXLAN_L3MISS, + !!(vxlan->flags & VXLAN_F_L3MISS)) || + nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, + !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) || + nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || + nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || + nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || + nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, + !(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) || + nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, + !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || + nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, + !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || + nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, + !!(vxlan->flags & VXLAN_F_REMCSUM_TX)) || + nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, + !!(vxlan->flags & VXLAN_F_REMCSUM_RX))) + goto nla_put_failure; + + if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) + goto nla_put_failure; + + if (vxlan->flags & VXLAN_F_GBP && + nla_put_flag(skb, IFLA_VXLAN_GBP)) + goto nla_put_failure; + + if (vxlan->flags & VXLAN_F_GPE && + nla_put_flag(skb, IFLA_VXLAN_GPE)) + goto nla_put_failure; - if (nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port)) + if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL && + nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) goto nla_put_failure; return 0; @@ -2171,30 +2111,28 @@ static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, * is 0 here, so no matches. 
*/ if (dst->remote_ifindex == dev->ifindex) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) vxlan_dellink(vxlan->dev, &list_kill); -#else - vxlan_dellink(vxlan->dev); -#endif } unregister_netdevice_many(&list_kill); } -static int vxlan_lowerdev_event(struct notifier_block *unused, - unsigned long event, void *ptr) +static int vxlan_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); if (event == NETDEV_UNREGISTER) vxlan_handle_lowerdev_unregister(vn, dev); + else if (event == NETDEV_OFFLOAD_PUSH_VXLAN) + vxlan_push_rx_ports(dev); return NOTIFY_DONE; } static struct notifier_block vxlan_notifier_block __read_mostly = { - .notifier_call = vxlan_lowerdev_event, + .notifier_call = vxlan_netdevice_event, }; static __net_init int vxlan_init_net(struct net *net) @@ -2227,8 +2165,9 @@ static void __net_exit vxlan_exit_net(struct net *net) /* If vxlan->dev is in the same netns, it has already been added * to the list by the previous loop. 
*/ - if (!net_eq(dev_net(vxlan->dev), net)) + if (!net_eq(dev_net(vxlan->dev), net)) { unregister_netdevice_queue(vxlan->dev, &list); + } } unregister_netdevice_many(&list); @@ -2246,14 +2185,6 @@ int rpl_vxlan_init_module(void) { int rc; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) - vxlan_wq = create_workqueue("vxlan"); -#else - vxlan_wq = alloc_workqueue("vxlan", 0, 0); -#endif - if (!vxlan_wq) - return -ENOMEM; - get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); rc = register_pernet_subsys(&vxlan_net_ops); @@ -2275,7 +2206,6 @@ out3: out2: unregister_pernet_subsys(&vxlan_net_ops); out1: - destroy_workqueue(vxlan_wq); return rc; } @@ -2283,7 +2213,6 @@ void rpl_vxlan_cleanup_module(void) { rtnl_link_unregister(&vxlan_link_ops); unregister_netdevice_notifier(&vxlan_notifier_block); - destroy_workqueue(vxlan_wq); unregister_pernet_subsys(&vxlan_net_ops); /* rcu_barrier() is called by netns */ }

[ovs-dev,v3,16/28] datapath: compat: Update Geneve and VxLAN modules.

Commit Message

Patch