From patchwork Wed Apr 17 05:10:20 2013 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Amerigo Wang X-Patchwork-Id: 237149 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id DFFDC2C014D for ; Wed, 17 Apr 2013 15:11:13 +1000 (EST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756777Ab3DQFLJ (ORCPT ); Wed, 17 Apr 2013 01:11:09 -0400 Received: from mx1.redhat.com ([209.132.183.28]:6892 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752204Ab3DQFLH (ORCPT ); Wed, 17 Apr 2013 01:11:07 -0400 Received: from int-mx12.intmail.prod.int.phx2.redhat.com (int-mx12.intmail.prod.int.phx2.redhat.com [10.5.11.25]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id r3H5B43a025798 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK); Wed, 17 Apr 2013 01:11:04 -0400 Received: from cr0.sunraytvi.com (vpn1-115-1.nay.redhat.com [10.66.115.1]) by int-mx12.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id r3H5ARnc025015; Wed, 17 Apr 2013 01:10:57 -0400 From: Cong Wang To: netdev@vger.kernel.org Cc: David Stevens , Stephen Hemminger , "David S. 
Miller" , Cong Wang Subject: [Patch net-next v4 3/5] vxlan: add ipv6 support Date: Wed, 17 Apr 2013 13:10:20 +0800 Message-Id: <1366175423-27310-4-git-send-email-amwang@redhat.com> In-Reply-To: <1366175423-27310-1-git-send-email-amwang@redhat.com> References: <1366175423-27310-1-git-send-email-amwang@redhat.com> X-Scanned-By: MIMEDefang 2.68 on 10.5.11.25 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Cong Wang This patch adds IPv6 support to the vxlan device, since the latest revision of the VXLAN draft RFC already covers it: http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-03 Cc: David Stevens Cc: Stephen Hemminger Cc: David S. Miller Signed-off-by: Cong Wang --- drivers/net/vxlan.c | 639 +++++++++++++++++++++++++++++++++--------- include/uapi/linux/if_link.h | 2 + 2 files changed, 505 insertions(+), 136 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f8ac900..43ed40f 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -9,7 +9,6 @@ * * TODO * - use IANA UDP port number (when defined) - * - IPv6 (not in RFC) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -42,6 +41,11 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#include +#endif #define VXLAN_VERSION "0.1" @@ -56,6 +60,7 @@ #define VXLAN_VID_MASK (VXLAN_N_VID - 1) /* IP header + UDP + VXLAN + Ethernet header */ #define VXLAN_HEADROOM (20 + 8 + 8 + 14) +#define VXLAN6_HEADROOM (40 + 8 + 8 + 14) #define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. 
*/ @@ -81,9 +86,20 @@ struct vxlan_net { struct hlist_head vni_list[VNI_HASH_SIZE]; }; +struct vxlan_addr { + union { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr sa; + } u; +#define va_sin u.sin.sin_addr.s_addr +#define va_sin6 u.sin6.sin6_addr +#define va_sa u.sa.sa_family +}; + struct vxlan_rdst { struct rcu_head rcu; - __be32 remote_ip; + struct vxlan_addr remote_ip; __be16 remote_port; u32 remote_vni; u32 remote_ifindex; @@ -106,7 +122,7 @@ struct vxlan_dev { struct hlist_node hlist; struct net_device *dev; struct vxlan_rdst default_dst; /* default destination */ - __be32 saddr; /* source address */ + struct vxlan_addr saddr; /* source address */ __u16 port_min; /* source port range */ __u16 port_max; __u8 tos; /* TOS override */ @@ -128,6 +144,69 @@ struct vxlan_dev { #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 +static inline +bool vxlan_addr_equal(const struct vxlan_addr *a, const struct vxlan_addr *b) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (a->va_sa != b->va_sa) + return false; + if (a->va_sa == AF_INET6) + return ipv6_addr_equal(&a->va_sin6, &b->va_sin6); + else +#endif + return a->va_sin == b->va_sin; +} + +static inline bool vxlan_addr_any(const struct vxlan_addr *ipa) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (ipa->va_sa == AF_INET6) + return ipv6_addr_any(&ipa->va_sin6); + else +#endif + return ipa->va_sin == htonl(INADDR_ANY); +} + +static inline bool vxlan_addr_multicast(const struct vxlan_addr *ipa) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (ipa->va_sa == AF_INET6) + return ipv6_addr_is_multicast(&ipa->va_sin6); + else +#endif + return IN_MULTICAST(ntohl(ipa->va_sin)); +} + +static int vxlan_nla_get_addr(struct vxlan_addr *ip, struct nlattr *nla) +{ + if (nla_len(nla) >= sizeof(struct in6_addr)) { +#if IS_ENABLED(CONFIG_IPV6) + nla_memcpy(&ip->va_sin6, nla, sizeof(struct in6_addr)); + ip->va_sa = AF_INET6; + return 0; +#else + return -EAFNOSUPPORT; +#endif + } else if (nla_len(nla) >= sizeof(__be32)) { + ip->va_sin = 
nla_get_be32(nla); + ip->va_sa = AF_INET; + return 0; + } else { + return -EAFNOSUPPORT; + } +} + +static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, + const struct vxlan_addr *ip) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (ip->va_sa == AF_INET6) + return nla_put(skb, attr, sizeof(struct in6_addr), &ip->va_sin6); + else +#endif + return nla_put_be32(skb, attr, ip->va_sin); +} + /* salt for hash table */ static u32 vxlan_salt __read_mostly; @@ -174,7 +253,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (type == RTM_GETNEIGH) { ndm->ndm_family = AF_INET; - send_ip = rdst->remote_ip != htonl(INADDR_ANY); + send_ip = !vxlan_addr_any(&rdst->remote_ip); send_eth = !is_zero_ether_addr(fdb->eth_addr); } else ndm->ndm_family = AF_BRIDGE; @@ -186,7 +265,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; - if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip)) + if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) goto nla_put_failure; if (rdst->remote_port && rdst->remote_port != vxlan_port && @@ -218,7 +297,7 @@ static inline size_t vxlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(__be32)) /* NDA_DST */ + + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ + nla_total_size(sizeof(__be32)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ @@ -251,14 +330,14 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) +static void vxlan_ip_miss(struct net_device *dev, struct vxlan_addr *ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f; memset(&f, 0, sizeof f); f.state = NUD_STALE; - f.remote.remote_ip = ipa; /* goes to NDA_DST */ + f.remote.remote_ip = *ipa; /* goes to NDA_DST */ 
f.remote.remote_vni = VXLAN_N_VID; vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); @@ -313,14 +392,14 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, } /* Add/update destinations for multicast */ -static int vxlan_fdb_append(struct vxlan_fdb *f, - __be32 ip, __u32 port, __u32 vni, __u32 ifindex) +static int vxlan_fdb_append(struct vxlan_fdb *f, struct vxlan_addr *ip, + __u32 port, __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd_prev, *rd; rd_prev = NULL; for (rd = &f->remote; rd; rd = rd->remote_next) { - if (rd->remote_ip == ip && + if (vxlan_addr_equal(&rd->remote_ip, ip) && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) @@ -330,7 +409,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOBUFS; - rd->remote_ip = ip; + rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; @@ -341,7 +420,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, /* Add new entry to forwarding table -- assumes lock held */ static int vxlan_fdb_create(struct vxlan_dev *vxlan, - const u8 *mac, __be32 ip, + const u8 *mac, struct vxlan_addr *ip, __u16 state, __u16 flags, __u32 port, __u32 vni, __u32 ifindex) { @@ -375,13 +454,18 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax) return -ENOSPC; - netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip); +#if IS_ENABLED(CONFIG_IPV6) + if (ip->va_sa == AF_INET6) + netdev_dbg(vxlan->dev, "add %pM -> %pI6\n", mac, &ip->va_sin6); + else +#endif + netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip->va_sin); f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return -ENOMEM; notify = 1; - f->remote.remote_ip = ip; + f->remote.remote_ip = *ip; f->remote.remote_port = port; f->remote.remote_vni = vni; f->remote.remote_ifindex = ifindex; @@ -433,7 +517,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], { struct vxlan_dev 
*vxlan = netdev_priv(dev); struct net *net = dev_net(vxlan->dev); - __be32 ip; + struct vxlan_addr ip; u32 port, vni, ifindex; int err; @@ -446,10 +530,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (tb[NDA_DST] == NULL) return -EINVAL; - if (nla_len(tb[NDA_DST]) != sizeof(__be32)) - return -EAFNOSUPPORT; - - ip = nla_get_be32(tb[NDA_DST]); + err = vxlan_nla_get_addr(&ip, tb[NDA_DST]); + if (err) + return err; if (tb[NDA_PORT]) { if (nla_len(tb[NDA_PORT]) != sizeof(u32)) @@ -479,7 +562,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], ifindex = 0; spin_lock_bh(&vxlan->hash_lock); - err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, port, + err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, port, vni, ifindex); spin_unlock_bh(&vxlan->hash_lock); @@ -543,7 +626,7 @@ skip: * and Tunnel endpoint. */ static void vxlan_snoop(struct net_device *dev, - __be32 src_ip, const u8 *src_mac) + struct vxlan_addr *src_ip, const u8 *src_mac) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; @@ -552,15 +635,25 @@ static void vxlan_snoop(struct net_device *dev, f = vxlan_find_mac(vxlan, src_mac); if (likely(f)) { f->used = jiffies; - if (likely(f->remote.remote_ip == src_ip)) + if (likely(vxlan_addr_equal(&f->remote.remote_ip, src_ip))) return; - if (net_ratelimit()) - netdev_info(dev, - "%pM migrated from %pI4 to %pI4\n", - src_mac, &f->remote.remote_ip, &src_ip); + if (net_ratelimit()) { +#if IS_ENABLED(CONFIG_IPV6) + if (src_ip->va_sa == AF_INET6) + netdev_info(dev, + "%pM migrated from %pI6 to %pI6\n", + src_mac, &f->remote.remote_ip.va_sin6, + &src_ip->va_sin6); + else +#endif + netdev_info(dev, + "%pM migrated from %pI4 to %pI4\n", + src_mac, &f->remote.remote_ip.va_sin, + &src_ip->va_sin); + } - f->remote.remote_ip = src_ip; + f->remote.remote_ip = *src_ip; f->updated = jiffies; } else { /* learned new entry */ @@ -589,7 +682,8 @@ static bool vxlan_group_used(struct vxlan_net *vn, if 
(!netif_running(vxlan->dev)) continue; - if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip) + if (vxlan_addr_equal(&vxlan->default_dst.remote_ip, + &this->default_dst.remote_ip)) return true; } @@ -603,7 +697,7 @@ static int vxlan_join_group(struct net_device *dev) struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct sock *sk = vn->sock->sk; struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, + .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip.va_sin, .imr_ifindex = vxlan->default_dst.remote_ifindex, }; int err; @@ -615,7 +709,13 @@ static int vxlan_join_group(struct net_device *dev) /* Need to drop RTNL to call multicast join */ rtnl_unlock(); lock_sock(sk); - err = ip_mc_join_group(sk, &mreq); +#if IS_ENABLED(CONFIG_IPV6) + if (vxlan->default_dst.remote_ip.va_sa == AF_INET6) + err = ipv6_sock_mc_join(sk, vxlan->default_dst.remote_ifindex, + &vxlan->default_dst.remote_ip.va_sin6); + else +#endif + err = ip_mc_join_group(sk, &mreq); release_sock(sk); rtnl_lock(); @@ -631,7 +731,7 @@ static int vxlan_leave_group(struct net_device *dev) int err = 0; struct sock *sk = vn->sock->sk; struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, + .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip.va_sin, .imr_ifindex = vxlan->default_dst.remote_ifindex, }; @@ -642,7 +742,13 @@ static int vxlan_leave_group(struct net_device *dev) /* Need to drop RTNL to call multicast leave */ rtnl_unlock(); lock_sock(sk); - err = ip_mc_leave_group(sk, &mreq); +#if IS_ENABLED(CONFIG_IPV6) + if (vxlan->default_dst.remote_ip.va_sa == AF_INET6) + err = ipv6_sock_mc_drop(sk, vxlan->default_dst.remote_ifindex, + &vxlan->default_dst.remote_ip.va_sin6); + else +#endif + err = ip_mc_leave_group(sk, &mreq); release_sock(sk); rtnl_lock(); @@ -652,12 +758,16 @@ static int vxlan_leave_group(struct net_device *dev) /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, 
struct sk_buff *skb) { - struct iphdr *oip; + struct iphdr *oip = NULL; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *oip6 = NULL; +#endif struct vxlanhdr *vxh; struct vxlan_dev *vxlan; struct pcpu_tstats *stats; + struct vxlan_addr src_ip; __u32 vni; - int err; + int err = 0; /* pop off outer UDP header */ __skb_pull(skb, sizeof(struct udphdr)); @@ -694,7 +804,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) skb_reset_mac_header(skb); /* Re-examine inner Ethernet packet */ - oip = ip_hdr(skb); + if (skb->protocol == htons(ETH_P_IP)) + oip = ip_hdr(skb); +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + oip6 = ipv6_hdr(skb); +#endif + skb->protocol = eth_type_trans(skb, vxlan->dev); /* Ignore packet loops (and multicast echo) */ @@ -702,8 +818,19 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) vxlan->dev->dev_addr) == 0) goto drop; - if (vxlan->flags & VXLAN_F_LEARN) - vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source); + if (vxlan->flags & VXLAN_F_LEARN) { + if (oip) { + src_ip.va_sin = oip->saddr; + src_ip.va_sa = AF_INET; + } +#if IS_ENABLED(CONFIG_IPV6) + if (oip6) { + src_ip.va_sin6 = oip6->saddr; + src_ip.va_sa = AF_INET6; + } +#endif + vxlan_snoop(skb->dev, &src_ip, eth_hdr(skb)->h_source); + } __skb_tunnel_rx(skb, vxlan->dev); skb_reset_network_header(skb); @@ -719,11 +846,24 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) skb->encapsulation = 0; - err = IP_ECN_decapsulate(oip, skb); +#if IS_ENABLED(CONFIG_IPV6) + if (oip6) + err = IP6_ECN_decapsulate(oip6, skb); +#endif + if (oip) + err = IP_ECN_decapsulate(oip, skb); + if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", - &oip->saddr, oip->tos); + if (log_ecn_error) { +#if IS_ENABLED(CONFIG_IPV6) + if (oip6) + net_info_ratelimited("non-ECT from %pI6\n", + &oip6->saddr); +#endif + if (oip) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + 
&oip->saddr, oip->tos); + } if (err > 1) { ++vxlan->dev->stats.rx_frame_errors; ++vxlan->dev->stats.rx_errors; @@ -758,6 +898,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) u8 *arpptr, *sha; __be32 sip, tip; struct neighbour *n; + struct vxlan_addr ipa; if (dev->flags & IFF_NOARP) goto out; @@ -799,7 +940,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) } f = vxlan_find_mac(vxlan, n->ha); - if (f && f->remote.remote_ip == htonl(INADDR_ANY)) { + if (f && vxlan_addr_any(&f->remote.remote_ip)) { /* bridge-local neighbor */ neigh_release(n); goto out; @@ -817,8 +958,11 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) if (netif_rx_ni(reply) == NET_RX_DROP) dev->stats.rx_dropped++; - } else if (vxlan->flags & VXLAN_F_L3MISS) - vxlan_ip_miss(dev, tip); + } else if (vxlan->flags & VXLAN_F_L3MISS) { + ipa.va_sin = tip; + ipa.va_sa = AF_INET; + vxlan_ip_miss(dev, &ipa); + } out: consume_skb(skb); return NETDEV_TX_OK; @@ -840,6 +984,14 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; pip = ip_hdr(skb); n = neigh_lookup(&arp_tbl, &pip->daddr, dev); + if (!n && vxlan->flags & VXLAN_F_L3MISS) { + struct vxlan_addr ipa; + ipa.va_sin = pip->daddr; + ipa.va_sa = AF_INET; + vxlan_ip_miss(dev, &ipa); + return false; + } + break; default: return false; @@ -856,8 +1008,8 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) } neigh_release(n); return diff; - } else if (vxlan->flags & VXLAN_F_L3MISS) - vxlan_ip_miss(dev, pip->daddr); + } + return false; } @@ -867,7 +1019,8 @@ static void vxlan_sock_free(struct sk_buff *skb) } /* On transmit, associate with the tunnel socket */ -static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) +static inline void vxlan_set_owner(struct net_device *dev, + struct sk_buff *skb) { struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct sock *sk = vn->sock->sk; @@ -916,15 +1069,26 @@ 
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, { struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); + struct vxlan_addr loopback; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; __skb_pull(skb, skb_network_offset(skb)); + if (dst_vxlan->default_dst.remote_ip.va_sa == AF_INET) { + loopback.va_sin = htonl(INADDR_LOOPBACK); + loopback.va_sa = AF_INET; + } +#if IS_ENABLED(CONFIG_IPV6) + else { + loopback.va_sin6 = in6addr_loopback; + loopback.va_sa = AF_INET6; + } +#endif + if (dst_vxlan->flags & VXLAN_F_LEARN) - vxlan_snoop(skb->dev, htonl(INADDR_LOOPBACK), - eth_hdr(skb)->h_source); + vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source); u64_stats_update_begin(&tx_stats->syncp); tx_stats->tx_packets++; @@ -946,22 +1110,29 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct rtable *rt; - const struct iphdr *old_iph; + const struct iphdr *old_iph = NULL; struct iphdr *iph; struct vxlanhdr *vxh; struct udphdr *uh; struct flowi4 fl4; - __be32 dst; - __u16 src_port, dst_port; +#if IS_ENABLED(CONFIG_IPV6) + struct flowi6 fl6; + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); + struct sock *sk = vn->sock->sk; + struct ipv6hdr *ip6h; +#endif + const struct vxlan_addr *dst; + struct dst_entry *ndst = NULL; + __u16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; dst_port = rdst->remote_port ? 
rdst->remote_port : vxlan_port; vni = rdst->remote_vni; - dst = rdst->remote_ip; + dst = &rdst->remote_ip; - if (!dst) { + if (vxlan_addr_any(dst)) { if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan); @@ -975,60 +1146,115 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, skb->encapsulation = 1; } - /* Need space for new headers (invalidates iph ptr) */ - if (skb_cow_head(skb, VXLAN_HEADROOM)) - goto drop; + ttl = vxlan->ttl; + tos = vxlan->tos; + if (dst->va_sa == AF_INET) { + /* Need space for new headers (invalidates iph ptr) */ + if (skb_cow_head(skb, VXLAN_HEADROOM)) + goto drop; - old_iph = ip_hdr(skb); + old_iph = ip_hdr(skb); + if (!ttl && IN_MULTICAST(ntohl(dst->va_sin))) + ttl = 1; - ttl = vxlan->ttl; - if (!ttl && IN_MULTICAST(ntohl(dst))) - ttl = 1; + if (tos == 1) + tos = ip_tunnel_get_dsfield(old_iph, skb); - tos = vxlan->tos; - if (tos == 1) - tos = ip_tunnel_get_dsfield(old_iph, skb); - - src_port = vxlan_src_port(vxlan, skb); - - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst->remote_ifindex; - fl4.flowi4_tos = RT_TOS(tos); - fl4.daddr = dst; - fl4.saddr = vxlan->saddr; - - rt = ip_route_output_key(dev_net(dev), &fl4); - if (IS_ERR(rt)) { - netdev_dbg(dev, "no route to %pI4\n", &dst); - dev->stats.tx_carrier_errors++; - goto tx_error; - } + src_port = vxlan_src_port(vxlan, skb); - if (rt->dst.dev == dev) { - netdev_dbg(dev, "circular route to %pI4\n", &dst); - ip_rt_put(rt); - dev->stats.collisions++; - goto tx_error; - } + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_oif = rdst->remote_ifindex; + fl4.flowi4_tos = RT_TOS(tos); + fl4.daddr = dst->va_sin; + fl4.saddr = vxlan->saddr.va_sin; + + rt = ip_route_output_key(dev_net(dev), &fl4); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", &dst->va_sin); + dev->stats.tx_carrier_errors++; + goto tx_error; + } + + if (rt->dst.dev == dev) { + netdev_dbg(dev, "circular route to %pI4\n", &dst->va_sin); + 
ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } - /* Bypass encapsulation if the destination is local */ - if (rt->rt_flags & RTCF_LOCAL && - !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { - struct vxlan_dev *dst_vxlan; + /* Bypass encapsulation if the destination is local */ + if (rt->rt_flags & RTCF_LOCAL && + !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + struct vxlan_dev *dst_vxlan; - ip_rt_put(rt); - dst_vxlan = vxlan_find_vni(dev_net(dev), vni); - if (!dst_vxlan) + ip_rt_put(rt); + dst_vxlan = vxlan_find_vni(dev_net(dev), vni); + if (!dst_vxlan) + goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); + return NETDEV_TX_OK; + } + + ndst = &rt->dst; + } else { +#if IS_ENABLED(CONFIG_IPV6) + const struct ipv6hdr *old_iph6; + u32 flags; + + /* Need space for new headers (invalidates iph ptr) */ + if (skb_cow_head(skb, VXLAN6_HEADROOM)) + goto drop; + + old_iph6 = ipv6_hdr(skb); + if (!ttl && ipv6_addr_is_multicast(&dst->va_sin6)) + ttl = 1; + + if (tos == 1) + tos = ipv6_get_dsfield(old_iph6); + + src_port = vxlan_src_port(vxlan, skb); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = rdst->remote_ifindex; + fl6.flowi6_tos = RT_TOS(tos); + fl6.daddr = dst->va_sin6; + fl6.saddr = vxlan->saddr.va_sin6; + fl6.flowi6_proto = skb->protocol; + + if (ip6_dst_lookup(sk, &ndst, &fl6)) { + netdev_dbg(dev, "no route to %pI6\n", &dst->va_sin6); + dev->stats.tx_carrier_errors++; goto tx_error; - vxlan_encap_bypass(skb, vxlan, dst_vxlan); - return NETDEV_TX_OK; + } + + if (ndst->dev == dev) { + netdev_dbg(dev, "circular route to %pI6\n", &dst->va_sin6); + dst_release(ndst); + dev->stats.collisions++; + goto tx_error; + } + + /* Bypass encapsulation if the destination is local */ + flags = ((struct rt6_info *)ndst)->rt6i_flags; + if (flags & RTF_LOCAL && + !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + struct vxlan_dev *dst_vxlan; + + dst_release(ndst); + dst_vxlan = vxlan_find_vni(dev_net(dev), vni); + if (!dst_vxlan) + goto 
tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); + return NETDEV_TX_OK; + } +#endif } memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + skb_dst_set(skb, ndst); vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); @@ -1044,27 +1270,55 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, uh->len = htons(skb->len); uh->check = 0; - __skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; - iph->protocol = IPPROTO_UDP; - iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb); - iph->daddr = dst; - iph->saddr = fl4.saddr; - iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - tunnel_ip_select_ident(skb, old_iph, &rt->dst); - - nf_reset(skb); + if (dst->va_sa == AF_INET) { + __skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = IPPROTO_UDP; + iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + iph->daddr = dst->va_sin; + iph->saddr = fl4.saddr; + iph->ttl = ttl ? 
: ip4_dst_hoplimit(ndst); + tunnel_ip_select_ident(skb, old_iph, ndst); + } else { +#if IS_ENABLED(CONFIG_IPV6) + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + } else + uh->check = csum_ipv6_magic(&fl6.saddr, &fl6.daddr, + skb->len, IPPROTO_UDP, + csum_partial(uh, skb->len, 0)); + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6h->version = 6; + ip6h->priority = 0; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl ? : ip6_dst_hoplimit(ndst); + ip6h->daddr = fl6.daddr; + ip6h->saddr = fl6.saddr; +#endif + } vxlan_set_owner(dev, skb); if (handle_offloads(skb)) goto drop; - iptunnel_xmit(skb, dev); +#if IS_ENABLED(CONFIG_IPV6) + if (dst->va_sa == AF_INET6) + ip6tunnel_xmit(skb, dev); + else +#endif + iptunnel_xmit(skb, dev); return NETDEV_TX_OK; drop: @@ -1106,7 +1360,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) did_rsc = false; rdst0 = &vxlan->default_dst; - if (rdst0->remote_ip == htonl(INADDR_ANY) && + if (vxlan_addr_any(&rdst0->remote_ip) && (vxlan->flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); @@ -1184,7 +1438,7 @@ static int vxlan_open(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { err = vxlan_join_group(dev); if (err) return err; @@ -1218,7 +1472,7 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) + if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) vxlan_leave_group(dev); del_timer_sync(&vxlan->age_timer); @@ -1268,7 +1522,12 @@ static void 
vxlan_setup(struct net_device *dev) eth_hw_addr_random(dev); ether_setup(dev); - dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; +#if IS_ENABLED(CONFIG_IPV6) + if (vxlan->default_dst.remote_ip.va_sa == AF_INET6) + dev->hard_header_len = ETH_HLEN + VXLAN6_HEADROOM; + else +#endif + dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; dev->netdev_ops = &vxlan_netdev_ops; dev->destructor = vxlan_free; @@ -1305,8 +1564,10 @@ static void vxlan_setup(struct net_device *dev) static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_VXLAN_REMOTE6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, @@ -1342,6 +1603,25 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) return -ERANGE; } + if (data[IFLA_VXLAN_REMOTE]) { + __be32 gaddr = nla_get_be32(data[IFLA_VXLAN_REMOTE]); + if (!IN_MULTICAST(ntohl(gaddr))) { + pr_debug("group address is not IPv4 multicast\n"); + return -EADDRNOTAVAIL; + } + } else if (data[IFLA_VXLAN_REMOTE6]) { +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr gaddr; + nla_memcpy(&gaddr, data[IFLA_VXLAN_REMOTE6], sizeof(gaddr)); + if (!ipv6_addr_is_multicast(&gaddr)) { + pr_debug("group address is not IPv6 multicast\n"); + return -EADDRNOTAVAIL; + } +#else + return -EPFNOSUPPORT; +#endif + } + if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); @@ -1386,11 +1666,31 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, } dst->remote_vni = vni; - if (data[IFLA_VXLAN_REMOTE]) - dst->remote_ip = nla_get_be32(data[IFLA_VXLAN_REMOTE]); + if (data[IFLA_VXLAN_REMOTE]) { + 
dst->remote_ip.va_sin = nla_get_be32(data[IFLA_VXLAN_REMOTE]); + dst->remote_ip.va_sa = AF_INET; + } else if (data[IFLA_VXLAN_REMOTE6]) { +#if IS_ENABLED(CONFIG_IPV6) + nla_memcpy(&dst->remote_ip.va_sin6, data[IFLA_VXLAN_REMOTE6], + sizeof(struct in6_addr)); + dst->remote_ip.va_sa = AF_INET6; +#else + return -EPFNOSUPPORT; +#endif + } - if (data[IFLA_VXLAN_LOCAL]) - vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); + if (data[IFLA_VXLAN_LOCAL]) { + vxlan->saddr.va_sin = nla_get_be32(data[IFLA_VXLAN_LOCAL]); + vxlan->saddr.va_sa = AF_INET; + } else if (data[IFLA_VXLAN_LOCAL6]) { +#if IS_ENABLED(CONFIG_IPV6) + nla_memcpy(&vxlan->saddr.va_sin6, data[IFLA_VXLAN_LOCAL6], + sizeof(struct in6_addr)); + vxlan->saddr.va_sa = AF_INET6; +#else + return -EPFNOSUPPORT; +#endif + } if (data[IFLA_VXLAN_LINK] && (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { @@ -1468,9 +1768,9 @@ static size_t vxlan_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ - nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_REMOTE */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_REMOTE{6} */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ - nla_total_size(sizeof(__be32))+ /* IFLA_VXLAN_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ @@ -1496,14 +1796,34 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni)) goto nla_put_failure; - if (dst->remote_ip && nla_put_be32(skb, IFLA_VXLAN_REMOTE, dst->remote_ip)) - goto nla_put_failure; + if (!vxlan_addr_any(&dst->remote_ip)) { + if (dst->remote_ip.va_sa == AF_INET) { + if (nla_put_be32(skb, IFLA_VXLAN_REMOTE, dst->remote_ip.va_sin)) + goto nla_put_failure; + } else { +#if IS_ENABLED(CONFIG_IPV6) + if (nla_put(skb, 
IFLA_VXLAN_REMOTE6, sizeof(struct in6_addr), + &dst->remote_ip.va_sin6)) + goto nla_put_failure; +#endif + } + } if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; - if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr)) - goto nla_put_failure; + if (!vxlan_addr_any(&vxlan->saddr)) { + if (vxlan->saddr.va_sa == AF_INET) { + if (nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr.va_sin)) + goto nla_put_failure; + } else { +#if IS_ENABLED(CONFIG_IPV6) + if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr), + &vxlan->saddr.va_sin6)) + goto nla_put_failure; +#endif + } + } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || @@ -1542,38 +1862,82 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .fill_info = vxlan_fill_info, }; -static __net_init int vxlan_init_net(struct net *net) +/* Create UDP socket for encapsulation receive. AF_INET6 socket + * could be used for both IPv4 and IPv6 communications. 
+ */ +#if IS_ENABLED(CONFIG_IPV6) +static __net_init int create_sock(struct net *net, struct sock **sk) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct sockaddr_in6 vxlan_addr = { + .sin6_family = AF_INET6, + .sin6_port = htons(vxlan_port), + }; + int rc; + + rc = sock_create_kern(AF_INET6, SOCK_DGRAM, IPPROTO_UDP, &vn->sock); + if (rc < 0) { + pr_debug("UDP socket create failed\n"); + return rc; + } + /* Put in proper namespace */ + *sk = vn->sock->sk; + sk_change_net(*sk, net); + + rc = kernel_bind(vn->sock, (struct sockaddr *)&vxlan_addr, + sizeof(struct sockaddr_in6)); + if (rc < 0) { + pr_debug("bind for UDP socket %pI6:%u (%d)\n", + &vxlan_addr.sin6_addr, ntohs(vxlan_addr.sin6_port), rc); + sk_release_kernel(*sk); + vn->sock = NULL; + return rc; + } + return 0; +} +#else +static __net_init int create_sock(struct net *net, struct sock **sk) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct sock *sk; struct sockaddr_in vxlan_addr = { .sin_family = AF_INET, + .sin_port = htons(vxlan_port), .sin_addr.s_addr = htonl(INADDR_ANY), }; int rc; - unsigned h; - /* Create UDP socket for encapsulation receive. 
*/ rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock); if (rc < 0) { pr_debug("UDP socket create failed\n"); return rc; } /* Put in proper namespace */ - sk = vn->sock->sk; - sk_change_net(sk, net); - - vxlan_addr.sin_port = htons(vxlan_port); + *sk = vn->sock->sk; + sk_change_net(*sk, net); - rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr, - sizeof(vxlan_addr)); + rc = kernel_bind(vn->sock, (struct sockaddr *)&vxlan_addr, + sizeof(struct sockaddr_in)); if (rc < 0) { pr_debug("bind for UDP socket %pI4:%u (%d)\n", &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc); - sk_release_kernel(sk); + sk_release_kernel(*sk); vn->sock = NULL; return rc; } + return 0; +} +#endif + +static __net_init int vxlan_init_net(struct net *net) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct sock *sk; + int rc; + unsigned h; + + rc = create_sock(net, &sk); + if (rc < 0) + return rc; /* Disable multicast loopback */ inet_sk(sk)->mc_loop = 0; @@ -1582,6 +1946,9 @@ static __net_init int vxlan_init_net(struct net *net) udp_sk(sk)->encap_type = 1; udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; udp_encap_enable(); +#if IS_ENABLED(CONFIG_IPV6) + udpv6_encap_enable(); +#endif for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vn->vni_list[h]); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9922704..f47ab4c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -309,6 +309,8 @@ enum { IFLA_VXLAN_RSC, IFLA_VXLAN_L2MISS, IFLA_VXLAN_L3MISS, + IFLA_VXLAN_REMOTE6, + IFLA_VXLAN_LOCAL6, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)