diff mbox series

[SRU,N,2/4] UBUNTU: SAUCE: fan: add VXLAN implementation

Message ID 20240501124023.683940-3-andrea.righi@canonical.com
State New
Headers show
Series re-enable Ubuntu FAN in the Noble kernel | expand

Commit Message

Andrea Righi May 1, 2024, 12:34 p.m. UTC
From: Jay Vosburgh <jay.vosburgh@canonical.com>

BugLink: https://bugs.launchpad.net/bugs/2064508

Generify the fan mapping support and utilise that to implement fan
mappings over vxlan transport.

Expose the existance of this functionality (when the module is loaded)
via an additional sysctl marker.

Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
[apw@canonical.com: added feature marker for fan over vxlan.]
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
[ arighi: adjust conflicts in vxlan_xmit() and vxlan_xmit_one() for 6.4-rc1 ]
[ arighi: support v6.8 ABI ]
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
---
 drivers/net/vxlan/vxlan_core.c | 245 +++++++++++++++++++++++++++++++++
 include/net/ip_tunnels.h       |  18 ++-
 include/net/vxlan.h            |   2 +
 include/uapi/linux/if_link.h   |   1 +
 include/uapi/linux/if_tunnel.h |   4 +-
 net/ipv4/ip_tunnel.c           |   7 +-
 net/ipv4/ipip.c                | 242 ++++++++++++++++++++++++--------
 7 files changed, 453 insertions(+), 66 deletions(-)
diff mbox series

Patch

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 16106e088c63..f16a4679e5ee 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -13,6 +13,7 @@ 
 #include <linux/slab.h>
 #include <linux/udp.h>
 #include <linux/igmp.h>
+#include <linux/inetdevice.h>
 #include <linux/if_ether.h>
 #include <linux/ethtool.h>
 #include <net/arp.h>
@@ -71,6 +72,167 @@  static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
 	       ip_tunnel_collect_metadata();
 }
 
+static struct ip_fan_map *vxlan_fan_find_map(struct vxlan_dev *vxlan, __be32 daddr)
+{
+	struct ip_fan_map *fan_map;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) {
+		if (fan_map->overlay ==
+		    (daddr & inet_make_mask(fan_map->overlay_prefix))) {
+			rcu_read_unlock();
+			return fan_map;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static void vxlan_fan_flush_map(struct vxlan_dev *vxlan)
+{
+	struct ip_fan_map *fan_map;
+
+	list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) {
+		list_del_rcu(&fan_map->list);
+		kfree_rcu(fan_map, rcu);
+	}
+}
+
+static int vxlan_fan_del_map(struct vxlan_dev *vxlan, __be32 overlay)
+{
+	struct ip_fan_map *fan_map;
+
+	fan_map = vxlan_fan_find_map(vxlan, overlay);
+	if (!fan_map)
+		return -ENOENT;
+
+	list_del_rcu(&fan_map->list);
+	kfree_rcu(fan_map, rcu);
+
+	return 0;
+}
+
+static int vxlan_fan_add_map(struct vxlan_dev *vxlan, struct ifla_fan_map *map)
+{
+	__be32 overlay_mask, underlay_mask;
+	struct ip_fan_map *fan_map;
+
+	overlay_mask = inet_make_mask(map->overlay_prefix);
+	underlay_mask = inet_make_mask(map->underlay_prefix);
+
+	netdev_dbg(vxlan->dev, "vfam: map: o %x/%d u %x/%d om %x um %x\n",
+		   map->overlay, map->overlay_prefix,
+		   map->underlay, map->underlay_prefix,
+		   overlay_mask, underlay_mask);
+
+	if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask))
+		return -EINVAL;
+
+	if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask))
+		return -EINVAL;
+
+	/* Special case: overlay 0 and underlay 0: flush all mappings */
+	if (!map->overlay && !map->underlay) {
+		vxlan_fan_flush_map(vxlan);
+		return 0;
+	}
+
+	/* Special case: overlay set and underlay 0: clear map for overlay */
+	if (!map->underlay)
+		return vxlan_fan_del_map(vxlan, map->overlay);
+
+	if (vxlan_fan_find_map(vxlan, map->overlay))
+		return -EEXIST;
+
+	fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL);
+	fan_map->underlay = map->underlay;
+	fan_map->overlay = map->overlay;
+	fan_map->underlay_prefix = map->underlay_prefix;
+	fan_map->overlay_mask = ntohl(overlay_mask);
+	fan_map->overlay_prefix = map->overlay_prefix;
+
+	list_add_tail_rcu(&fan_map->list, &vxlan->fan.fan_maps);
+
+	return 0;
+}
+
+static int vxlan_parse_fan_map(struct nlattr *data[], struct vxlan_dev *vxlan)
+{
+	struct ifla_fan_map *map;
+	struct nlattr *attr;
+	int rem, rv;
+
+	nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) {
+		map = nla_data(attr);
+		rv = vxlan_fan_add_map(vxlan, map);
+		if (rv)
+			return rv;
+	}
+
+	return 0;
+}
+
+static int vxlan_fan_build_rdst(struct vxlan_dev *vxlan, struct sk_buff *skb,
+				      struct vxlan_rdst *fan_rdst)
+{
+	struct ip_fan_map *f_map;
+	union vxlan_addr *va;
+	u32 daddr, underlay;
+	struct arphdr *arp;
+	void *arp_ptr;
+	struct ethhdr *eth;
+	struct iphdr *iph;
+
+	eth = eth_hdr(skb);
+	switch (eth->h_proto) {
+	case htons(ETH_P_IP):
+		iph = ip_hdr(skb);
+		if (!iph)
+			return -EINVAL;
+		daddr = iph->daddr;
+		break;
+	case htons(ETH_P_ARP):
+		arp = arp_hdr(skb);
+		if (!arp)
+			return -EINVAL;
+		arp_ptr = arp + 1;
+		netdev_dbg(vxlan->dev,
+			   "vfbr: arp sha %pM sip %pI4 tha %pM tip %pI4\n",
+			   arp_ptr, arp_ptr + skb->dev->addr_len,
+			   arp_ptr + skb->dev->addr_len + 4,
+			   arp_ptr + (skb->dev->addr_len * 2) + 4);
+		arp_ptr += (skb->dev->addr_len * 2) + 4;
+		memcpy(&daddr, arp_ptr, 4);
+		break;
+	default:
+		netdev_dbg(vxlan->dev, "vfbr: unknown eth p %x\n", eth->h_proto);
+		return -EINVAL;
+	}
+
+	f_map = vxlan_fan_find_map(vxlan, daddr);
+	if (!f_map)
+		return -EINVAL;
+
+	daddr = ntohl(daddr);
+	underlay = ntohl(f_map->underlay);
+	if (!underlay)
+		return -EINVAL;
+
+	memset(fan_rdst, 0, sizeof(*fan_rdst));
+	va = &fan_rdst->remote_ip;
+	va->sa.sa_family = AF_INET;
+	fan_rdst->remote_vni = vxlan->default_dst.remote_vni;
+	va->sin.sin_addr.s_addr = htonl(underlay |
+					((daddr & ~f_map->overlay_mask) >>
+					 (32 - f_map->overlay_prefix -
+					  (32 - f_map->underlay_prefix))));
+	netdev_dbg(vxlan->dev, "vfbr: daddr %x ul %x dst %x\n",
+		   daddr, underlay, va->sin.sin_addr.s_addr);
+
+	return 0;
+}
+
 /* Find VXLAN socket based on network namespace, address family, UDP port,
  * enabled unshareable flags and socket device binding (see l3mdev with
  * non-default VRF).
@@ -2433,6 +2595,13 @@  void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 		}
 
+		if (fan_has_map(&vxlan->fan) && rt->rt_flags & RTCF_LOCAL) {
+			netdev_dbg(dev, "discard fan to localhost %pI4\n",
+				   &rdst->remote_ip.sin.sin_addr.s_addr);
+			ip_rt_put(rt);
+			goto tx_free;
+		}
+
 		if (!info) {
 			/* Bypass encapsulation if the destination is local */
 			err = encap_bypass_if_local(skb, dev, vxlan, AF_INET,
@@ -2569,6 +2738,7 @@  void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	dst_release(ndst);
 	dev->stats.tx_errors++;
 	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
+tx_free:
 	kfree_skb(skb);
 }
 
@@ -2716,6 +2886,20 @@  static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 		rcu_read_unlock();
 	}
 
+	if (fan_has_map(&vxlan->fan)) {
+		struct vxlan_rdst fan_rdst;
+
+		netdev_dbg(vxlan->dev, "vxlan_xmit p %x d %pM\n",
+			   eth->h_proto, eth->h_dest);
+		if (vxlan_fan_build_rdst(vxlan, skb, &fan_rdst)) {
+			dev->stats.tx_dropped++;
+			kfree_skb(skb);
+			return NETDEV_TX_OK;
+		}
+		vxlan_xmit_one(skb, dev, vni, &fan_rdst, 0);
+		return NETDEV_TX_OK;
+	}
+
 	eth = eth_hdr(skb);
 	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
 	did_rsc = false;
@@ -3325,6 +3509,8 @@  static void vxlan_setup(struct net_device *dev)
 		spin_lock_init(&vxlan->hash_lock[h]);
 		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
 	}
+
+	INIT_LIST_HEAD(&vxlan->fan.fan_maps);
 }
 
 static void vxlan_ether_setup(struct net_device *dev)
@@ -4055,6 +4241,12 @@  static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 		conf->remote_ip.sa.sa_family = AF_INET6;
 	}
 
+	if (data[IFLA_VXLAN_FAN_MAP]) {
+		err = vxlan_parse_fan_map(data, vxlan);
+		if (err)
+			return err;
+	}
+
 	if (data[IFLA_VXLAN_LOCAL]) {
 		if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
 			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old");
@@ -4440,6 +4632,7 @@  static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(0) + /* IFLA_VXLAN_GPE */
 		nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */
+		nla_total_size(sizeof(struct ip_fan_map) * 256) +
 		0;
 }
 
@@ -4486,6 +4679,26 @@  static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		}
 	}
 
+	if (fan_has_map(&vxlan->fan)) {
+		struct nlattr *fan_nest;
+		struct ip_fan_map *fan_map;
+
+		fan_nest = nla_nest_start(skb, IFLA_VXLAN_FAN_MAP);
+		if (!fan_nest)
+			goto nla_put_failure;
+		list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) {
+			struct ifla_fan_map map;
+
+			map.underlay = fan_map->underlay;
+			map.underlay_prefix = fan_map->underlay_prefix;
+			map.overlay = fan_map->overlay;
+			map.overlay_prefix = fan_map->overlay_prefix;
+			if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map))
+				goto nla_put_failure;
+		}
+		nla_nest_end(skb, fan_nest);
+	}
+
 	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
 		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
@@ -4826,6 +5039,22 @@  static __net_init int vxlan_init_net(struct net *net)
 					 NULL);
 }
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *vxlan_fan_header;
+static unsigned int vxlan_fan_version = 4;
+
+static struct ctl_table vxlan_fan_sysctls[] = {
+	{
+		.procname	= "vxlan",
+		.data		= &vxlan_fan_version,
+		.maxlen		= sizeof(vxlan_fan_version),
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec,
+	},
+	{},
+};
+#endif /* CONFIG_SYSCTL */
+
 static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
@@ -4903,7 +5132,20 @@  static int __init vxlan_init_module(void)
 
 	vxlan_vnifilter_init();
 
+#ifdef CONFIG_SYSCTL
+	vxlan_fan_header = register_net_sysctl(&init_net, "net/fan",
+					      vxlan_fan_sysctls);
+	if (!vxlan_fan_header) {
+		rc = -ENOMEM;
+		goto sysctl_failed;
+	}
+#endif /* CONFIG_SYSCTL */
+
 	return 0;
+#ifdef CONFIG_SYSCTL
+sysctl_failed:
+	rtnl_link_unregister(&vxlan_link_ops);
+#endif /* CONFIG_SYSCTL */
 out4:
 	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
 out3:
@@ -4917,6 +5159,9 @@  late_initcall(vxlan_init_module);
 
 static void __exit vxlan_cleanup_module(void)
 {
+#ifdef CONFIG_SYSCTL
+	unregister_net_sysctl_table(vxlan_fan_header);
+#endif /* CONFIG_SYSCTL */
 	vxlan_vnifilter_uninit();
 	rtnl_link_unregister(&vxlan_link_ops);
 	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 98a3d8ad9415..8c53e4179854 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -118,9 +118,18 @@  struct metadata_dst;
  */
 #define FAN_OVERLAY_CNT		256
 
+struct ip_fan_map {
+	__be32			underlay;
+	__be32			overlay;
+	u16			underlay_prefix;
+	u16			overlay_prefix;
+	u32			overlay_mask;
+	struct list_head	list;
+	struct rcu_head		rcu;
+};
+
 struct ip_tunnel_fan {
-/*	u32 __rcu	*map;*/
-	u32		map[FAN_OVERLAY_CNT];
+	struct list_head	fan_maps;
 };
 
 struct ip_tunnel {
@@ -170,6 +179,11 @@  struct ip_tunnel {
 	bool			ignore_df;
 };
 
+static inline int fan_has_map(const struct ip_tunnel_fan *fan)
+{
+	return !list_empty(&fan->fan_maps);
+}
+
 struct tnl_ptk_info {
 	__be16 flags;
 	__be16 proto;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 33ba6fc151cf..e55d5b1483db 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -294,6 +294,8 @@  struct vxlan_dev {
 	struct net	  *net;		/* netns for packet i/o */
 	struct vxlan_rdst default_dst;	/* default destination */
 
+	struct ip_tunnel_fan fan;
+
 	struct timer_list age_timer;
 	spinlock_t	  hash_lock[FDB_HASH_SIZE];
 	unsigned int	  addrcnt;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ab9bcff96e4d..4345ceae5d99 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1378,6 +1378,7 @@  enum {
 	IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
 	IFLA_VXLAN_LOCALBYPASS,
 	IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */
+	IFLA_VXLAN_FAN_MAP = 33,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index a862f0c483b7..f1401060d5b5 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -186,8 +186,6 @@  enum {
 		(TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT | \
 		TUNNEL_GTP_OPT)
 
-#define TUNNEL_FAN		__cpu_to_be16(0x8000)
-
 enum {
 	IFLA_FAN_UNSPEC,
 	IFLA_FAN_MAPPING,
@@ -196,7 +194,7 @@  enum {
 
 #define IFLA_FAN_MAX (__IFLA_FAN_MAX - 1)
 
-struct ip_tunnel_fan_map {
+struct ifla_fan_map {
 	__be32		underlay;
 	__be32		overlay;
 	__u16		underlay_prefix;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 5faebe94d071..b6ca5742fc46 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1227,11 +1227,6 @@  int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
 
-static int ip_tunnel_is_fan(struct ip_tunnel *tunnel)
-{
-	return tunnel->parms.i_flags & TUNNEL_FAN;
-}
-
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 			 struct ip_tunnel_parm *p, __u32 fwmark)
 {
@@ -1241,7 +1236,7 @@  int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
 
 	if (dev == itn->fb_tunnel_dev)
-		return ip_tunnel_is_fan(tunnel) ? 0 : -EINVAL;
+		return fan_has_map(&tunnel->fan) ? 0 : -EINVAL;
 
 	t = ip_tunnel_find(itn, p, dev->type);
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2fbe16a33bc1..a044da845559 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -102,6 +102,7 @@ 
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
 #include <linux/inetdevice.h>
+#include <linux/rculist.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -268,37 +269,144 @@  static int mplsip_rcv(struct sk_buff *skb)
 }
 #endif
 
-static int ipip_tunnel_is_fan(struct ip_tunnel *tunnel)
+static struct ip_fan_map *ipip_fan_find_map(struct ip_tunnel *t, __be32 daddr)
 {
-	return tunnel->parms.i_flags & TUNNEL_FAN;
+	struct ip_fan_map *fan_map;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(fan_map, &t->fan.fan_maps, list) {
+		if (fan_map->overlay ==
+		    (daddr & inet_make_mask(fan_map->overlay_prefix))) {
+			rcu_read_unlock();
+			return fan_map;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
 }
 
-/*
- * Determine fan tunnel endpoint to send packet to, based on the inner IP
- * address.  For an overlay (inner) address Y.A.B.C, the transformation is
- * F.G.A.B, where "F" and "G" are the first two octets of the underlay
- * network (the network portion of a /16), "A" and "B" are the low order
- * two octets of the underlay network host (the host portion of a /16),
- * and "Y" is a configured first octet of the overlay network.
+/* Determine fan tunnel endpoint to send packet to, based on the inner IP
+ * address.  
+ *
+ * Given a /8 overlay and /16 underlay, for an overlay (inner) address
+ * Y.A.B.C, the transformation is F.G.A.B, where "F" and "G" are the first
+ * two octets of the underlay network (the network portion of a /16), "A"
+ * and "B" are the low order two octets of the underlay network host (the
+ * host portion of a /16), and "Y" is a configured first octet of the
+ * overlay network.
+ *
+ * E.g., underlay host 10.88.3.4/16 with an overlay of 99.0.0.0/8 would
+ * host overlay subnet 99.3.4.0/24.  An overlay network datagram from
+ * 99.3.4.5 to 99.6.7.8, would be directed to underlay host 10.88.6.7,
+ * which hosts overlay network subnet 99.6.7.0/24.  This transformation is
+ * described in detail further below.
+ *
+ * Using netmasks for the overlay and underlay other than /8 and /16, as
+ * shown above, can yield larger (or smaller) overlay subnets, with the
+ * trade-off of allowing fewer (or more) underlay hosts to participate.
+ *
+ * The size of each overlay network subnet is defined by the total of the
+ * network mask of the overlay plus the size of host portion of the
+ * underlay network. In the above example, /8 + /16 = /24.
+ *
+ * E.g., consider underlay host 10.99.238.5/20 and overlay 99.0.0.0/8. In
+ * this case, the network portion of the underlay is 10.99.224.0/20, and
+ * the host portion is 0.0.14.5 (12 bits).  To determine the overlay
+ * network subnet, the 12 bits of host portion are left shifted 12 bits
+ * (/20 - /8) and ORed with the overlay subnet prefix.  This yields an
+ * overlay subnet of 99.224.80/20, composed of 8 bits overlay, followed by
+ * 12 bits underlay.  This yields 12 bits in the overlay network portion,
+ * allowing for 4094 addresses in each overlay network subnet.  The
+ * trade-off is that fewer hosts may participate in the underlay network,
+ * as its host address size has shrunk from 16 bits (65534 addresses) in
+ * the first example to 12 bits (4094 addresses) here.
+ *
+ * For fewer hosts per overlay subnet (permitting a larger number of
+ * underlay hosts to participate), the underlay netmask may be made
+ * smaller.
+ *
+ * E.g., underlay host 10.111.1.2/12 (network 10.96.0.0/12, host portion
+ * is 0.15.1.2, 20 bits) with an overlay of 33.0.0.0/8 would left shift
+ * the 20 bits of host by 4 (so that it's highest order bit is adjacent to
+ * the lowest order bit of the /8 overlay).  This yields an overlay subnet
+ * of 33.240.16.32/28 (8 bits overlay, 20 bits from the host portion of
+ * the underlay).  This provides more addresses for the underlay network
+ * (approximately 2^20), but each host's segment of the overlay provides
+ * only 4 bits of addresses (14 usable).
+ *
+ * It is also possible to adjust the overlay subnet.
+ *
+ * For an overlay of 240.0.0.0/5 and underlay of 10.88.0.0/20, consider
+ * underlay host 10.88.129.2; the 12 bits of host, 0.0.1.2, are left
+ * shifted 15 bits (/20 - /5), yielding an overlay network of
+ * 240.129.0.0/17.  An underlay host of 10.88.244.215 would yield an
+ * overlay network of 242.107.128.0/17.
+ *
+ * For an overlay of 100.64.0.0/10 and underlay of 10.224.220.0/24, for
+ * underlay host 10.224.220.10, the underlay host portion (.10) is left
+ * shifted 14 bits, yielding an overlay network subnet of 100.66.128.0/18.
+ * This would permit 254 addresses on the underlay, with each overlay
+ * segment providing approximately 2^14 - 2 addresses (16382).
+ *
+ * For packets being encapsulated, the overlay network destination IP
+ * address is deconstructed into its overlay and underlay-derived
+ * portions.  The underlay portion (determined by the overlay mask and
+ * overlay subnet mask) is right shifted according to the size of the
+ * underlay network mask.  This value is then ORed with the network
+ * portion of the underlay network to produce the underlay network
+ * destination for the encapsulated datagram.
+ *
+ * For example, using the initial example of underlay 10.88.3.4/16 and
+ * overlay 99.0.0.0/8, with underlay host 10.88.3.4/16 providing overlay
+ * subnet 99.3.4.0/24 with specfic host 99.3.4.5.  A datagram from
+ * 99.3.4.5 to 99.6.7.8 would first have the underlay host derived portion
+ * of the address extracted.  This is a number of bits equal to underlay
+ * network host portion.  In the destination address, the highest order of
+ * these bits is one bit lower than the lowest order bit from the overlay
+ * network mask.
  *
- * E.g., underlay host 10.88.3.4 with an overlay of 99 would host overlay
- * subnet 99.3.4.0/24.  An overlay network datagram from 99.3.4.5 to
- * 99.6.7.8, would be directed to underlay host 10.88.6.7, which hosts
- * overlay network 99.6.7.0/24.
+ * Using the sample value, 99.6.7.8, the overlay mask is /8, and the
+ * underlay mask is /16 (leaving 16 bits for the host portion).  The bits
+ * to be shifted are the middle two octets, 0.6.7.0, as this is 99.6.7.8
+ * ANDed with the mask 0x00ffff00 (which is 16 bits, the highest order of
+ * which is 1 bit lower than the lowest order overlay address bit).
+ *
+ * These octets, 0.6.7.0, are then right shifted 8 bits, yielding 0.0.6.7.
+ * This value is then ORed with the underlay network portion,
+ * 10.88.0.0/16, providing 10.88.6.7 as the final underlay destination for
+ * the encapuslated datagram.
+ *
+ * Another transform using the final example: overlay 100.64.0.0/10 and
+ * underlay 10.224.220.0/24.  Consider overlay address 100.66.128.1
+ * sending a datagram to 100.66.200.5.  In this case, 8 bits (the host
+ * portion size of 10.224.220.0/24) beginning after the 100.64/10 overlay
+ * prefix are masked off, yielding 0.2.192.0.  This is right shifted 14
+ * (32 - 10 - (32 - 24), i.e., the number of bits between the overlay
+ * network portion and the underlay host portion) bits, yielding 0.0.0.11.
+ * This is ORed with the underlay network portion, 10.224.220.0/24, giving
+ * the underlay destination of 10.224.220.11 for overlay destination
+ * 100.66.200.5.
  */
 static int ipip_build_fan_iphdr(struct ip_tunnel *tunnel, struct sk_buff *skb, struct iphdr *iph)
 {
-	unsigned int overlay;
+	struct ip_fan_map *f_map;
 	u32 daddr, underlay;
 
+	f_map = ipip_fan_find_map(tunnel, ip_hdr(skb)->daddr);
+	if (!f_map)
+		return -ENOENT;
+
 	daddr = ntohl(ip_hdr(skb)->daddr);
-	overlay = daddr >> 24;
-	underlay = tunnel->fan.map[overlay];
+	underlay = ntohl(f_map->underlay);
 	if (!underlay)
 		return -EINVAL;
 
 	*iph = tunnel->parms.iph;
-	iph->daddr = htonl(underlay | ((daddr >> 8) & 0x0000ffff));
+	iph->daddr = htonl(underlay |
+			   ((daddr & ~f_map->overlay_mask) >>
+			    (32 - f_map->overlay_prefix -
+			     (32 - f_map->underlay_prefix))));
 	return 0;
 }
 
@@ -336,7 +444,7 @@  static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
 		goto tx_error;
 
-	if (ipip_tunnel_is_fan(tunnel)) {
+	if (fan_has_map(&tunnel->fan)) {
 		if (ipip_build_fan_iphdr(tunnel, skb, &fiph))
 			goto tx_error;
 		tiph = &fiph;
@@ -407,6 +515,8 @@  static const struct net_device_ops ipip_netdev_ops = {
 
 static void ipip_tunnel_setup(struct net_device *dev)
 {
+	struct ip_tunnel *t = netdev_priv(dev);
+
 	dev->netdev_ops		= &ipip_netdev_ops;
 	dev->header_ops		= &ip_tunnel_header_ops;
 
@@ -419,6 +529,7 @@  static void ipip_tunnel_setup(struct net_device *dev)
 	dev->features		|= IPIP_FEATURES;
 	dev->hw_features	|= IPIP_FEATURES;
 	ip_tunnel_setup(dev, ipip_net_id);
+	INIT_LIST_HEAD(&t->fan.fan_maps);
 }
 
 static int ipip_tunnel_init(struct net_device *dev)
@@ -471,41 +582,65 @@  static void ipip_netlink_parms(struct nlattr *data[],
 		*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
 }
 
-static void ipip_fan_free_map(struct ip_tunnel *t)
+static void ipip_fan_flush_map(struct ip_tunnel *t)
 {
-	memset(&t->fan.map, 0, sizeof(t->fan.map));
+	struct ip_fan_map *fan_map;
+
+	list_for_each_entry_rcu(fan_map, &t->fan.fan_maps, list) {
+		list_del_rcu(&fan_map->list);
+		kfree_rcu(fan_map, rcu);
+	}
 }
 
-static int ipip_fan_set_map(struct ip_tunnel *t, struct ip_tunnel_fan_map *map)
+static int ipip_fan_del_map(struct ip_tunnel *t, __be32 overlay)
 {
-	u32 overlay, overlay_mask, underlay, underlay_mask;
+	struct ip_fan_map *fan_map;
 
-	if ((map->underlay_prefix && map->underlay_prefix != 16) ||
-	    (map->overlay_prefix && map->overlay_prefix != 8))
-		return -EINVAL;
+	fan_map = ipip_fan_find_map(t, overlay);
+	if (!fan_map)
+		return -ENOENT;
 
-	overlay = ntohl(map->overlay);
-	overlay_mask = ntohl(inet_make_mask(map->overlay_prefix));
+	list_del_rcu(&fan_map->list);
+	kfree_rcu(fan_map, rcu);
 
-	underlay = ntohl(map->underlay);
-	underlay_mask = ntohl(inet_make_mask(map->underlay_prefix));
+	return 0;
+}
 
-	if ((overlay & ~overlay_mask) || (underlay & ~underlay_mask))
-		return -EINVAL;
+static int ipip_fan_add_map(struct ip_tunnel *t, struct ifla_fan_map *map)
+{
+	__be32 overlay_mask, underlay_mask;
+	struct ip_fan_map *fan_map;
 
-	if (!(overlay & overlay_mask) && (underlay & underlay_mask))
+	overlay_mask = inet_make_mask(map->overlay_prefix);
+	underlay_mask = inet_make_mask(map->underlay_prefix);
+
+	if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask))
 		return -EINVAL;
 
-	t->parms.i_flags |= TUNNEL_FAN;
+	if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask))
+		return -EINVAL;
 
-	/* Special case: overlay 0 and underlay 0 clears all mappings */
-	if (!overlay && !underlay) {
-		ipip_fan_free_map(t);
+	/* Special case: overlay 0 and underlay 0: flush all mappings */
+	if (!map->overlay && !map->underlay) {
+		ipip_fan_flush_map(t);
 		return 0;
 	}
+	
+	/* Special case: overlay set and underlay 0: clear map for overlay */
+	if (!map->underlay)
+		return ipip_fan_del_map(t, map->overlay);
+
+	if (ipip_fan_find_map(t, map->overlay))
+		return -EEXIST;
+
+	fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL);
+	fan_map->underlay = map->underlay;
+	fan_map->overlay = map->overlay;
+	fan_map->underlay_prefix = map->underlay_prefix;
+	fan_map->overlay_mask = ntohl(overlay_mask);
+	fan_map->overlay_prefix = map->overlay_prefix;
 
-	overlay >>= (32 - map->overlay_prefix);
-	t->fan.map[overlay] = underlay;
+	list_add_tail_rcu(&fan_map->list, &t->fan.fan_maps);
 
 	return 0;
 }
@@ -513,7 +648,7 @@  static int ipip_fan_set_map(struct ip_tunnel *t, struct ip_tunnel_fan_map *map)
 static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t,
 			    struct ip_tunnel_parm *parms)
 {
-	struct ip_tunnel_fan_map *map;
+	struct ifla_fan_map *map;
 	struct nlattr *attr;
 	int rem, rv;
 
@@ -525,7 +660,7 @@  static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t,
 
 	nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) {
 		map = nla_data(attr);
-		rv = ipip_fan_set_map(t, map);
+		rv = ipip_fan_add_map(t, map);
 		if (rv)
 			return rv;
 	}
@@ -619,7 +754,7 @@  static size_t ipip_get_size(const struct net_device *dev)
 		/* IFLA_IPTUN_FWMARK */
 		nla_total_size(4) +
 		/* IFLA_IPTUN_FAN_MAP */
-		nla_total_size(sizeof(struct ip_tunnel_fan_map)) * 256 +
+		nla_total_size(sizeof(struct ifla_fan_map)) * 256 +
 		0;
 }
 
@@ -652,25 +787,22 @@  static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (tunnel->collect_md)
 		if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
 			goto nla_put_failure;
-	if (tunnel->parms.i_flags & TUNNEL_FAN) {
+	if (fan_has_map(&tunnel->fan)) {
 		struct nlattr *fan_nest;
-		int i;
+		struct ip_fan_map *fan_map;
 
 		fan_nest = nla_nest_start(skb, IFLA_IPTUN_FAN_MAP);
 		if (!fan_nest)
 			goto nla_put_failure;
-		for (i = 0; i < 256; i++) {
-			if (tunnel->fan.map[i]) {
-				struct ip_tunnel_fan_map map;
-
-				map.underlay = htonl(tunnel->fan.map[i]);
-				map.underlay_prefix = 16;
-				map.overlay = htonl(i << 24);
-				map.overlay_prefix = 8;
-				if (nla_put(skb, IFLA_FAN_MAPPING,
-					    sizeof(map), &map))
-					goto nla_put_failure;
-			}
+		list_for_each_entry_rcu(fan_map, &tunnel->fan.fan_maps, list) {
+			struct ifla_fan_map map;
+
+			map.underlay = fan_map->underlay;
+			map.underlay_prefix = fan_map->underlay_prefix;
+			map.overlay = fan_map->overlay;
+			map.overlay_prefix = fan_map->overlay_prefix;
+			if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map))
+				goto nla_put_failure;
 		}
 		nla_nest_end(skb, fan_nest);
 	}