diff mbox series

[SRU,N,1/4] UBUNTU: SAUCE: fan: tunnel multiple mapping mode (v3)

Message ID 20240501124023.683940-2-andrea.righi@canonical.com
State New
Headers show
Series re-enable Ubuntu FAN in the Noble kernel | expand

Commit Message

Andrea Righi May 1, 2024, 12:34 p.m. UTC
From: Jay Vosburgh <jay.vosburgh@canonical.com>

BugLink: https://bugs.launchpad.net/bugs/2064508

Switch to a single tunnel for all mappings, this removes the limitations
on how many mappings each tunnel can handle, and therefore how many Fan
slices each local address may hold.

NOTE: This introduces a new kernel netlink interface which needs updated
iproute2 support.

BugLink: http://bugs.launchpad.net/bugs/1470091
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
[saf: Fix conflicts during rebase to 4.12]
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
[arighi: support v6.8 ABI]
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
---
 include/net/ip_tunnels.h       |  14 +++
 include/uapi/linux/if_tunnel.h |  21 ++++
 net/ipv4/ip_tunnel.c           |   7 +-
 net/ipv4/ipip.c                | 179 ++++++++++++++++++++++++++++++++-
 4 files changed, 218 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 2d746f4c9a0a..98a3d8ad9415 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -109,6 +109,19 @@  struct ip_tunnel_prl_entry {
 };
 
 struct metadata_dst;
+/* A fan overlay /8 (250.0.0.0/8, for example) maps to exactly one /16
+ * underlay (10.88.0.0/16, for example).  Multiple local addresses within
+ * the /16 may be used, but a particular overlay may not span
+ * multiple underlay subnets.
+ *
+ * We store one underlay, indexed by the overlay's high order octet.
+ */
+#define FAN_OVERLAY_CNT		256
+
+struct ip_tunnel_fan {
+/*	u32 __rcu	*map;*/
+	u32		map[FAN_OVERLAY_CNT];
+};
 
 struct ip_tunnel {
 	struct ip_tunnel __rcu	*next;
@@ -149,6 +162,7 @@  struct ip_tunnel {
 #endif
 	struct ip_tunnel_prl_entry __rcu *prl;	/* potential router list */
 	unsigned int		prl_count;	/* # of entries in PRL */
+	struct ip_tunnel_fan	fan;
 	unsigned int		ip_tnl_net_id;
 	struct gro_cells	gro_cells;
 	__u32			fwmark;
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 102119628ff5..a862f0c483b7 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -77,6 +77,10 @@  enum {
 	IFLA_IPTUN_ENCAP_DPORT,
 	IFLA_IPTUN_COLLECT_METADATA,
 	IFLA_IPTUN_FWMARK,
+
+	__IFLA_IPTUN_VENDOR_BREAK, /* Ensure new entries do not hit the below. */
+	IFLA_IPTUN_FAN_MAP = 33,
+
 	__IFLA_IPTUN_MAX,
 };
 #define IFLA_IPTUN_MAX	(__IFLA_IPTUN_MAX - 1)
@@ -182,4 +186,21 @@  enum {
 		(TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT | \
 		TUNNEL_GTP_OPT)
 
+#define TUNNEL_FAN		__cpu_to_be16(0x8000)
+
+enum {
+	IFLA_FAN_UNSPEC,
+	IFLA_FAN_MAPPING,
+	__IFLA_FAN_MAX,
+};
+
+#define IFLA_FAN_MAX (__IFLA_FAN_MAX - 1)
+
+struct ip_tunnel_fan_map {
+	__be32		underlay;
+	__be32		overlay;
+	__u16		underlay_prefix;
+	__u16		overlay_prefix;
+};
+
 #endif /* _UAPI_IF_TUNNEL_H_ */
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 7af36e4f1647..5faebe94d071 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1227,6 +1227,11 @@  int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
 
+static int ip_tunnel_is_fan(struct ip_tunnel *tunnel)
+{
+	return tunnel->parms.i_flags & TUNNEL_FAN;
+}
+
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 			 struct ip_tunnel_parm *p, __u32 fwmark)
 {
@@ -1236,7 +1241,7 @@  int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
 
 	if (dev == itn->fb_tunnel_dev)
-		return -EINVAL;
+		return ip_tunnel_is_fan(tunnel) ? 0 : -EINVAL;
 
 	t = ip_tunnel_find(itn, p, dev->type);
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 03afa3871efc..2fbe16a33bc1 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -101,6 +101,7 @@ 
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <linux/inetdevice.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -267,6 +268,40 @@  static int mplsip_rcv(struct sk_buff *skb)
 }
 #endif
 
+static int ipip_tunnel_is_fan(struct ip_tunnel *tunnel)
+{
+	return tunnel->parms.i_flags & TUNNEL_FAN;
+}
+
+/*
+ * Determine fan tunnel endpoint to send packet to, based on the inner IP
+ * address.  For an overlay (inner) address Y.A.B.C, the transformation is
+ * F.G.A.B, where "F" and "G" are the first two octets of the underlay
+ * network (the network portion of a /16), "A" and "B" are the low order
+ * two octets of the underlay network host (the host portion of a /16),
+ * and "Y" is a configured first octet of the overlay network.
+ *
+ * E.g., underlay host 10.88.3.4 with an overlay of 99 would host overlay
+ * subnet 99.3.4.0/24.  An overlay network datagram from 99.3.4.5 to
+ * 99.6.7.8, would be directed to underlay host 10.88.6.7, which hosts
+ * overlay network 99.6.7.0/24.
+ */
+static int ipip_build_fan_iphdr(struct ip_tunnel *tunnel, struct sk_buff *skb, struct iphdr *iph)
+{
+	unsigned int overlay;
+	u32 daddr, underlay;
+
+	daddr = ntohl(ip_hdr(skb)->daddr);
+	overlay = daddr >> 24;
+	underlay = tunnel->fan.map[overlay];
+	if (!underlay)
+		return -EINVAL;
+
+	*iph = tunnel->parms.iph;
+	iph->daddr = htonl(underlay | ((daddr >> 8) & 0x0000ffff));
+	return 0;
+}
+
 /*
  *	This function assumes it is being called from dev_queue_xmit()
  *	and that skb is filled properly by that function.
@@ -277,6 +312,7 @@  static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr  *tiph = &tunnel->parms.iph;
 	u8 ipproto;
+	struct iphdr fiph;
 
 	if (!pskb_inet_may_pull(skb))
 		goto tx_error;
@@ -300,6 +336,14 @@  static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
 		goto tx_error;
 
+	if (ipip_tunnel_is_fan(tunnel)) {
+		if (ipip_build_fan_iphdr(tunnel, skb, &fiph))
+			goto tx_error;
+		tiph = &fiph;
+	} else {
+		tiph = &tunnel->parms.iph;
+	}
+
 	skb_set_inner_ipproto(skb, ipproto);
 
 	if (tunnel->collect_md)
@@ -427,6 +471,68 @@  static void ipip_netlink_parms(struct nlattr *data[],
 		*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
 }
 
+static void ipip_fan_free_map(struct ip_tunnel *t)
+{
+	memset(&t->fan.map, 0, sizeof(t->fan.map));
+}
+
+static int ipip_fan_set_map(struct ip_tunnel *t, struct ip_tunnel_fan_map *map)
+{
+	u32 overlay, overlay_mask, underlay, underlay_mask;
+
+	if ((map->underlay_prefix && map->underlay_prefix != 16) ||
+	    (map->overlay_prefix && map->overlay_prefix != 8))
+		return -EINVAL;
+
+	overlay = ntohl(map->overlay);
+	overlay_mask = ntohl(inet_make_mask(map->overlay_prefix));
+
+	underlay = ntohl(map->underlay);
+	underlay_mask = ntohl(inet_make_mask(map->underlay_prefix));
+
+	if ((overlay & ~overlay_mask) || (underlay & ~underlay_mask))
+		return -EINVAL;
+
+	if (!(overlay & overlay_mask) && (underlay & underlay_mask))
+		return -EINVAL;
+
+	t->parms.i_flags |= TUNNEL_FAN;
+
+	/* Special case: overlay 0 and underlay 0 clears all mappings */
+	if (!overlay && !underlay) {
+		ipip_fan_free_map(t);
+		return 0;
+	}
+
+	overlay >>= (32 - map->overlay_prefix);
+	t->fan.map[overlay] = underlay;
+
+	return 0;
+}
+
+static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t,
+			    struct ip_tunnel_parm *parms)
+{
+	struct ip_tunnel_fan_map *map;
+	struct nlattr *attr;
+	int rem, rv;
+
+	if (!data[IFLA_IPTUN_FAN_MAP])
+		return 0;
+
+	if (parms->iph.daddr)
+		return -EINVAL;
+
+	nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) {
+		map = nla_data(attr);
+		rv = ipip_fan_set_map(t, map);
+		if (rv)
+			return rv;
+	}
+
+	return 0;
+}
+
 static int ipip_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[],
 			struct netlink_ext_ack *extack)
@@ -435,15 +541,19 @@  static int ipip_newlink(struct net *src_net, struct net_device *dev,
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
 	__u32 fwmark = 0;
+	int err;
 
 	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
-		int err = ip_tunnel_encap_setup(t, &ipencap);
+		err = ip_tunnel_encap_setup(t, &ipencap);
 
 		if (err < 0)
 			return err;
 	}
 
 	ipip_netlink_parms(data, &p, &t->collect_md, &fwmark);
+	err = ipip_netlink_fan(data, t, &p);
+	if (err < 0)
+		return err;
 	return ip_tunnel_newlink(dev, tb, &p, fwmark);
 }
 
@@ -456,9 +566,10 @@  static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_encap ipencap;
 	bool collect_md;
 	__u32 fwmark = t->fwmark;
+	int err;
 
 	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
-		int err = ip_tunnel_encap_setup(t, &ipencap);
+		err = ip_tunnel_encap_setup(t, &ipencap);
 
 		if (err < 0)
 			return err;
@@ -467,6 +578,9 @@  static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 	ipip_netlink_parms(data, &p, &collect_md, &fwmark);
 	if (collect_md)
 		return -EINVAL;
+	err = ipip_netlink_fan(data, t, &p);
+	if (err < 0)
+		return err;
 
 	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
 	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -504,6 +618,8 @@  static size_t ipip_get_size(const struct net_device *dev)
 		nla_total_size(0) +
 		/* IFLA_IPTUN_FWMARK */
 		nla_total_size(4) +
+		/* IFLA_IPTUN_FAN_MAP */
+		nla_total_size(sizeof(struct ip_tunnel_fan_map)) * 256 +
 		0;
 }
 
@@ -536,6 +652,29 @@  static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (tunnel->collect_md)
 		if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
 			goto nla_put_failure;
+	if (tunnel->parms.i_flags & TUNNEL_FAN) {
+		struct nlattr *fan_nest;
+		int i;
+
+		fan_nest = nla_nest_start(skb, IFLA_IPTUN_FAN_MAP);
+		if (!fan_nest)
+			goto nla_put_failure;
+		for (i = 0; i < 256; i++) {
+			if (tunnel->fan.map[i]) {
+				struct ip_tunnel_fan_map map;
+
+				map.underlay = htonl(tunnel->fan.map[i]);
+				map.underlay_prefix = 16;
+				map.overlay = htonl(i << 24);
+				map.overlay_prefix = 8;
+				if (nla_put(skb, IFLA_FAN_MAPPING,
+					    sizeof(map), &map))
+					goto nla_put_failure;
+			}
+		}
+		nla_nest_end(skb, fan_nest);
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -556,6 +695,9 @@  static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_ENCAP_DPORT]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_COLLECT_METADATA]	= { .type = NLA_FLAG },
 	[IFLA_IPTUN_FWMARK]		= { .type = NLA_U32 },
+
+	[__IFLA_IPTUN_VENDOR_BREAK ... IFLA_IPTUN_MAX]	= { .type = NLA_BINARY },
+	[IFLA_IPTUN_FAN_MAP]		= { .type = NLA_NESTED },
 };
 
 static struct rtnl_link_ops ipip_link_ops __read_mostly = {
@@ -604,6 +746,23 @@  static struct pernet_operations ipip_net_ops = {
 	.size = sizeof(struct ip_tunnel_net),
 };
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *ipip_fan_header;
+static unsigned int ipip_fan_version = 3;
+
+static struct ctl_table ipip_fan_sysctls[] = {
+	{
+		.procname	= "version",
+		.data		= &ipip_fan_version,
+		.maxlen		= sizeof(ipip_fan_version),
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec,
+	},
+	{},
+};
+
+#endif /* CONFIG_SYSCTL */
+
 static int __init ipip_init(void)
 {
 	int err;
@@ -629,9 +788,22 @@  static int __init ipip_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+#ifdef CONFIG_SYSCTL
+	ipip_fan_header = register_net_sysctl(&init_net, "net/fan",
+					      ipip_fan_sysctls);
+	if (!ipip_fan_header) {
+		err = -ENOMEM;
+		goto sysctl_failed;
+	}
+#endif /* CONFIG_SYSCTL */
+
 out:
 	return err;
 
+#ifdef CONFIG_SYSCTL
+sysctl_failed:
+	rtnl_link_unregister(&ipip_link_ops);
+#endif /* CONFIG_SYSCTL */
 rtnl_link_failed:
 #if IS_ENABLED(CONFIG_MPLS)
 	xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
@@ -646,6 +818,9 @@  static int __init ipip_init(void)
 
 static void __exit ipip_fini(void)
 {
+#ifdef CONFIG_SYSCTL
+	unregister_net_sysctl_table(ipip_fan_header);
+#endif /* CONFIG_SYSCTL */
 	rtnl_link_unregister(&ipip_link_ops);
 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
 		pr_info("%s: can't deregister tunnel\n", __func__);