diff mbox

multicast for v2.6.29

Message ID 4A40E151.10007@iki.fi
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Timo Teras June 23, 2009, 2:06 p.m. UTC

Comments

Timo Teras June 23, 2009, 2:41 p.m. UTC | #1
Whoops. This was not supposed be sent on the list as-is.

This is more of an RFC, for what I want to do in future. Multicast routing
for NBMA destinations.

The patch is quick hack, and breaks ABI, and not intended for inclusion.
I suppose the multicast routing would need to be updated to Netlink era
before doing this kind of thing. And remove the hard coded limits such
as amount of MFC destinations and that you can have only one multicast
router user space process.

If you have any comments on the patch, they are appreciated.

Thanks,
  Timo

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 8a45569..13500a3 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -33,7 +33,7 @@ 
 #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
 #define SIOCGETRPF	(SIOCPROTOPRIVATE+2)
 
-#define MAXVIFS		32	
+#define MAXVIFS		256
 typedef unsigned long vifbitmap_t;	/* User mode code depends on this lot */
 typedef unsigned short vifi_t;
 #define ALL_VIFS	((vifi_t)(-1))
@@ -41,7 +41,7 @@  typedef unsigned short vifi_t;
 /*
  *	Same idea as select
  */
- 
+
 #define VIFM_SET(n,m)	((m)|=(1<<(n)))
 #define VIFM_CLR(n,m)	((m)&=~(1<<(n)))
 #define VIFM_ISSET(n,m)	((m)&(1<<(n)))
@@ -53,7 +53,7 @@  typedef unsigned short vifi_t;
  *	Passed by mrouted for an MRT_ADD_VIF - again we use the
  *	mrouted 3.6 structures for compatibility
  */
- 
+
 struct vifctl {
 	vifi_t	vifc_vifi;		/* Index of VIF */
 	unsigned char vifc_flags;	/* VIFF_ flags */
@@ -66,11 +66,12 @@  struct vifctl {
 #define VIFF_TUNNEL	0x1	/* IPIP tunnel */
 #define VIFF_SRCRT	0x2	/* NI */
 #define VIFF_REGISTER	0x4	/* register vif	*/
+#define VIFF_NBMA	0x10
 
 /*
  *	Cache manipulation structures for mrouted and PIMd
  */
- 
+
 struct mfcctl
 {
 	struct in_addr mfcc_origin;		/* Origin of mcast	*/
@@ -83,10 +84,10 @@  struct mfcctl
 	int	     mfcc_expire;
 };
 
-/* 
+/*
  *	Group count retrieval for mrouted
  */
- 
+
 struct sioc_sg_req
 {
 	struct in_addr src;
@@ -113,7 +114,7 @@  struct sioc_vif_req
  *	This is the format the mroute daemon expects to see IGMP control
  *	data. Magically happens to be like an IP packet as per the original
  */
- 
+
 struct igmpmsg
 {
 	__u32 unused1,unused2;
@@ -190,7 +191,7 @@  struct vif_device
 
 #define VIFF_STATIC 0x8000
 
-struct mfc_cache 
+struct mfc_cache
 {
 	struct mfc_cache *next;			/* Next entry on cache line 	*/
 	__be32 mfc_mcastgrp;			/* Group the entry belongs to 	*/
@@ -224,7 +225,7 @@  struct mfc_cache
 #define MFC_HASH(a,b)	(((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1))
 #else
 #define MFC_HASH(a,b)	((((__force u32)(__be32)a)^(((__force u32)(__be32)b)>>2))&(MFC_LINES-1))
-#endif		
+#endif
 
 #endif
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 1466644..5adea03 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -116,6 +116,31 @@  static struct net_protocol pim_protocol;
 
 static struct timer_list ipmr_expire_timer;
 
+static __be32 ipmr_get_skb_nbma(struct sk_buff *skb)
+{
+	union {
+		char addr[MAX_ADDR_LEN];
+		__be32 inaddr;
+	} u;
+
+	if (dev_parse_header(skb, u.addr) != 4)
+		return INADDR_ANY;
+
+	return u.inaddr;
+}
+
+static int ip_mr_match_vif_skb(struct vif_device *vif, struct sk_buff *skb)
+{
+	if (vif->dev != skb->dev)
+		return 0;
+
+	if (vif->flags & VIFF_NBMA)
+		return ipmr_get_skb_nbma(skb) == vif->remote;
+
+	return 1;
+}
+
+
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
@@ -468,6 +493,7 @@  static int vif_add(struct vifctl *vifc, int mrtsock)
 			return err;
 		}
 		break;
+	case VIFF_NBMA:
 	case 0:
 		dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
 		if (!dev)
@@ -502,7 +528,7 @@  static int vif_add(struct vifctl *vifc, int mrtsock)
 	v->pkt_in = 0;
 	v->pkt_out = 0;
 	v->link = dev->ifindex;
-	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER|VIFF_NBMA))
 		v->link = dev->iflink;
 
 	/* And finish update writing critical data */
@@ -1191,12 +1217,15 @@  static inline int ipmr_forward_finish(struct sk_buff *skb)
 {
 	struct ip_options * opt	= &(IPCB(skb)->opt);
 
-	IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+	IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
 
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
-	return dst_output(skb);
+	if (skb->dst != NULL)
+		return dst_output(skb);
+	else
+		return dev_queue_xmit(skb);
 }
 
 /*
@@ -1208,7 +1237,8 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct vif_device *vif = &vif_table[vifi];
 	struct net_device *dev;
-	struct rtable *rt;
+	struct net_device *fromdev = skb->dev;
+	struct rtable *rt = NULL;
 	int    encap = 0;
 
 	if (vif->dev == NULL)
@@ -1236,6 +1266,19 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 		if (ip_route_output_key(&init_net, &rt, &fl))
 			goto out_free;
 		encap = sizeof(struct iphdr);
+		dev = rt->u.dst.dev;
+	} else if (vif->flags&VIFF_NBMA) {
+		/* Fixme, we should take tunnel source address from the
+		 * tunnel device binding if it exists */
+		struct flowi fl = { .oif = vif->link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = vif->remote,
+						.tos = RT_TOS(iph->tos) } },
+				    .proto = IPPROTO_GRE };
+		if (ip_route_output_key(&init_net, &rt, &fl))
+			goto out_free;
+		encap = LL_RESERVED_SPACE(rt->u.dst.dev);
+		dev = vif->dev;
 	} else {
 		struct flowi fl = { .oif = vif->link,
 				    .nl_u = { .ip4_u =
@@ -1244,34 +1287,39 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 				    .proto = IPPROTO_IPIP };
 		if (ip_route_output_key(&init_net, &rt, &fl))
 			goto out_free;
+		dev = rt->u.dst.dev;
 	}
 
-	dev = rt->u.dst.dev;
+	if (!(vif->flags & VIFF_NBMA)) {
+		if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+			/* Do not fragment multicasts. Alas, IPv4 does not
+			   allow to send ICMP, so that packets will disappear
+			   to blackhole.
+			*/
 
-	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
-		/* Do not fragment multicasts. Alas, IPv4 does not
-		   allow to send ICMP, so that packets will disappear
-		   to blackhole.
-		 */
-
-		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
-		ip_rt_put(rt);
-		goto out_free;
+			IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+			goto out_free_rt;
+		}
 	}
 
 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
 
-	if (skb_cow(skb, encap)) {
-		ip_rt_put(rt);
-		goto out_free;
-	}
+	if (skb_cow(skb, encap))
+		goto out_free_rt;
 
 	vif->pkt_out++;
 	vif->bytes_out += skb->len;
 
 	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
+	if (vif->flags & VIFF_NBMA) {
+		ip_rt_put(rt);
+		skb->dst = NULL;
+		rt = NULL;
+	} else {
+		skb->dst = &rt->u.dst;
+	}
 	ip_decrease_ttl(ip_hdr(skb));
+	skb->dev = dev;
 
 	/* FIXME: forward and output firewalls used to be called here.
 	 * What do we do with netfilter? -- RR */
@@ -1280,6 +1328,10 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 		/* FIXME: extra output firewall step used to be here. --RR */
 		vif->dev->stats.tx_packets++;
 		vif->dev->stats.tx_bytes += skb->len;
+	} else if (vif->flags & VIFF_NBMA) {
+		if (dev_hard_header(skb, dev, ntohs(skb->protocol),
+				    &vif->remote, NULL, 4) < 0)
+			goto out_free_rt;
 	}
 
 	IPCB(skb)->flags |= IPSKB_FORWARDED;
@@ -1295,20 +1347,29 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 	 * not mrouter) cannot join to more than one interface - it will
 	 * result in receiving multiple packets.
 	 */
-	NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
+	NF_HOOK(PF_INET, NF_INET_FORWARD, skb, fromdev, dev,
 		ipmr_forward_finish);
 	return;
 
+out_free_rt:
+	if (rt != NULL)
+		ip_rt_put(rt);
 out_free:
 	kfree_skb(skb);
 	return;
 }
 
-static int ipmr_find_vif(struct net_device *dev)
+static int ipmr_find_vif(struct net_device *dev, __be32 nbma_origin)
 {
 	int ct;
 	for (ct=maxvif-1; ct>=0; ct--) {
-		if (vif_table[ct].dev == dev)
+		if (vif_table[ct].dev != dev)
+			continue;
+
+		if (vif_table[ct].flags & VIFF_NBMA) {
+			if (vif_table[ct].remote == nbma_origin)
+				break;
+		} else if (nbma_origin == INADDR_ANY)
 			break;
 	}
 	return ct;
@@ -1328,7 +1389,7 @@  static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
 	/*
 	 * Wrong interface: drop packet and (maybe) send PIM assert.
 	 */
-	if (vif_table[vif].dev != skb->dev) {
+	if (!ip_mr_match_vif_skb(&vif_table[vif], skb)) {
 		int true_vifi;
 
 		if (skb->rtable->fl.iif == 0) {
@@ -1347,7 +1408,7 @@  static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
 		}
 
 		cache->mfc_un.res.wrong_if++;
-		true_vifi = ipmr_find_vif(skb->dev);
+		true_vifi = ipmr_find_vif(skb->dev, ipmr_get_skb_nbma(skb));
 
 		if (true_vifi >= 0 && mroute_do_assert &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
@@ -1454,7 +1515,7 @@  int ip_mr_input(struct sk_buff *skb)
 			skb = skb2;
 		}
 
-		vif = ipmr_find_vif(skb->dev);
+		vif = ipmr_find_vif(skb->dev, ipmr_get_skb_nbma(skb));
 		if (vif >= 0) {
 			int err = ipmr_cache_unresolved(vif, skb);
 			read_unlock(&mrt_lock);
@@ -1634,7 +1695,7 @@  int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
 		}
 
 		dev = skb->dev;
-		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
+		if (dev == NULL || (vif = ipmr_find_vif(dev, INADDR_ANY)) < 0) {
 			read_unlock(&mrt_lock);
 			return -ENODEV;
 		}