diff mbox

multicast over nbma gre tunnels

Message ID 4A8126AA.9050309@iki.fi
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Timo Teras Aug. 11, 2009, 8:07 a.m. UTC
Hi,

I'm trying to figure out the proper way to do multicast forwarding
over nbma gre tunnels. Currently, the userland opennhrp daemon just
listens on a packet socket, and calls sendto() for each target it
wants to forward the packet to. Obviously this is slow.

I started to look at how to do it in the kernel. I've been playing
with the multicast forwarding code, and tried something like the
patch below (see the end of this mail).

However, there are several drawbacks to this:
- ABI breaks: MAXVIFS is changed (used in userland visible struct)
- it does not forward locally originated link-local multicast
  traffic (such traffic does not traverse the mcast forwarding code)
- there can be only one mrouter application, so I can't have
  opennhrp manage the gre device while some other app manages the
  forwarding between interfaces

It looks to me like the multicast forwarding code would need
a rewrite anyway, using netlink so that it can be managed by
multiple apps without hard limits such as MAXVIFS.

But then again, I'm wondering whether the gre nbma part should be
made part of ipmr.c (have "vifs" for each nbma entity, and hook
link-local traffic to ipmr.c too) or of ip_gre.c (a specific api for
managing the nbma forwardings within the gre code).

The other problem is that each multicast packet could be
forwarded to, say, a hundred nodes via the same physical eth. I'm
wondering if just copying the skb a hundred times and queuing the
copies on the same device causes problems? Any suggestions on how
this could be done in a smart way?

Incidentally, I noticed that when using this with a multicast
sender sending to gre1, and forwarding gre1->multiple gre1 nbma
destinations, the sender application had a large soft-irq
usage. Forwarding from a real interface to multiple nbma
destinations seemed to have the expected cpu usage level. I suppose
I introduced (or there already exists) some sort of routing loop, so
that each packet got handled TTL times instead of once.

Thanks,
  Timo


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 0d45b4e..406ef6f 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -33,7 +33,7 @@ 
 #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
 #define SIOCGETRPF	(SIOCPROTOPRIVATE+2)
 
-#define MAXVIFS		32	
+#define MAXVIFS		256
 typedef unsigned long vifbitmap_t;	/* User mode code depends on this lot */
 typedef unsigned short vifi_t;
 #define ALL_VIFS	((vifi_t)(-1))
@@ -66,6 +66,7 @@  struct vifctl {
 #define VIFF_TUNNEL	0x1	/* IPIP tunnel */
 #define VIFF_SRCRT	0x2	/* NI */
 #define VIFF_REGISTER	0x4	/* register vif	*/
+#define VIFF_NBMA	0x10
 
 /*
  *	Cache manipulation structures for mrouted and PIMd
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 13e9dd3..43c988b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -105,6 +105,31 @@  static struct net_protocol pim_protocol;
 
 static struct timer_list ipmr_expire_timer;
 
+static __be32 ipmr_get_skb_nbma(struct sk_buff *skb)
+{
+	union {
+		char addr[MAX_ADDR_LEN];
+		__be32 inaddr;
+	} u;
+
+	if (dev_parse_header(skb, u.addr) != 4)
+		return INADDR_ANY;
+
+	return u.inaddr;
+}
+
+static int ip_mr_match_vif_skb(struct vif_device *vif, struct sk_buff *skb)
+{
+	if (vif->dev != skb->dev)
+		return 0;
+
+	if (vif->flags & VIFF_NBMA)
+		return ipmr_get_skb_nbma(skb) == vif->remote;
+
+	return 1;
+}
+
+
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
@@ -470,6 +495,7 @@  static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
 			return err;
 		}
 		break;
+	case VIFF_NBMA:
 	case 0:
 		dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
 		if (!dev)
@@ -504,7 +530,7 @@  static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
 	v->pkt_in = 0;
 	v->pkt_out = 0;
 	v->link = dev->ifindex;
-	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER|VIFF_NBMA))
 		v->link = dev->iflink;
 
 	/* And finish update writing critical data */
@@ -1212,12 +1238,15 @@  static inline int ipmr_forward_finish(struct sk_buff *skb)
 {
 	struct ip_options * opt	= &(IPCB(skb)->opt);
 
-	IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+	IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
 
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
-	return dst_output(skb);
+	if (skb->dst != NULL)
+		return dst_output(skb);
+	else
+		return dev_queue_xmit(skb);
 }
 
 /*
@@ -1230,7 +1259,8 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct vif_device *vif = &net->ipv4.vif_table[vifi];
 	struct net_device *dev;
-	struct rtable *rt;
+	struct net_device *fromdev = skb->dev;
+	struct rtable *rt = NULL;
 	int    encap = 0;
 
 	if (vif->dev == NULL)
@@ -1257,6 +1287,19 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 		if (ip_route_output_key(net, &rt, &fl))
 			goto out_free;
 		encap = sizeof(struct iphdr);
+		dev = rt->u.dst.dev;
+	} else if (vif->flags&VIFF_NBMA) {
+		/* Fixme, we should take tunnel source address from the
+		 * tunnel device binding if it exists */
+		struct flowi fl = { .oif = vif->link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = vif->remote,
+						.tos = RT_TOS(iph->tos) } },
+				    .proto = IPPROTO_GRE };
+		if (ip_route_output_key(&init_net, &rt, &fl))
+			goto out_free;
+		encap = LL_RESERVED_SPACE(rt->u.dst.dev);
+		dev = vif->dev;
 	} else {
 		struct flowi fl = { .oif = vif->link,
 				    .nl_u = { .ip4_u =
@@ -1265,34 +1308,39 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 				    .proto = IPPROTO_IPIP };
 		if (ip_route_output_key(net, &rt, &fl))
 			goto out_free;
+		dev = rt->u.dst.dev;
 	}
 
-	dev = rt->u.dst.dev;
+	if (!(vif->flags & VIFF_NBMA)) {
+		if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+			/* Do not fragment multicasts. Alas, IPv4 does not
+			   allow to send ICMP, so that packets will disappear
+			   to blackhole.
+			*/
 
-	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
-		/* Do not fragment multicasts. Alas, IPv4 does not
-		   allow to send ICMP, so that packets will disappear
-		   to blackhole.
-		 */
-
-		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
-		ip_rt_put(rt);
-		goto out_free;
+			IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+			goto out_free_rt;
+		}
 	}
 
 	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
 
-	if (skb_cow(skb, encap)) {
-		ip_rt_put(rt);
-		goto out_free;
-	}
+	if (skb_cow(skb, encap))
+		goto out_free_rt;
 
 	vif->pkt_out++;
 	vif->bytes_out += skb->len;
 
 	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
+	if (vif->flags & VIFF_NBMA) {
+		ip_rt_put(rt);
+		skb->dst = NULL;
+		rt = NULL;
+	} else {
+		skb->dst = &rt->u.dst;
+	}
 	ip_decrease_ttl(ip_hdr(skb));
+	skb->dev = dev;
 
 	/* FIXME: forward and output firewalls used to be called here.
 	 * What do we do with netfilter? -- RR */
@@ -1301,6 +1349,10 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 		/* FIXME: extra output firewall step used to be here. --RR */
 		vif->dev->stats.tx_packets++;
 		vif->dev->stats.tx_bytes += skb->len;
+	} else if (vif->flags & VIFF_NBMA) {
+		if (dev_hard_header(skb, dev, ntohs(skb->protocol),
+				    &vif->remote, NULL, 4) < 0)
+			goto out_free_rt;
 	}
 
 	IPCB(skb)->flags |= IPSKB_FORWARDED;
@@ -1316,21 +1368,30 @@  static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 	 * not mrouter) cannot join to more than one interface - it will
 	 * result in receiving multiple packets.
 	 */
-	NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
+	NF_HOOK(PF_INET, NF_INET_FORWARD, skb, fromdev, dev,
 		ipmr_forward_finish);
 	return;
 
+out_free_rt:
+	if (rt != NULL)
+		ip_rt_put(rt);
 out_free:
 	kfree_skb(skb);
 	return;
 }
 
-static int ipmr_find_vif(struct net_device *dev)
+static int ipmr_find_vif(struct net_device *dev, __be32 nbma_origin)
 {
 	struct net *net = dev_net(dev);
 	int ct;
 	for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) {
-		if (net->ipv4.vif_table[ct].dev == dev)
+		if (net->ipv4.vif_table[ct].dev != dev)
+			continue;
+
+		if (net->ipv4.vif_table[ct].flags & VIFF_NBMA) {
+			if (net->ipv4.vif_table[ct].remote == nbma_origin)
+				break;
+		} else if (nbma_origin == INADDR_ANY)
 			break;
 	}
 	return ct;
@@ -1351,7 +1412,7 @@  static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
 	/*
 	 * Wrong interface: drop packet and (maybe) send PIM assert.
 	 */
-	if (net->ipv4.vif_table[vif].dev != skb->dev) {
+	if (!ip_mr_match_vif_skb(&net->ipv4.vif_table[vif], skb)) {
 		int true_vifi;
 
 		if (skb->rtable->fl.iif == 0) {
@@ -1370,7 +1431,7 @@  static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
 		}
 
 		cache->mfc_un.res.wrong_if++;
-		true_vifi = ipmr_find_vif(skb->dev);
+		true_vifi = ipmr_find_vif(skb->dev, ipmr_get_skb_nbma(skb));
 
 		if (true_vifi >= 0 && net->ipv4.mroute_do_assert &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
@@ -1479,7 +1540,7 @@  int ip_mr_input(struct sk_buff *skb)
 			skb = skb2;
 		}
 
-		vif = ipmr_find_vif(skb->dev);
+		vif = ipmr_find_vif(skb->dev, ipmr_get_skb_nbma(skb));
 		if (vif >= 0) {
 			int err = ipmr_cache_unresolved(net, vif, skb);
 			read_unlock(&mrt_lock);
@@ -1663,7 +1724,7 @@  int ipmr_get_route(struct net *net,
 		}
 
 		dev = skb->dev;
-		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
+		if (dev == NULL || (vif = ipmr_find_vif(dev, INADDR_ANY)) < 0) {
 			read_unlock(&mrt_lock);
 			return -ENODEV;
 		}