Patchwork IP: Increment INADDRERRORS if routing for a packet is not successful

login
register
mail settings
Submitter Eric Dumazet
Date June 2, 2010, 7:25 p.m.
Message ID <1275506732.2519.23.camel@edumazet-laptop>
Download mbox | patch
Permalink /patch/54421/
State RFC
Delegated to: David Miller
Headers show

Comments

Eric Dumazet - June 2, 2010, 7:25 p.m.
Le mercredi 02 juin 2010 à 13:59 -0500, Christoph Lameter a écrit :

> The rp_filter is rejecting traffic coming into a NIC for which the kernel
> has a multicast join list that indicates that this traffic is expected on
> this NIC. You could consult the MC subscription list to verify that the
> traffic is coming into the right NIC.
> 
> In the MC case the user can explicitly specify through which NIC the
> traffic is expected. See IP_ADD_MEMBERSHIP.

This has litle to do with MC. We certainly are not going to check MC
membership in fib_validate_source() !

Say we have eth0 on 192.168.0.1/24 and eth1 on 192.168.0.2/24

Then we cannot use rp_filter = 1, even with unicast trafic.

I really dont understand why you would setup rp_filter in such a
situation. This wont work.

Now, I agree we should have a counter somewhere to help admins to
understand their error ;)

Here is patch I am currently testing.

I finaly created a new counter, because its a linux specific check.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter - June 2, 2010, 8:11 p.m.
On Wed, 2 Jun 2010, Eric Dumazet wrote:

>
> Say we have eth0 on 192.168.0.1/24 and eth1 on 192.168.0.2/24
>
> Then we cannot use rp_filter = 1, even with unicast trafic.
>
> I really dont understand why you would setup rp_filter in such a
> situation. This wont work.

rp_filter was setup in the past and it worked. Stephen fixed it in 2.6.31
for multicast and thus suddenly multicast stopped working on secondary
interfaces when we moved to 2.6.32. rp_filter having to be off is okay but
it does not feel correct.

> Now, I agree we should have a counter somewhere to help admins to
> understand their error ;)

Ah. Good.

> Here is patch I am currently testing.
>
> I finaly created a new counter, because its a linux specific check.

Looks good which does not say too much given my limited networking
knowledge.

Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 5279771..ebb0c80 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -229,6 +229,7 @@  enum
 	LINUX_MIB_TCPBACKLOGDROP,
 	LINUX_MIB_TCPMINTTLDROP, /* RFC 5082 */
 	LINUX_MIB_TCPDEFERACCEPTDROP,
+	LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4f0ed45..e830f7a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -284,7 +284,7 @@  int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 	if (no_addr)
 		goto last_resort;
 	if (rpf == 1)
-		goto e_inval;
+		goto e_rpf;
 	fl.oif = dev->ifindex;
 
 	ret = 0;
@@ -299,7 +299,7 @@  int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 
 last_resort:
 	if (rpf)
-		goto e_inval;
+		goto e_rpf;
 	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 	*itag = 0;
 	return 0;
@@ -308,6 +308,8 @@  e_inval_res:
 	fib_res_put(&res);
 e_inval:
 	return -EINVAL;
+e_rpf:
+	return -EXDEV;
 }
 
 static inline __be32 sk_extract_addr(struct sockaddr *addr)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d930dc5..d52c9da 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,6 +340,9 @@  static int ip_rcv_finish(struct sk_buff *skb)
 			else if (err == -ENETUNREACH)
 				IP_INC_STATS_BH(dev_net(skb->dev),
 						IPSTATS_MIB_INNOROUTES);
+			else if (err == -EXDEV)
+				NET_INC_STATS_BH(dev_net(skb->dev),
+						 LINUX_MIB_IPRPFILTER);
 			goto drop;
 		}
 	}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3dc9914..e320ca6 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -252,6 +252,7 @@  static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
 	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8495bce..3a264f7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1851,6 +1851,7 @@  static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	__be32 spec_dst;
 	struct in_device *in_dev = in_dev_get(dev);
 	u32 itag = 0;
+	int err;
 
 	/* Primary sanity checks. */
 
@@ -1865,10 +1866,12 @@  static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
-	} else if (fib_validate_source(saddr, 0, tos, 0,
-					dev, &spec_dst, &itag, 0) < 0)
-		goto e_inval;
-
+	} else {
+		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+					  &itag, 0);
+		if (err < 0)
+			goto e_err;
+	}
 	rth = dst_alloc(&ipv4_dst_ops);
 	if (!rth)
 		goto e_nobufs;
@@ -1922,6 +1925,9 @@  e_nobufs:
 e_inval:
 	in_dev_put(in_dev);
 	return -EINVAL;
+e_err:
+	in_dev_put(in_dev);
+	return err;
 }
 
 
@@ -1985,7 +1991,6 @@  static int __mkroute_input(struct sk_buff *skb,
 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 					 saddr);
 
-		err = -EINVAL;
 		goto cleanup;
 	}
 
@@ -2191,7 +2196,7 @@  brd_input:
 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
 					  &itag, skb->mark);
 		if (err < 0)
-			goto martian_source;
+			goto martian_source_keep_err;
 		if (err)
 			flags |= RTCF_DIRECTSRC;
 	}
@@ -2272,8 +2277,10 @@  e_nobufs:
 	goto done;
 
 martian_source:
+	err = -EINVAL;
+martian_source_keep_err:
 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
-	goto e_inval;
+	goto done;
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,