diff mbox

[net,v3] ipv4: try to cache dst_entries which would cause a redirect

Message ID 996db187263dc0419ea3ab4d6e3fad4c0e0e5b44.1422010254.git.hannes@stressinduktion.org
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Hannes Frederic Sowa Jan. 23, 2015, 11:01 a.m. UTC
Not caching dst_entries which cause redirects could be exploited by hosts
on the same subnet, causing a severe DoS attack. This effect has been
aggravated since commit f88649721268999 ("ipv4: fix dst race in sk_dst_get()").

Lookups causing redirects will be allocated with DST_NOCACHE set, which
will force dst_release to free them via RCU.  Unfortunately, waiting for
an RCU grace period just takes too long: we can end up with >1M dst_entries
waiting to be released and the system will run OOM. rcuos threads cannot
catch up under high softirq load.

Attaching the flag to emit a redirect later on to the specific skb allows
us to cache those dst_entries, thus reducing the pressure on allocation
and deallocation.

This issue was discovered by Marcelo Leitner.

Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Marcelo Leitner <mleitner@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
v2:
Julian noticed that v1 omitted the redirect flag in rtnetlink
queries; fixed. Thanks!

v3:
Julian noticed that we could accidentally write into NEIGHCB, fixed
and thanks again!

 include/net/ip.h      | 11 ++++++-----
 net/ipv4/ip_forward.c |  3 ++-
 net/ipv4/route.c      |  9 +++++----
 3 files changed, 13 insertions(+), 10 deletions(-)

Comments

Julian Anastasov Jan. 23, 2015, 8:09 p.m. UTC | #1
Hello,

On Fri, 23 Jan 2015, Hannes Frederic Sowa wrote:

> Not caching dst_entries which cause redirects could be exploited by hosts
> on the same subnet, causing a severe DoS attack. This effect aggravated
> since commit f88649721268999 ("ipv4: fix dst race in sk_dst_get()").
> 
> Lookups causing redirects will be allocated with DST_NOCACHE set which
> will force dst_release to free them via RCU.  Unfortunately waiting for
> RCU grace period just takes too long, we can end up with >1M dst_entries
> waiting to be released and the system will run OOM. rcuos threads cannot
> catch up under high softirq load.
> 
> Attaching the flag to emit a redirect later on to the specific skb allows
> us to cache those dst_entries thus reducing the pressure on allocation
> and deallocation.
> 
> This issue was discovered by Marcelo Leitner.
> 
> Cc: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Marcelo Leitner <mleitner@redhat.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

	Thanks! Looks fine.

Signed-off-by: Julian Anastasov <ja@ssi.bg>

> ---
> v2:
> Julian noticed that v1 did omit the redirect flag in rtnetlink
> queries, fixed. Thanks!
> 
> v3:
> Julian noticed that we could accidentally write into NEIGHCB, fixed
> and thanks again!
> 
>  include/net/ip.h      | 11 ++++++-----
>  net/ipv4/ip_forward.c |  3 ++-
>  net/ipv4/route.c      |  9 +++++----
>  3 files changed, 13 insertions(+), 10 deletions(-)
> 
> diff --git a/include/net/ip.h b/include/net/ip.h
> index 0e5a0ba..14211ea 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -39,11 +39,12 @@ struct inet_skb_parm {
>  	struct ip_options	opt;		/* Compiled IP options		*/
>  	unsigned char		flags;
>  
> -#define IPSKB_FORWARDED		1
> -#define IPSKB_XFRM_TUNNEL_SIZE	2
> -#define IPSKB_XFRM_TRANSFORMED	4
> -#define IPSKB_FRAG_COMPLETE	8
> -#define IPSKB_REROUTED		16
> +#define IPSKB_FORWARDED		BIT(0)
> +#define IPSKB_XFRM_TUNNEL_SIZE	BIT(1)
> +#define IPSKB_XFRM_TRANSFORMED	BIT(2)
> +#define IPSKB_FRAG_COMPLETE	BIT(3)
> +#define IPSKB_REROUTED		BIT(4)
> +#define IPSKB_DOREDIRECT	BIT(5)
>  
>  	u16			frag_max_size;
>  };
> diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
> index 3a83ce5..787b3c2 100644
> --- a/net/ipv4/ip_forward.c
> +++ b/net/ipv4/ip_forward.c
> @@ -129,7 +129,8 @@ int ip_forward(struct sk_buff *skb)
>  	 *	We now generate an ICMP HOST REDIRECT giving the route
>  	 *	we calculated.
>  	 */
> -	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
> +	if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
> +	    !skb_sec_path(skb))
>  		ip_rt_send_redirect(skb);
>  
>  	skb->priority = rt_tos2priority(iph->tos);
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 2000110..0c63b2a 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1567,11 +1567,10 @@ static int __mkroute_input(struct sk_buff *skb,
>  
>  	do_cache = res->fi && !itag;
>  	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
> +	    skb->protocol == htons(ETH_P_IP) &&
>  	    (IN_DEV_SHARED_MEDIA(out_dev) ||
> -	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
> -		flags |= RTCF_DOREDIRECT;
> -		do_cache = false;
> -	}
> +	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
> +		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
>  
>  	if (skb->protocol != htons(ETH_P_IP)) {
>  		/* Not IP (i.e. ARP). Do not create route, if it is
> @@ -2316,6 +2315,8 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
>  	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
>  	if (rt->rt_flags & RTCF_NOTIFY)
>  		r->rtm_flags |= RTM_F_NOTIFY;
> +	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
> +		r->rtm_flags |= RTCF_DOREDIRECT;
>  
>  	if (nla_put_be32(skb, RTA_DST, dst))
>  		goto nla_put_failure;
> -- 
> 2.1.0

Regards

--
Julian Anastasov <ja@ssi.bg>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

David Miller Jan. 27, 2015, 1:29 a.m. UTC | #2
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Fri, 23 Jan 2015 12:01:26 +0100

> Not caching dst_entries which cause redirects could be exploited by hosts
> on the same subnet, causing a severe DoS attack. This effect aggravated
> since commit f88649721268999 ("ipv4: fix dst race in sk_dst_get()").
> 
> Lookups causing redirects will be allocated with DST_NOCACHE set which
> will force dst_release to free them via RCU.  Unfortunately waiting for
> RCU grace period just takes too long, we can end up with >1M dst_entries
> waiting to be released and the system will run OOM. rcuos threads cannot
> catch up under high softirq load.
> 
> Attaching the flag to emit a redirect later on to the specific skb allows
> us to cache those dst_entries thus reducing the pressure on allocation
> and deallocation.
> 
> This issue was discovered by Marcelo Leitner.
> 
> Cc: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Marcelo Leitner <mleitner@redhat.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

Applied and queued up for -stable, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ip.h b/include/net/ip.h
index 0e5a0ba..14211ea 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -39,11 +39,12 @@  struct inet_skb_parm {
 	struct ip_options	opt;		/* Compiled IP options		*/
 	unsigned char		flags;
 
-#define IPSKB_FORWARDED		1
-#define IPSKB_XFRM_TUNNEL_SIZE	2
-#define IPSKB_XFRM_TRANSFORMED	4
-#define IPSKB_FRAG_COMPLETE	8
-#define IPSKB_REROUTED		16
+#define IPSKB_FORWARDED		BIT(0)
+#define IPSKB_XFRM_TUNNEL_SIZE	BIT(1)
+#define IPSKB_XFRM_TRANSFORMED	BIT(2)
+#define IPSKB_FRAG_COMPLETE	BIT(3)
+#define IPSKB_REROUTED		BIT(4)
+#define IPSKB_DOREDIRECT	BIT(5)
 
 	u16			frag_max_size;
 };
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 3a83ce5..787b3c2 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -129,7 +129,8 @@  int ip_forward(struct sk_buff *skb)
 	 *	We now generate an ICMP HOST REDIRECT giving the route
 	 *	we calculated.
 	 */
-	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
+	if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
+	    !skb_sec_path(skb))
 		ip_rt_send_redirect(skb);
 
 	skb->priority = rt_tos2priority(iph->tos);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2000110..0c63b2a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1567,11 +1567,10 @@  static int __mkroute_input(struct sk_buff *skb,
 
 	do_cache = res->fi && !itag;
 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
+	    skb->protocol == htons(ETH_P_IP) &&
 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
-	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
-		flags |= RTCF_DOREDIRECT;
-		do_cache = false;
-	}
+	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
 
 	if (skb->protocol != htons(ETH_P_IP)) {
 		/* Not IP (i.e. ARP). Do not create route, if it is
@@ -2316,6 +2315,8 @@  static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
 	if (rt->rt_flags & RTCF_NOTIFY)
 		r->rtm_flags |= RTM_F_NOTIFY;
+	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
+		r->rtm_flags |= RTCF_DOREDIRECT;
 
 	if (nla_put_be32(skb, RTA_DST, dst))
 		goto nla_put_failure;