diff mbox

[net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path

Message ID 20141113225457.A3E502900805@tardy
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Rick Jones Nov. 13, 2014, 10:54 p.m. UTC
From: Rick Jones <rick.jones2@hp.com>

If icmp_rcv() has successfully processed the incoming ICMP datagram, we
should use consume_skb() rather than kfree_skb() because a hit on the likes
of perf -e skb:kfree_skb is not called-for.

Signed-off-by: Rick Jones <rick.jones2@hp.com>

---

A test system hit with a flood ping hits on perf top -e ksb:kfre_skb before
the change and none after for the normal/success path.  The IPv6 path would
be somewhat more ugly.  For the time being, just deal with the overlap on
ping_rcv() between the two to avoid a possible double free of an skb.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

David Miller Nov. 14, 2014, 1:32 a.m. UTC | #1
From: raj@tardy.usa.hp.com (Rick Jones)
Date: Thu, 13 Nov 2014 14:54:57 -0800 (PST)

> +	/* until the v6 path can be better sorted we may still need
> +	 * to kfree_sbk() here but want to avoid a double free from

Typo "kfree_skb()".
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Nov. 14, 2014, 2:17 a.m. UTC | #2
On Thu, 2014-11-13 at 14:54 -0800, Rick Jones wrote:
> From: Rick Jones <rick.jones2@hp.com>
> 
> If icmp_rcv() has successfully processed the incoming ICMP datagram, we
> should use consume_skb() rather than kfree_skb() because a hit on the likes
> of perf -e skb:kfree_skb is not called-for.
> 
> Signed-off-by: Rick Jones <rick.jones2@hp.com>
> 
> ---
> 
> A test system hit with a flood ping hits on perf top -e ksb:kfre_skb before
> the change and none after for the normal/success path.  The IPv6 path would
> be somewhat more ugly.  For the time being, just deal with the overlap on
> ping_rcv() between the two to avoid a possible double free of an skb.
> 
> diff --git a/include/net/ping.h b/include/net/ping.h
> index 026479b..f074060 100644
> --- a/include/net/ping.h
> +++ b/include/net/ping.h
> @@ -82,7 +82,7 @@ int  ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
>  int  ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
>  		     size_t len);
>  int  ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
> -void ping_rcv(struct sk_buff *skb);
> +bool ping_rcv(struct sk_buff *skb);
>  
>  #ifdef CONFIG_PROC_FS
>  struct ping_seq_afinfo {
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index 36b7bfa..b9f3653 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert);
>   */
>  
>  struct icmp_control {
> -	void (*handler)(struct sk_buff *skb);
> +	bool (*handler)(struct sk_buff *skb);
>  	short   error;		/* This ICMP is classed as an error message */
>  };
>  
> @@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto)
>   *	ICMP_PARAMETERPROB.
>   */
>  
> -static void icmp_unreach(struct sk_buff *skb)
> +static bool icmp_unreach(struct sk_buff *skb)
>  {
>  	const struct iphdr *iph;
>  	struct icmphdr *icmph;
> @@ -839,10 +839,11 @@ static void icmp_unreach(struct sk_buff *skb)
>  	icmp_socket_deliver(skb, info);
>  
>  out:
> -	return;
> +	return true;
>  out_err:
>  	ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
> -	goto out;
> +	kfree_skb(skb);
> +	return false;
>  }
>  
> 
> @@ -850,17 +851,22 @@ out_err:
>   *	Handle ICMP_REDIRECT.
>   */
>  
> -static void icmp_redirect(struct sk_buff *skb)
> +static bool icmp_redirect(struct sk_buff *skb)
>  {
>  	if (skb->len < sizeof(struct iphdr)) {
>  		ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
> -		return;
> +		kfree_skb(skb);
> +		return false;
>  	}
>  
> -	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
> -		return;
> +	if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
> +		/* there aught to be a stat */
> +		kfree_skb(skb);
> +		return false;
> +	}
>  
>  	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
> +	return true;
>  }
>  
>  /*
> @@ -875,7 +881,7 @@ static void icmp_redirect(struct sk_buff *skb)
>   *	See also WRT handling of options once they are done and working.
>   */
>  
> -static void icmp_echo(struct sk_buff *skb)
> +static bool icmp_echo(struct sk_buff *skb)
>  {
>  	struct net *net;
>  
> @@ -891,6 +897,8 @@ static void icmp_echo(struct sk_buff *skb)
>  		icmp_param.head_len	   = sizeof(struct icmphdr);
>  		icmp_reply(&icmp_param, skb);
>  	}
> +	/* should there be an ICMP stat for ignored echos? */
> +	return true;
>  }
>  
>  /*
> @@ -900,7 +908,7 @@ static void icmp_echo(struct sk_buff *skb)
>   *		  MUST be accurate to a few minutes.
>   *		  MUST be updated at least at 15Hz.
>   */
> -static void icmp_timestamp(struct sk_buff *skb)
> +static bool icmp_timestamp(struct sk_buff *skb)
>  {
>  	struct timespec tv;
>  	struct icmp_bxm icmp_param;
> @@ -927,15 +935,18 @@ static void icmp_timestamp(struct sk_buff *skb)
>  	icmp_param.data_len	   = 0;
>  	icmp_param.head_len	   = sizeof(struct icmphdr) + 12;
>  	icmp_reply(&icmp_param, skb);
> -out:
> -	return;
> +	return true;
> +
>  out_err:
>  	ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
> -	goto out;
> +	kfree_skb(skb);
> +	return false;
>  }
>  
> -static void icmp_discard(struct sk_buff *skb)
> +static bool icmp_discard(struct sk_buff *skb)
>  {
> +	/* pretend it was a success */
> +	return true;
>  }
>  
>  /*
> @@ -946,6 +957,7 @@ int icmp_rcv(struct sk_buff *skb)
>  	struct icmphdr *icmph;
>  	struct rtable *rt = skb_rtable(skb);
>  	struct net *net = dev_net(rt->dst.dev);
> +	bool success;
>  
>  	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
>  		struct sec_path *sp = skb_sec_path(skb);
> @@ -1012,7 +1024,12 @@ int icmp_rcv(struct sk_buff *skb)
>  		}
>  	}
>  
> -	icmp_pointers[icmph->type].handler(skb);
> +	success = icmp_pointers[icmph->type].handler(skb);
> +
> +	if (success) 
> +		consume_skb(skb);
> +
> +	return 0;


This looks quite complicated to me.

Why are you adding kfree_skb() everywhere instead of :

	bool to_consume = icmp_pointers[icmph->type].handler(skb);
	if (ro_consume)
		consume_skb(skb);
	else
		kfree_skb(skb);

>  
>  drop:
>  	kfree_skb(skb);



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones Nov. 14, 2014, 4:16 p.m. UTC | #3
>
> This looks quite complicated to me.
>
> Why are you adding kfree_skb() everywhere instead of :
>
> 	bool to_consume = icmp_pointers[icmph->type].handler(skb);
> 	if (ro_consume)
> 		consume_skb(skb);
> 	else
> 		kfree_skb(skb);

I thought the point of the drop profiling was to show where the drops 
were happening.  Leaving the kfree_skb() up in icmp_rcv() does not 
improve showing where the drops happened.  That is why I've pushed it 
down into the routines called by icmp_rcv().

rick
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Nov. 14, 2014, 4:58 p.m. UTC | #4
On Fri, 2014-11-14 at 08:16 -0800, Rick Jones wrote:

> I thought the point of the drop profiling was to show where the drops 
> were happening.  Leaving the kfree_skb() up in icmp_rcv() does not 
> improve showing where the drops happened.  That is why I've pushed it 
> down into the routines called by icmp_rcv().

OK, but we drop an icmp message, and that really should be enough.

The point is that most normal icmp messages wont be dropped, but
consumed.

I am not sure we want to bloat the kernel for such minor problem.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones Nov. 14, 2014, 6:29 p.m. UTC | #5
On 11/14/2014 08:58 AM, Eric Dumazet wrote:
> On Fri, 2014-11-14 at 08:16 -0800, Rick Jones wrote:
>
>> I thought the point of the drop profiling was to show where the drops
>> were happening.  Leaving the kfree_skb() up in icmp_rcv() does not
>> improve showing where the drops happened.  That is why I've pushed it
>> down into the routines called by icmp_rcv().
>
> OK, but we drop an icmp message, and that really should be enough.
>
> The point is that most normal icmp messages wont be dropped, but
> consumed.
>
> I am not sure we want to bloat the kernel for such minor problem.

I can certainly rework the patch that way, but one thing I have noticed 
when the system is the initiator of pings rather than the target, is 
that I get (or at least I think I do) drop profile hits in ping_rcv():

# To display the perf.data header info, please use 
--header/--header-only options.
#
# Samples: 997K of event 'skb:kfree_skb'
# Event count (approx.): 997789
#
# Children      Self  Symbol                                      Shared 
Object
# ........  ........  .......................................... 
...........................
#
    100.00%   100.00%  [k] kfree_skb 
[kernel.kallsyms]
             |
             |--100.00%-- ping_rcv
             |          icmp_rcv
             |          ip_local_deliver_finish
             |          ip_local_deliver
             |          ip_rcv_finish
             |          ip_rcv
             |          __netif_receive_skb_core
             |          __netif_receive_skb
             |          netif_receive_skb_internal
             |          napi_gro_receive
             |          e1000_receive_skb
             |          e1000_clean_rx_irq
             |          e1000e_poll
             |          net_rx_action
             |          __do_softirq

I don't have an explanation for it though.  Perhaps it is just confusion 
on my part.

That is from:

raj@raj-8510w:~$ sudo ~/net-next/tools/perf/perf record -a -g -e 
skb:kfree_skb ping -n -f -q tardy.usa.hp.com -c 1000000
PING tardy.usa.hp.com (16.103.148.51) 56(84) bytes of data.

--- tardy.usa.hp.com ping statistics ---
1000000 packets transmitted, 1000000 received, 0% packet loss, time 65751ms
rtt min/avg/max/mdev = 0.036/0.053/0.170/0.008 ms, ipg/ewma 0.065/0.052 ms
[ perf record: Woken up 1037 times to write data ]
[ perf record: Captured and wrote 259.456 MB perf.data (~11335798 samples) ]
Warning:
Processed 1002854 events and lost 2 chunks!

Check IO/CPU overload!

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ping.h b/include/net/ping.h
index 026479b..f074060 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -82,7 +82,7 @@  int  ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
 int  ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		     size_t len);
 int  ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-void ping_rcv(struct sk_buff *skb);
+bool ping_rcv(struct sk_buff *skb);
 
 #ifdef CONFIG_PROC_FS
 struct ping_seq_afinfo {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 36b7bfa..b9f3653 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -190,7 +190,7 @@  EXPORT_SYMBOL(icmp_err_convert);
  */
 
 struct icmp_control {
-	void (*handler)(struct sk_buff *skb);
+	bool (*handler)(struct sk_buff *skb);
 	short   error;		/* This ICMP is classed as an error message */
 };
 
@@ -746,7 +746,7 @@  static bool icmp_tag_validation(int proto)
  *	ICMP_PARAMETERPROB.
  */
 
-static void icmp_unreach(struct sk_buff *skb)
+static bool icmp_unreach(struct sk_buff *skb)
 {
 	const struct iphdr *iph;
 	struct icmphdr *icmph;
@@ -839,10 +839,11 @@  static void icmp_unreach(struct sk_buff *skb)
 	icmp_socket_deliver(skb, info);
 
 out:
-	return;
+	return true;
 out_err:
 	ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
-	goto out;
+	kfree_skb(skb);
+	return false;
 }
 
 
@@ -850,17 +851,22 @@  out_err:
  *	Handle ICMP_REDIRECT.
  */
 
-static void icmp_redirect(struct sk_buff *skb)
+static bool icmp_redirect(struct sk_buff *skb)
 {
 	if (skb->len < sizeof(struct iphdr)) {
 		ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
-		return;
+		kfree_skb(skb);
+		return false;
 	}
 
-	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-		return;
+	if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
+		/* there aught to be a stat */
+		kfree_skb(skb);
+		return false;
+	}
 
 	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+	return true;
 }
 
 /*
@@ -875,7 +881,7 @@  static void icmp_redirect(struct sk_buff *skb)
  *	See also WRT handling of options once they are done and working.
  */
 
-static void icmp_echo(struct sk_buff *skb)
+static bool icmp_echo(struct sk_buff *skb)
 {
 	struct net *net;
 
@@ -891,6 +897,8 @@  static void icmp_echo(struct sk_buff *skb)
 		icmp_param.head_len	   = sizeof(struct icmphdr);
 		icmp_reply(&icmp_param, skb);
 	}
+	/* should there be an ICMP stat for ignored echos? */
+	return true;
 }
 
 /*
@@ -900,7 +908,7 @@  static void icmp_echo(struct sk_buff *skb)
  *		  MUST be accurate to a few minutes.
  *		  MUST be updated at least at 15Hz.
  */
-static void icmp_timestamp(struct sk_buff *skb)
+static bool icmp_timestamp(struct sk_buff *skb)
 {
 	struct timespec tv;
 	struct icmp_bxm icmp_param;
@@ -927,15 +935,18 @@  static void icmp_timestamp(struct sk_buff *skb)
 	icmp_param.data_len	   = 0;
 	icmp_param.head_len	   = sizeof(struct icmphdr) + 12;
 	icmp_reply(&icmp_param, skb);
-out:
-	return;
+	return true;
+
 out_err:
 	ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
-	goto out;
+	kfree_skb(skb);
+	return false;
 }
 
-static void icmp_discard(struct sk_buff *skb)
+static bool icmp_discard(struct sk_buff *skb)
 {
+	/* pretend it was a success */
+	return true;
 }
 
 /*
@@ -946,6 +957,7 @@  int icmp_rcv(struct sk_buff *skb)
 	struct icmphdr *icmph;
 	struct rtable *rt = skb_rtable(skb);
 	struct net *net = dev_net(rt->dst.dev);
+	bool success;
 
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 		struct sec_path *sp = skb_sec_path(skb);
@@ -1012,7 +1024,12 @@  int icmp_rcv(struct sk_buff *skb)
 		}
 	}
 
-	icmp_pointers[icmph->type].handler(skb);
+	success = icmp_pointers[icmph->type].handler(skb);
+
+	if (success) 
+		consume_skb(skb);
+
+	return 0;
 
 drop:
 	kfree_skb(skb);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 736236c..7d54eed 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -955,7 +955,7 @@  EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
  *	All we need to do is get the socket.
  */
 
-void ping_rcv(struct sk_buff *skb)
+bool ping_rcv(struct sk_buff *skb)
 {
 	struct sock *sk;
 	struct net *net = dev_net(skb->dev);
@@ -974,11 +974,13 @@  void ping_rcv(struct sk_buff *skb)
 		pr_debug("rcv on socket %p\n", sk);
 		ping_queue_rcv_skb(sk, skb_get(skb));
 		sock_put(sk);
-		return;
+		return true;
 	}
 	pr_debug("no socket, dropping\n");
 
-	/* We're called from icmp_rcv(). kfree_skb() is done there. */
+	/* Do the kfree_skb() here to get a better drop profile */
+	kfree_skb(skb);
+	return false;
 }
 EXPORT_SYMBOL_GPL(ping_rcv);
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 0929340..6c0f805 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -679,6 +679,7 @@  static int icmpv6_rcv(struct sk_buff *skb)
 	const struct in6_addr *saddr, *daddr;
 	struct icmp6hdr *hdr;
 	u8 type;
+	bool success = true;
 
 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 		struct sec_path *sp = skb_sec_path(skb);
@@ -726,7 +727,7 @@  static int icmpv6_rcv(struct sk_buff *skb)
 		break;
 
 	case ICMPV6_ECHO_REPLY:
-		ping_rcv(skb);
+		success = ping_rcv(skb);
 		break;
 
 	case ICMPV6_PKT_TOOBIG:
@@ -790,7 +791,15 @@  static int icmpv6_rcv(struct sk_buff *skb)
 		icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
 	}
 
-	kfree_skb(skb);
+	/* until the v6 path can be better sorted we may still need
+	 * to kfree_sbk() here but want to avoid a double free from
+	 * the ping_rcv() path, which shares code with IPv4. assume
+	 * success and preserve the status quo behaviour for the rest
+	 * of the paths to here
+	 */
+	if (success)
+		kfree_skb(skb);
+
 	return 0;
 
 csum_error: