Message ID | 20141113225457.A3E502900805@tardy |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
From: raj@tardy.usa.hp.com (Rick Jones) Date: Thu, 13 Nov 2014 14:54:57 -0800 (PST) > + /* until the v6 path can be better sorted we may still need > + * to kfree_sbk() here but want to avoid a double free from Typo "kfree_skb()". -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2014-11-13 at 14:54 -0800, Rick Jones wrote: > From: Rick Jones <rick.jones2@hp.com> > > If icmp_rcv() has successfully processed the incoming ICMP datagram, we > should use consume_skb() rather than kfree_skb() because a hit on the likes > of perf -e skb:kfree_skb is not called-for. > > Signed-off-by: Rick Jones <rick.jones2@hp.com> > > --- > > A test system hit with a flood ping hits on perf top -e ksb:kfre_skb before > the change and none after for the normal/success path. The IPv6 path would > be somewhat more ugly. For the time being, just deal with the overlap on > ping_rcv() between the two to avoid a possible double free of an skb. > > diff --git a/include/net/ping.h b/include/net/ping.h > index 026479b..f074060 100644 > --- a/include/net/ping.h > +++ b/include/net/ping.h > @@ -82,7 +82,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, > int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, > size_t len); > int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); > -void ping_rcv(struct sk_buff *skb); > +bool ping_rcv(struct sk_buff *skb); > > #ifdef CONFIG_PROC_FS > struct ping_seq_afinfo { > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c > index 36b7bfa..b9f3653 100644 > --- a/net/ipv4/icmp.c > +++ b/net/ipv4/icmp.c > @@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert); > */ > > struct icmp_control { > - void (*handler)(struct sk_buff *skb); > + bool (*handler)(struct sk_buff *skb); > short error; /* This ICMP is classed as an error message */ > }; > > @@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto) > * ICMP_PARAMETERPROB. > */ > > -static void icmp_unreach(struct sk_buff *skb) > +static bool icmp_unreach(struct sk_buff *skb) > { > const struct iphdr *iph; > struct icmphdr *icmph; > @@ -839,10 +839,11 @@ static void icmp_unreach(struct sk_buff *skb) > icmp_socket_deliver(skb, info); > > out: > - return; > + return true; > out_err: > ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); > - goto out; > + kfree_skb(skb); > + return false; > } > > > @@ -850,17 +851,22 @@ out_err: > * Handle ICMP_REDIRECT. > */ > > -static void icmp_redirect(struct sk_buff *skb) > +static bool icmp_redirect(struct sk_buff *skb) > { > if (skb->len < sizeof(struct iphdr)) { > ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); > - return; > + kfree_skb(skb); > + return false; > } > > - if (!pskb_may_pull(skb, sizeof(struct iphdr))) > - return; > + if (!pskb_may_pull(skb, sizeof(struct iphdr))) { > + /* there aught to be a stat */ > + kfree_skb(skb); > + return false; > + } > > icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); > + return true; > } > > /* > @@ -875,7 +881,7 @@ static void icmp_redirect(struct sk_buff *skb) > * See also WRT handling of options once they are done and working. > */ > > -static void icmp_echo(struct sk_buff *skb) > +static bool icmp_echo(struct sk_buff *skb) > { > struct net *net; > > @@ -891,6 +897,8 @@ static void icmp_echo(struct sk_buff *skb) > icmp_param.head_len = sizeof(struct icmphdr); > icmp_reply(&icmp_param, skb); > } > + /* should there be an ICMP stat for ignored echos? */ > + return true; > } > > /* > @@ -900,7 +908,7 @@ static void icmp_echo(struct sk_buff *skb) > * MUST be accurate to a few minutes. > * MUST be updated at least at 15Hz. > */ > -static void icmp_timestamp(struct sk_buff *skb) > +static bool icmp_timestamp(struct sk_buff *skb) > { > struct timespec tv; > struct icmp_bxm icmp_param; > @@ -927,15 +935,18 @@ static void icmp_timestamp(struct sk_buff *skb) > icmp_param.data_len = 0; > icmp_param.head_len = sizeof(struct icmphdr) + 12; > icmp_reply(&icmp_param, skb); > -out: > - return; > + return true; > + > out_err: > ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); > - goto out; > + kfree_skb(skb); > + return false; > } > > -static void icmp_discard(struct sk_buff *skb) > +static bool icmp_discard(struct sk_buff *skb) > { > + /* pretend it was a success */ > + return true; > } > > /* > @@ -946,6 +957,7 @@ int icmp_rcv(struct sk_buff *skb) > struct icmphdr *icmph; > struct rtable *rt = skb_rtable(skb); > struct net *net = dev_net(rt->dst.dev); > + bool success; > > if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { > struct sec_path *sp = skb_sec_path(skb); > @@ -1012,7 +1024,12 @@ int icmp_rcv(struct sk_buff *skb) > } > } > > - icmp_pointers[icmph->type].handler(skb); > + success = icmp_pointers[icmph->type].handler(skb); > + > + if (success) > + consume_skb(skb); > + > + return 0; This looks quite complicated to me. Why are you adding kfree_skb() everywhere instead of : bool to_consume = icmp_pointers[icmph->type].handler(skb); if (ro_consume) consume_skb(skb); else kfree_skb(skb); > > drop: > kfree_skb(skb); -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > This looks quite complicated to me. > > Why are you adding kfree_skb() everywhere instead of : > > bool to_consume = icmp_pointers[icmph->type].handler(skb); > if (ro_consume) > consume_skb(skb); > else > kfree_skb(skb); I thought the point of the drop profiling was to show where the drops were happening. Leaving the kfree_skb() up in icmp_rcv() does not improve showing where the drops happened. That is why I've pushed it down into the routines called by icmp_rcv(). rick -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2014-11-14 at 08:16 -0800, Rick Jones wrote: > I thought the point of the drop profiling was to show where the drops > were happening. Leaving the kfree_skb() up in icmp_rcv() does not > improve showing where the drops happened. That is why I've pushed it > down into the routines called by icmp_rcv(). OK, but we drop an icmp message, and that really should be enough. The point is that most normal icmp messages wont be dropped, but consumed. I am not sure we want to bloat the kernel for such minor problem. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/14/2014 08:58 AM, Eric Dumazet wrote: > On Fri, 2014-11-14 at 08:16 -0800, Rick Jones wrote: > >> I thought the point of the drop profiling was to show where the drops >> were happening. Leaving the kfree_skb() up in icmp_rcv() does not >> improve showing where the drops happened. That is why I've pushed it >> down into the routines called by icmp_rcv(). > > OK, but we drop an icmp message, and that really should be enough. > > The point is that most normal icmp messages wont be dropped, but > consumed. > > I am not sure we want to bloat the kernel for such minor problem. I can certainly rework the patch that way, but one thing I have noticed when the system is the initiator of pings rather than the target, is that I get (or at least I think I do) drop profile hits in ping_rcv(): # To display the perf.data header info, please use --header/--header-only options. # # Samples: 997K of event 'skb:kfree_skb' # Event count (approx.): 997789 # # Children Self Symbol Shared Object # ........ ........ .......................................... ........................... # 100.00% 100.00% [k] kfree_skb [kernel.kallsyms] | |--100.00%-- ping_rcv | icmp_rcv | ip_local_deliver_finish | ip_local_deliver | ip_rcv_finish | ip_rcv | __netif_receive_skb_core | __netif_receive_skb | netif_receive_skb_internal | napi_gro_receive | e1000_receive_skb | e1000_clean_rx_irq | e1000e_poll | net_rx_action | __do_softirq I don't have an explanation for it though. Perhaps it is just confusion on my part. That is from: raj@raj-8510w:~$ sudo ~/net-next/tools/perf/perf record -a -g -e skb:kfree_skb ping -n -f -q tardy.usa.hp.com -c 1000000 PING tardy.usa.hp.com (16.103.148.51) 56(84) bytes of data. --- tardy.usa.hp.com ping statistics --- 1000000 packets transmitted, 1000000 received, 0% packet loss, time 65751ms rtt min/avg/max/mdev = 0.036/0.053/0.170/0.008 ms, ipg/ewma 0.065/0.052 ms [ perf record: Woken up 1037 times to write data ] [ perf record: Captured and wrote 259.456 MB perf.data (~11335798 samples) ] Warning: Processed 1002854 events and lost 2 chunks! Check IO/CPU overload! -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/net/ping.h b/include/net/ping.h index 026479b..f074060 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -82,7 +82,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -void ping_rcv(struct sk_buff *skb); +bool ping_rcv(struct sk_buff *skb); #ifdef CONFIG_PROC_FS struct ping_seq_afinfo { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36b7bfa..b9f3653 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert); */ struct icmp_control { - void (*handler)(struct sk_buff *skb); + bool (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; @@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto) * ICMP_PARAMETERPROB. */ -static void icmp_unreach(struct sk_buff *skb) +static bool icmp_unreach(struct sk_buff *skb) { const struct iphdr *iph; struct icmphdr *icmph; @@ -839,10 +839,11 @@ static void icmp_unreach(struct sk_buff *skb) icmp_socket_deliver(skb, info); out: - return; + return true; out_err: ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); - goto out; + kfree_skb(skb); + return false; } @@ -850,17 +851,22 @@ out_err: * Handle ICMP_REDIRECT. */ -static void icmp_redirect(struct sk_buff *skb) +static bool icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); - return; + kfree_skb(skb); + return false; } - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) { + /* there aught to be a stat */ + kfree_skb(skb); + return false; + } icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); + return true; } /* @@ -875,7 +881,7 @@ static void icmp_redirect(struct sk_buff *skb) * See also WRT handling of options once they are done and working. */ -static void icmp_echo(struct sk_buff *skb) +static bool icmp_echo(struct sk_buff *skb) { struct net *net; @@ -891,6 +897,8 @@ static void icmp_echo(struct sk_buff *skb) icmp_param.head_len = sizeof(struct icmphdr); icmp_reply(&icmp_param, skb); } + /* should there be an ICMP stat for ignored echos? */ + return true; } /* @@ -900,7 +908,7 @@ static void icmp_echo(struct sk_buff *skb) * MUST be accurate to a few minutes. * MUST be updated at least at 15Hz. */ -static void icmp_timestamp(struct sk_buff *skb) +static bool icmp_timestamp(struct sk_buff *skb) { struct timespec tv; struct icmp_bxm icmp_param; @@ -927,15 +935,18 @@ static void icmp_timestamp(struct sk_buff *skb) icmp_param.data_len = 0; icmp_param.head_len = sizeof(struct icmphdr) + 12; icmp_reply(&icmp_param, skb); -out: - return; + return true; + out_err: ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); - goto out; + kfree_skb(skb); + return false; } -static void icmp_discard(struct sk_buff *skb) +static bool icmp_discard(struct sk_buff *skb) { + /* pretend it was a success */ + return true; } /* @@ -946,6 +957,7 @@ int icmp_rcv(struct sk_buff *skb) struct icmphdr *icmph; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + bool success; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); @@ -1012,7 +1024,12 @@ int icmp_rcv(struct sk_buff *skb) } } - icmp_pointers[icmph->type].handler(skb); + success = icmp_pointers[icmph->type].handler(skb); + + if (success) + consume_skb(skb); + + return 0; drop: kfree_skb(skb); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 736236c..7d54eed 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -955,7 +955,7 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); * All we need to do is get the socket. */ -void ping_rcv(struct sk_buff *skb) +bool ping_rcv(struct sk_buff *skb) { struct sock *sk; struct net *net = dev_net(skb->dev); @@ -974,11 +974,13 @@ void ping_rcv(struct sk_buff *skb) pr_debug("rcv on socket %p\n", sk); ping_queue_rcv_skb(sk, skb_get(skb)); sock_put(sk); - return; + return true; } pr_debug("no socket, dropping\n"); - /* We're called from icmp_rcv(). kfree_skb() is done there. */ + /* Do the kfree_skb() here to get a better drop profile */ + kfree_skb(skb); + return false; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 0929340..6c0f805 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -679,6 +679,7 @@ static int icmpv6_rcv(struct sk_buff *skb) const struct in6_addr *saddr, *daddr; struct icmp6hdr *hdr; u8 type; + bool success = true; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); @@ -726,7 +727,7 @@ static int icmpv6_rcv(struct sk_buff *skb) break; case ICMPV6_ECHO_REPLY: - ping_rcv(skb); + success = ping_rcv(skb); break; case ICMPV6_PKT_TOOBIG: @@ -790,7 +791,15 @@ static int icmpv6_rcv(struct sk_buff *skb) icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); } - kfree_skb(skb); + /* until the v6 path can be better sorted we may still need + * to kfree_sbk() here but want to avoid a double free from + * the ping_rcv() path, which shares code with IPv4. assume + * success and preserve the status quo behaviour for the rest + * of the paths to here + */ + if (success) + kfree_skb(skb); + return 0; csum_error: