diff mbox series

[net-next,v3,2/3] udp: elide zerocopy operation in hot path

Message ID 20181129202620.233237-3-willemdebruijn.kernel@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series udp msg_zerocopy | expand

Commit Message

Willem de Bruijn Nov. 29, 2018, 8:26 p.m. UTC
From: Willem de Bruijn <willemb@google.com>

With MSG_ZEROCOPY, each skb holds a reference to a struct ubuf_info.
Release of its last reference triggers a completion notification.

The TCP stack in tcp_sendmsg_locked holds an extra ref independent of
the skbs, because it can build, send and free skbs within its loop,
possibly reaching refcount zero and freeing the ubuf_info too soon.

The UDP stack currently also takes this extra ref, but does not need
it as all skbs are sent after return from __ip(6)_append_data.

Avoid the extra refcount_inc and refcount_dec_and_test, and generally
the sock_zerocopy_put in the common path, by passing the initial
reference to the first skb.

This approach is taken instead of initializing the refcount to 0, as
that would generate error "refcount_t: increment on 0" on the
next skb_zcopy_set.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/skbuff.h | 12 ++++++++----
 net/core/skbuff.c      |  9 +++++----
 net/ipv4/ip_output.c   | 10 +++++-----
 net/ipv4/tcp.c         |  2 +-
 net/ipv6/ip6_output.c  | 10 +++++-----
 5 files changed, 24 insertions(+), 19 deletions(-)

Comments

Willem de Bruijn Nov. 29, 2018, 8:37 p.m. UTC | #1
On Thu, Nov 29, 2018 at 3:26 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> From: Willem de Bruijn <willemb@google.com>
>
> With MSG_ZEROCOPY, each skb holds a reference to a struct ubuf_info.
> Release of its last reference triggers a completion notification.
>
> The TCP stack in tcp_sendmsg_locked holds an extra ref independent of
> the skbs, because it can build, send and free skbs within its loop,
> possibly reaching refcount zero and freeing the ubuf_info too soon.
>
> The UDP stack currently also takes this extra ref, but does not need
> it as all skbs are sent after return from __ip(6)_append_data.
>
> Avoid the extra refcount_inc and refcount_dec_and_test, and generally
> the sock_zerocopy_put in the common path, by passing the initial
> reference to the first skb.
>
> This approach is taken instead of initializing the refcount to 0, as
> that would generate error "refcount_t: increment on 0" on the
> next skb_zcopy_set.
>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---

> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 9602746d7175..08ff04d12642 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -881,8 +881,8 @@ static int __ip_append_data(struct sock *sk,
>         int csummode = CHECKSUM_NONE;
>         struct rtable *rt = (struct rtable *)cork->dst;
>         unsigned int wmem_alloc_delta = 0;
> +       bool paged, extra_uref;
>         u32 tskey = 0;
> -       bool paged;
>
>         skb = skb_peek_tail(queue);
>
> @@ -921,12 +921,13 @@ static int __ip_append_data(struct sock *sk,
>                 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
>                 if (!uarg)
>                         return -ENOBUFS;
> +               extra_uref = true;
>                 if (rt->dst.dev->features & NETIF_F_SG &&
>                     csummode == CHECKSUM_PARTIAL) {
>                         paged = true;
>                 } else {
>                         uarg->zerocopy = 0;
> -                       skb_zcopy_set(skb, uarg);
> +                       skb_zcopy_set(skb, uarg, &extra_uref);
>                 }
>         }
>
> @@ -1019,7 +1020,7 @@ static int __ip_append_data(struct sock *sk,
>                         cork->tx_flags = 0;
>                         skb_shinfo(skb)->tskey = tskey;
>                         tskey = 0;
> -                       skb_zcopy_set(skb, uarg);
> +                       skb_zcopy_set(skb, uarg, &extra_uref);
>
>                         /*
>                          *      Find where to start putting bytes.
> @@ -1123,13 +1124,12 @@ static int __ip_append_data(struct sock *sk,
>
>         if (wmem_alloc_delta)
>                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
> -       sock_zerocopy_put(uarg);
>         return 0;
>
>  error_efault:
>         err = -EFAULT;
>  error:
> -       sock_zerocopy_put_abort(uarg);
> +       sock_zerocopy_put_abort(uarg, extra_uref);

I'll need another revision. Sorry for the spam.

In the draft patch I suggested that the skb_zcopy_set call needs to be
moved below getfrag, so that the uarg is not freed on the only
error path that calls kfree_skb inside the main loop.

This is still needed. Otherwise, sock_zerocopy_put_abort here is
reached with a non-NULL but already-freed uarg.

Will send a v4. Will let this sit for at least a day in case anyone
else has comments.
diff mbox series

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 04f52e719571..75d50ab7997c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -481,7 +481,7 @@  static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 }
 
 void sock_zerocopy_put(struct ubuf_info *uarg);
-void sock_zerocopy_put_abort(struct ubuf_info *uarg);
+void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
 
@@ -1326,10 +1326,14 @@  static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
-static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
+				 bool *have_ref)
 {
 	if (skb && uarg && !skb_zcopy(skb)) {
-		sock_zerocopy_get(uarg);
+		if (unlikely(have_ref && *have_ref))
+			*have_ref = false;
+		else
+			sock_zerocopy_get(uarg);
 		skb_shinfo(skb)->destructor_arg = uarg;
 		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
 	}
@@ -1374,7 +1378,7 @@  static inline void skb_zcopy_abort(struct sk_buff *skb)
 	struct ubuf_info *uarg = skb_zcopy(skb);
 
 	if (uarg) {
-		sock_zerocopy_put_abort(uarg);
+		sock_zerocopy_put_abort(uarg, false);
 		skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
 	}
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 435bac91d293..7cfc2144228a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1089,7 +1089,7 @@  void sock_zerocopy_put(struct ubuf_info *uarg)
 }
 EXPORT_SYMBOL_GPL(sock_zerocopy_put);
 
-void sock_zerocopy_put_abort(struct ubuf_info *uarg)
+void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
 	if (uarg) {
 		struct sock *sk = skb_from_uarg(uarg)->sk;
@@ -1097,7 +1097,8 @@  void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		atomic_dec(&sk->sk_zckey);
 		uarg->len--;
 
-		sock_zerocopy_put(uarg);
+		if (have_uref)
+			sock_zerocopy_put(uarg);
 	}
 }
 EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
@@ -1137,7 +1138,7 @@  int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 		return err;
 	}
 
-	skb_zcopy_set(skb, uarg);
+	skb_zcopy_set(skb, uarg, NULL);
 	return skb->len - orig_len;
 }
 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
@@ -1157,7 +1158,7 @@  static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
 			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
 				return -EIO;
 		}
-		skb_zcopy_set(nskb, skb_uarg(orig));
+		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
 	}
 	return 0;
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9602746d7175..08ff04d12642 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -881,8 +881,8 @@  static int __ip_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
+	bool paged, extra_uref;
 	u32 tskey = 0;
-	bool paged;
 
 	skb = skb_peek_tail(queue);
 
@@ -921,12 +921,13 @@  static int __ip_append_data(struct sock *sk,
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
 		if (!uarg)
 			return -ENOBUFS;
+		extra_uref = true;
 		if (rt->dst.dev->features & NETIF_F_SG &&
 		    csummode == CHECKSUM_PARTIAL) {
 			paged = true;
 		} else {
 			uarg->zerocopy = 0;
-			skb_zcopy_set(skb, uarg);
+			skb_zcopy_set(skb, uarg, &extra_uref);
 		}
 	}
 
@@ -1019,7 +1020,7 @@  static int __ip_append_data(struct sock *sk,
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
 			tskey = 0;
-			skb_zcopy_set(skb, uarg);
+			skb_zcopy_set(skb, uarg, &extra_uref);
 
 			/*
 			 *	Find where to start putting bytes.
@@ -1123,13 +1124,12 @@  static int __ip_append_data(struct sock *sk,
 
 	if (wmem_alloc_delta)
 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
-	sock_zerocopy_put(uarg);
 	return 0;
 
 error_efault:
 	err = -EFAULT;
 error:
-	sock_zerocopy_put_abort(uarg);
+	sock_zerocopy_put_abort(uarg, extra_uref);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 252048776dbb..444cdbff0638 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1423,7 +1423,7 @@  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	if (copied + copied_syn)
 		goto out;
 out_err:
-	sock_zerocopy_put_abort(uarg);
+	sock_zerocopy_put_abort(uarg, true);
 	err = sk_stream_error(sk, flags, err);
 	/* make sure we wake any epoll edge trigger waiter */
 	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 56f09c4c40dc..1cc9ae923093 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1258,7 +1258,7 @@  static int __ip6_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	unsigned int maxnonfragsize, headersize;
 	unsigned int wmem_alloc_delta = 0;
-	bool paged;
+	bool paged, extra_uref;
 
 	skb = skb_peek_tail(queue);
 	if (!skb) {
@@ -1327,12 +1327,13 @@  static int __ip6_append_data(struct sock *sk,
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
 		if (!uarg)
 			return -ENOBUFS;
+		extra_uref = true;
 		if (rt->dst.dev->features & NETIF_F_SG &&
 		    csummode == CHECKSUM_PARTIAL) {
 			paged = true;
 		} else {
 			uarg->zerocopy = 0;
-			skb_zcopy_set(skb, uarg);
+			skb_zcopy_set(skb, uarg, &extra_uref);
 		}
 	}
 
@@ -1458,7 +1459,7 @@  static int __ip6_append_data(struct sock *sk,
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
 			tskey = 0;
-			skb_zcopy_set(skb, uarg);
+			skb_zcopy_set(skb, uarg, &extra_uref);
 
 			/*
 			 *	Find where to start putting bytes
@@ -1561,13 +1562,12 @@  static int __ip6_append_data(struct sock *sk,
 
 	if (wmem_alloc_delta)
 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
-	sock_zerocopy_put(uarg);
 	return 0;
 
 error_efault:
 	err = -EFAULT;
 error:
-	sock_zerocopy_put_abort(uarg);
+	sock_zerocopy_put_abort(uarg, extra_uref);
 	cork->length -= length;
 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);