diff mbox

[RFC,v2,09/12] udp: enable sendmsg zerocopy

Message ID 20170222163901.90834-10-willemdebruijn.kernel@gmail.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Willem de Bruijn Feb. 22, 2017, 4:38 p.m. UTC
From: Willem de Bruijn <willemb@google.com>

Add MSG_ZEROCOPY support to inet/dgram. This includes udplite.

Tested:
  loopback test snd_zerocopy_lo -u -z produces

  without zerocopy (-u):
    rx=173940 (10854 MB) tx=173940 txc=0
    rx=367026 (22904 MB) tx=367026 txc=0
    rx=564078 (35201 MB) tx=564078 txc=0
    rx=756588 (47214 MB) tx=756588 txc=0

  with zerocopy (-u -z):
    rx=377994 (23588 MB) tx=377994 txc=377980
    rx=792654 (49465 MB) tx=792654 txc=792632
    rx=1209582 (75483 MB) tx=1209582 txc=1209552
    rx=1628376 (101618 MB) tx=1628376 txc=1628338

  The loopback test currently fails with corking, because
  CHECKSUM_PARTIAL is disabled with UDP_CORK since commit
  d749c9cbffd6 ("ipv4: no CHECKSUM_PARTIAL on MSG_MORE corked sockets").

  I will suggest allowing it on NETIF_F_LOOPBACK devices.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/skbuff.h |  5 +++++
 net/ipv4/ip_output.c   | 34 +++++++++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 5 deletions(-)
diff mbox

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6ad1724ceb60..9e7386f3f7a8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -424,6 +424,11 @@  struct ubuf_info {
 
 #define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
 
+/* Zerocopy gate: SG device; RAW socket, or DGRAM passing the csummode test. */
+#define sock_can_zerocopy(sk, rt, csummode) \
+	(((rt)->dst.dev->features & NETIF_F_SG) && \
+	 (((sk)->sk_type == SOCK_RAW) || \
+	  ((sk)->sk_type == SOCK_DGRAM && ((csummode) & CHECKSUM_UNNECESSARY))))
 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
 struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 					struct ubuf_info *uarg);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 737ce826d7ec..9e0110d8a429 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -919,7 +919,7 @@  static int __ip_append_data(struct sock *sk,
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sk_buff *skb;
-
+	struct ubuf_info *uarg = NULL;
 	struct ip_options *opt = cork->opt;
 	int hh_len;
 	int exthdrlen;
@@ -963,9 +963,16 @@  static int __ip_append_data(struct sock *sk,
 	    !exthdrlen)
 		csummode = CHECKSUM_PARTIAL;
 
+	if (flags & MSG_ZEROCOPY && length &&
+	    sock_can_zerocopy(sk, rt, skb ? skb->ip_summed : csummode)) {
+		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		if (!uarg)
+			return -ENOBUFS;
+	}
+
 	cork->length += length;
 	if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
-	    (sk->sk_protocol == IPPROTO_UDP) &&
+	    (sk->sk_protocol == IPPROTO_UDP) && !uarg &&
 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
 	    (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
@@ -1017,6 +1024,8 @@  static int __ip_append_data(struct sock *sk,
 			if ((flags & MSG_MORE) &&
 			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
+			else if (uarg)
+				alloclen = min_t(int, fraglen, MAX_HEADER);
 			else
 				alloclen = fraglen;
 
@@ -1059,11 +1068,12 @@  static int __ip_append_data(struct sock *sk,
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
 			tskey = 0;
+			skb_zcopy_set(skb, uarg);
 
 			/*
 			 *	Find where to start putting bytes.
 			 */
-			data = skb_put(skb, fraglen + exthdrlen);
+			data = skb_put(skb, alloclen);
 			skb_set_network_header(skb, exthdrlen);
 			skb->transport_header = (skb->network_header +
 						 fragheaderlen);
@@ -1079,7 +1089,9 @@  static int __ip_append_data(struct sock *sk,
 				pskb_trim_unique(skb_prev, maxfraglen);
 			}
 
-			copy = datalen - transhdrlen - fraggap;
+			copy = min(datalen,
+				   alloclen - exthdrlen - fragheaderlen);
+			copy -= transhdrlen - fraggap;
 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 				err = -EFAULT;
 				kfree_skb(skb);
@@ -1087,7 +1099,7 @@  static int __ip_append_data(struct sock *sk,
 			}
 
 			offset += copy;
-			length -= datalen - fraggap;
+			length -= copy + transhdrlen;
 			transhdrlen = 0;
 			exthdrlen = 0;
 			csummode = CHECKSUM_NONE;
@@ -1115,6 +1127,17 @@  static int __ip_append_data(struct sock *sk,
 				err = -EFAULT;
 				goto error;
 			}
+		} else if (uarg) {
+			struct iov_iter *iter;
+
+			if (sk->sk_type == SOCK_RAW)
+				iter = &((struct msghdr **)from)[0]->msg_iter;
+			else
+				iter = &((struct msghdr *)from)->msg_iter;
+			err = skb_zerocopy_add_frags_iter(sk, skb, iter, copy, uarg);
+			if (err < 0)
+				goto error;
+			copy = err;
 		} else {
 			int i = skb_shinfo(skb)->nr_frags;
 
@@ -1155,6 +1178,7 @@  static int __ip_append_data(struct sock *sk,
 error_efault:
 	err = -EFAULT;
 error:
+	sock_zerocopy_put_abort(uarg);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 	return err;