Message ID | 1375997852.4004.130.camel@edumazet-glaptop |
---|---|
State | Accepted, archived |
Delegated to: | David Miller |
Headers | show |
From: Eric Dumazet <eric.dumazet@gmail.com> Date: Thu, 08 Aug 2013 14:37:32 -0700 > From: Eric Dumazet <edumazet@google.com> > > unix_stream_sendmsg() currently uses order-2 allocations, > and we had numerous reports this can fail. > > The __GFP_REPEAT flag present in sock_alloc_send_pskb() is > not helping. > > This patch extends the work done in commit eb6a24816b247c > ("af_unix: reduce high order page allocations) for > datagram sockets. > > This opens the possibility of zero copy IO (splice() and > friends) > > The trick is to not use skb_pull() anymore in recvmsg() path, > and instead add a @consumed field in UNIXCB() to track amount > of already read payload in the skb. > > There is a performance regression for large sends > because of extra page allocations that will be addressed > in a follow-up patch, allowing sock_alloc_send_pskb() > to attempt high order page allocations. > > Signed-off-by: Eric Dumazet <edumazet@google.com> Applied. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Aug 08, 2013 at 02:37:32PM -0700, Eric Dumazet wrote: > From: Eric Dumazet <edumazet@google.com> > > unix_stream_sendmsg() currently uses order-2 allocations, > and we had numerous reports this can fail. > > The __GFP_REPEAT flag present in sock_alloc_send_pskb() is > not helping. > > This patch extends the work done in commit eb6a24816b247c > ("af_unix: reduce high order page allocations) for > datagram sockets. > > This opens the possibility of zero copy IO (splice() and > friends) > > The trick is to not use skb_pull() anymore in recvmsg() path, > and instead add a @consumed field in UNIXCB() to track amount > of already read payload in the skb. > > There is a performance regression for large sends > because of extra page allocations that will be addressed > in a follow-up patch, allowing sock_alloc_send_pskb() > to attempt high order page allocations. > > Signed-off-by: Eric Dumazet <edumazet@google.com> > Cc: David Rientjes <rientjes@google.com> I am sorry to report that I had to revert both these patches to have a working systemd again. This is the af_unix stream server side: $ strace -e network -p 1 Process 1 attached accept4(9, {sa_family=AF_LOCAL, NULL}, [2], SOCK_CLOEXEC) = 15 getsockname(15, {sa_family=AF_LOCAL, sun_path="/run/systemd/private"}, [23]) = 0 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=0, msg_flags=0}, 0) = 1 getsockopt(15, SOL_SOCKET, SO_PEERCRED, {pid=515, uid=0, gid=0}, [12]) = 0 sendto(15, "OK cfd216826810ecc1c17014b052079"..., 37, MSG_NOSIGNAL, NULL, 0) = 37 sendto(15, "AGREE_UNIX_FD\r\n", 15, MSG_NOSIGNAL, NULL, 0) = 15 getsockopt(15, SOL_SOCKET, SO_PEERSEC, "unconfined_u:unconfined_r:unconf"..., [54]) = 0 socket(PF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0) = 28 connect(28, {sa_family=AF_LOCAL, sun_path="/var/run/setrans/.setrans-unix"}, 110) = -1 ENOENT (No such file or directory) socket(PF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0) = 28 connect(28, {sa_family=AF_LOCAL, sun_path="/var/run/setrans/.setrans-unix"}, 110) = -1 ENOENT (No such file or directory) getsockopt(15, SOL_SOCKET, SO_PEERCRED, {pid=515, uid=0, gid=0}, [12]) = 0 socket(PF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0) = 28 connect(28, {sa_family=AF_LOCAL, sun_path="/var/run/setrans/.setrans-unix"}, 110) = -1 ENOENT (No such file or directory) socket(PF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0) = 28 connect(28, {sa_family=AF_LOCAL, sun_path="/var/run/setrans/.setrans-unix"}, 110) = -1 ENOENT (No such file or directory) sendmsg(15, {msg_name(0)=NULL, msg_iov(2)=[{"l\2\1\0018\233\0\0\1\0\0\0\33\0\0\0\5\1u\0\1\0\0\0\10\1g\0\ra(s"..., 48}, {"0\233\0\0\0\0\0\0\20\0\0\0ebtables.service\0\0\0\0"..., 39736}], msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 39784 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"", 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 0 This the af_unix stream client side: socket(PF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0) = 3 connect(3, {sa_family=AF_LOCAL, sun_path="/run/systemd/private"}, 22) = 0 getsockname(3, {sa_family=AF_LOCAL, NULL}, [2]) = 0 getsockopt(3, SOL_SOCKET, SO_PEERCRED, {pid=1, uid=0, gid=0}, [12]) = 0 sendto(3, "\0", 1, MSG_NOSIGNAL, NULL, 0) = 1 sendto(3, "AUTH EXTERNAL 30\r\n", 18, MSG_NOSIGNAL, NULL, 0) = 18 sendto(3, "NEGOTIATE_UNIX_FD\r\n", 19, MSG_NOSIGNAL, NULL, 0) = 19 sendto(3, "BEGIN\r\n", 7, MSG_NOSIGNAL, NULL, 0) = 7 sendmsg(3, {msg_name(0)=NULL, msg_iov(2)=[{"l\1\0\1\0\0\0\0\1\0\0\0\222\0\0\0\1\1o\0\31\0\0\0/org/fre"..., 168}, {"", 0}], msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 168 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"l\2\1\0018\233\0\0\1\0\0\0\33\0\0\0\5\1u\0\1\0\0\0\10\1g\0\ra(s"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"rget\0\0\0\0$\0\0\0systemd-update-utmp-"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"/systemd1/unit/systemd_2duser_2d"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"ed\0\0\10\0\0\0inactive\0\0\0\0\4\0\0\0dead\0\0\0\0"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"emd-random-seed-load.service\0\0\0\0"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"\4\0\0\0dead\0\0\0\0\0\0\0\0\0\0\0\0B\0\0\0/org/fre"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"\6\0\0\0exited\0\0\0\0\0\0\0\0\0\0009\0\0\0/org/fre"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"ode\0\6\0\0\0loaded\0\0\10\0\0\0inactive\0\0\0\0"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"4\0\0\0/org/freedesktop/systemd1/un"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"up_2etarget\0\0\0\0\0\0\0\0\0\0\0\0\0;\0\0\0/org"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"art Card Daemon\0\6\0\0\0loaded\0\0\10\0\0\0"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"inactive\0\0\0\0\4\0\0\0dead\0\0\0\0\0\0\0\0\0\0\0\0"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{".swap\0\0\0006\0\0\0/dev/disk/by-uuid/3b"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"ice\0\0\0\0\0\0\0\0\0\0\0\0\0K\0\0\0/org/freedes"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"s_2dclean_2etimer\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"8250_2dtty_2dttyS1_2edevice\0\0\0\0\0"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{".device\0003\0\0\0/org/freedesktop/sys"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"dblock_2dvda_2edevice\0\0\0\16\0\0\0dev-"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{")\0\0\0SYSV: Initializes network co"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 2048 recvmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{"emd-random-seed-save.service\0\0\0\0"..., 2048}], msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 872 Failed to issue method call: Did not receive a reply. Possible causes include: the remote application did not send a reply, the message bus security policy blocked the reply, the reply timeout expired, or the network connection was broken. --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=516, si_status=0, si_utime=0, si_stime=0} --- Total bytes consumed here is 19 * 2048 + 872 = 39784 which corresponds to number of bytes send from the server. Strange, all this looks normal even if I compare the results to a working systemctl -a. When stracing with -s40000 I see that the last string send is '\0\0/org/freedesktop/systemd1/unit/polkit_2eservice\0' but on the receiver side I get a '0\0\0/\0\0\0/org/freedesktop/systemd1/unit/prefd'. Which is actually payload from the second recvmsg on the client side. The rest of the payload matches up, too. Maybe you could have a look? Greetings, Hannes -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2013-08-11 at 16:06 +0200, Hannes Frederic Sowa wrote: > I am sorry to report that I had to revert both these patches to have a working > systemd again. Hi Hannes If you revert only the second patch, the problem is still there, right ? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Aug 11, 2013 at 07:32:33AM -0700, Eric Dumazet wrote: > On Sun, 2013-08-11 at 16:06 +0200, Hannes Frederic Sowa wrote: > > > I am sorry to report that I had to revert both these patches to have a working > > systemd again. > > Hi Hannes > > If you revert only the second patch, the problem is still there, right ? Exactly, only reverting 2/2 did not help. Thanks, Hannes -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2013-08-11 at 16:37 +0200, Hannes Frederic Sowa wrote: > On Sun, Aug 11, 2013 at 07:32:33AM -0700, Eric Dumazet wrote: > > On Sun, 2013-08-11 at 16:06 +0200, Hannes Frederic Sowa wrote: > > > > > I am sorry to report that I had to revert both these patches to have a working > > > systemd again. > > > > Hi Hannes > > > > If you revert only the second patch, the problem is still there, right ? > > Exactly, only reverting 2/2 did not help. I wonder if the consumed field (part of skb->cb{}) would not be zero ? I assumed it was. I have to check. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 05442df..a175ba4 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -35,6 +35,7 @@ struct unix_skb_parms { #ifdef CONFIG_SECURITY_NETWORK u32 secid; /* Security ID */ #endif + u32 consumed; }; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c4ce243..99dc760 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1596,6 +1596,10 @@ out: return err; } +/* We use paged skbs for stream sockets, and limit occupancy to 32768 + * bytes, and a minimun of a full page. + */ +#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg, size_t len) @@ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct scm_cookie tmp_scm; bool fds_sent = false; int max_level; + int data_len; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto pipe_err; while (sent < len) { - /* - * Optimisation for the fact that under 0.01% of X - * messages typically need breaking up. - */ - - size = len-sent; + size = len - sent; /* Keep two messages in the pipe so it schedules better */ - if (size > ((sk->sk_sndbuf >> 1) - 64)) - size = (sk->sk_sndbuf >> 1) - 64; + size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); - if (size > SKB_MAX_ALLOC) - size = SKB_MAX_ALLOC; - - /* - * Grab a buffer - */ + /* allow fallback to order-0 allocations */ + size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); - skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, - &err); + data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); - if (skb == NULL) + skb = sock_alloc_send_pskb(sk, size - data_len, data_len, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) goto out_err; - /* - * If you pass two values to the sock_alloc_send_skb - * it tries to grab the large buffer with GFP_NOFS - * (which can fail easily), and if it fails grab the - * fallback size buffer which is under a page and will - * succeed. [Alan] - */ - size = min_t(int, size, skb_tailroom(skb)); - - /* Only send the fds in the first buffer */ err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); if (err < 0) { @@ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, max_level = err + 1; fds_sent = true; - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size); if (err) { kfree_skb(skb); goto out_err; @@ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, return timeo; } +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) @@ -1977,8 +1971,8 @@ again: } skip = sk_peek_offset(sk, flags); - while (skip >= skb->len) { - skip -= skb->len; + while (skip >= unix_skb_len(skb)) { + skip -= unix_skb_len(skb); last = skb; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) @@ -2005,8 +1999,9 @@ again: sunaddr = NULL; } - chunk = min_t(unsigned int, skb->len - skip, size); - if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) { + chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); + if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, + msg->msg_iov, chunk)) { if (copied == 0) copied = -EFAULT; break; @@ -2016,14 +2011,14 @@ again: /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { - skb_pull(skb, chunk); + UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); - if (skb->len) + if (unix_skb_len(skb)) break; skb_unlink(skb, &sk->sk_receive_queue); @@ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk) if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) - amount += skb->len; + amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb)