diff mbox series

[PATCHv5,iproute2,net-next,2/2] lib/libnetlink: re malloc buff if size is not enough

Message ID 1508982107-28474-2-git-send-email-liuhangbin@gmail.com
State Accepted, archived
Delegated to: stephen hemminger
Headers show
Series libnetlink: malloc correct buff at run time | expand

Commit Message

Hangbin Liu Oct. 26, 2017, 1:41 a.m. UTC
With commit 72b365e8e0fd ("libnetlink: Double the dump buffer size")
we doubled the buffer size to support more VFs. But the VFs number is
increasing all the time. Some customers even use more than 200 VFs now.

We could not double it everytime when the buffer is not enough. Let's just
not hard code the buffer size and malloc the correct number when running.

Introduce function rtnl_recvmsg() to always return a newly allocated buffer.
The caller need to free it after using.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
 lib/libnetlink.c | 114 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 80 insertions(+), 34 deletions(-)

Comments

David Ahern Oct. 26, 2017, 2:59 a.m. UTC | #1
On 10/25/17 7:41 PM, Hangbin Liu wrote:
> +static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
> +{
> +	struct iovec *iov = msg->msg_iov;
> +	char *buf;
> +	int len;
> +
> +	iov->iov_base = NULL;
> +	iov->iov_len = 0;
> +
> +	len = __rtnl_recvmsg(fd, msg, MSG_PEEK | MSG_TRUNC);
> +	if (len < 0)
> +		return len;
> +
> +	buf = malloc(len);
> +	if (!buf) {
> +		fprintf(stderr, "malloc error: not enough buffer\n");
> +		return -ENOMEM;
> +	}
> +
> +	iov->iov_base = buf;
> +	iov->iov_len = len;
> +
> +	len = __rtnl_recvmsg(fd, msg, 0);
> +	if (len < 0) {
> +		free(buf);
> +		return len;
> +	}

The kernel needs a flag that says "give me the message of the buffer is
large enough; if not just PEEK and tell me the length." That would avoid
the double call in most cases.
Stephen Hemminger Oct. 26, 2017, 10:24 a.m. UTC | #2
On Wed, 25 Oct 2017 20:59:01 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 10/25/17 7:41 PM, Hangbin Liu wrote:
> > +static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
> > +{
> > +	struct iovec *iov = msg->msg_iov;
> > +	char *buf;
> > +	int len;
> > +
> > +	iov->iov_base = NULL;
> > +	iov->iov_len = 0;
> > +
> > +	len = __rtnl_recvmsg(fd, msg, MSG_PEEK | MSG_TRUNC);
> > +	if (len < 0)
> > +		return len;
> > +
> > +	buf = malloc(len);
> > +	if (!buf) {
> > +		fprintf(stderr, "malloc error: not enough buffer\n");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	iov->iov_base = buf;
> > +	iov->iov_len = len;
> > +
> > +	len = __rtnl_recvmsg(fd, msg, 0);
> > +	if (len < 0) {
> > +		free(buf);
> > +		return len;
> > +	}  
> 
> The kernel needs a flag that says "give me the message of the buffer is
> large enough; if not just PEEK and tell me the length." That would avoid
> the double call in most cases.

Actually this has little impact because old code was doing implicit zero
of whole buffer, new code does not.
David Ahern Oct. 26, 2017, 3:28 p.m. UTC | #3
On 10/26/17 4:24 AM, Stephen Hemminger wrote:
>>
>> The kernel needs a flag that says "give me the message of the buffer is
>> large enough; if not just PEEK and tell me the length." That would avoid
>> the double call in most cases.
> 
> Actually this has little impact because old code was doing implicit zero
> of whole buffer, new code does not.
> 

The patch calls recvmsg twice; libnl does the same thing. It would be
better performance wise to have a flag that allows retrieval of the
message if the supplied buffer is large enough and PEEK semantics if
not. It was really a comment on how we could do better with proper
kernel support.
Phil Sutter Oct. 26, 2017, 3:33 p.m. UTC | #4
On Thu, Oct 26, 2017 at 09:28:00AM -0600, David Ahern wrote:
> On 10/26/17 4:24 AM, Stephen Hemminger wrote:
> >>
> >> The kernel needs a flag that says "give me the message of the buffer is
> >> large enough; if not just PEEK and tell me the length." That would avoid
> >> the double call in most cases.
> > 
> > Actually this has little impact because old code was doing implicit zero
> > of whole buffer, new code does not.
> > 
> 
> The patch calls recvmsg twice; libnl does the same thing. It would be
> better performance wise to have a flag that allows retrieval of the
> message if the supplied buffer is large enough and PEEK semantics if
> not. It was really a comment on how we could do better with proper
> kernel support.

Doesn't MSG_TRUNC without MSG_PEEK do just that?

Cheers, Phil
David Ahern Oct. 26, 2017, 3:42 p.m. UTC | #5
On 10/26/17 9:33 AM, Phil Sutter wrote:
> On Thu, Oct 26, 2017 at 09:28:00AM -0600, David Ahern wrote:
>> On 10/26/17 4:24 AM, Stephen Hemminger wrote:
>>>>
>>>> The kernel needs a flag that says "give me the message of the buffer is
>>>> large enough; if not just PEEK and tell me the length." That would avoid
>>>> the double call in most cases.
>>>
>>> Actually this has little impact because old code was doing implicit zero
>>> of whole buffer, new code does not.
>>>
>>
>> The patch calls recvmsg twice; libnl does the same thing. It would be
>> better performance wise to have a flag that allows retrieval of the
>> message if the supplied buffer is large enough and PEEK semantics if
>> not. It was really a comment on how we could do better with proper
>> kernel support.
> 
> Doesn't MSG_TRUNC without MSG_PEEK do just that?

MSG_TRUNC returns the actual message length if it is greater than the
buffer. The message was dequeued and what could be copied into the
supplied buffer is copied, but that means the returned message is truncated.
Phil Sutter Oct. 26, 2017, 6:31 p.m. UTC | #6
On Thu, Oct 26, 2017 at 09:42:46AM -0600, David Ahern wrote:
> On 10/26/17 9:33 AM, Phil Sutter wrote:
> > On Thu, Oct 26, 2017 at 09:28:00AM -0600, David Ahern wrote:
> >> On 10/26/17 4:24 AM, Stephen Hemminger wrote:
> >>>>
> >>>> The kernel needs a flag that says "give me the message of the buffer is
> >>>> large enough; if not just PEEK and tell me the length." That would avoid
> >>>> the double call in most cases.
> >>>
> >>> Actually this has little impact because old code was doing implicit zero
> >>> of whole buffer, new code does not.
> >>>
> >>
> >> The patch calls recvmsg twice; libnl does the same thing. It would be
> >> better performance wise to have a flag that allows retrieval of the
> >> message if the supplied buffer is large enough and PEEK semantics if
> >> not. It was really a comment on how we could do better with proper
> >> kernel support.
> > 
> > Doesn't MSG_TRUNC without MSG_PEEK do just that?
> 
> MSG_TRUNC returns the actual message length if it is greater than the
> buffer. The message was dequeued and what could be copied into the
> supplied buffer is copied, but that means the returned message is truncated.

Ah, so one would have to resend the request then. Stupid me. :)

Thanks, Phil
Eric Dumazet Feb. 12, 2019, 11:32 p.m. UTC | #7
On 10/25/2017 06:41 PM, Hangbin Liu wrote:
> With commit 72b365e8e0fd ("libnetlink: Double the dump buffer size")
> we doubled the buffer size to support more VFs. But the VFs number is
> increasing all the time. Some customers even use more than 200 VFs now.
> 
> We could not double it everytime when the buffer is not enough. Let's just
> not hard code the buffer size and malloc the correct number when running.
> 
> Introduce function rtnl_recvmsg() to always return a newly allocated buffer.
> The caller need to free it after using.
> 
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> Signed-off-by: Phil Sutter <phil@nwl.cc>
> ---
>  lib/libnetlink.c | 114 ++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 80 insertions(+), 34 deletions(-)
> 
> diff --git a/lib/libnetlink.c b/lib/libnetlink.c
> index be7ac86..1847c0b 100644
> --- a/lib/libnetlink.c
> +++ b/lib/libnetlink.c
> @@ -402,6 +402,64 @@ static void rtnl_dump_error(const struct rtnl_handle *rth,
>  	}
>  }
>  
> +static int __rtnl_recvmsg(int fd, struct msghdr *msg, int flags)
> +{
> +	int len;
> +
> +	do {
> +		len = recvmsg(fd, msg, flags);
> +	} while (len < 0 && (errno == EINTR || errno == EAGAIN));
> +
> +	if (len < 0) {
> +		fprintf(stderr, "netlink receive error %s (%d)\n",
> +			strerror(errno), errno);
> +		return -errno;
> +	}
> +
> +	if (len == 0) {
> +		fprintf(stderr, "EOF on netlink\n");
> +		return -ENODATA;
> +	}
> +
> +	return len;
> +}
> +
> +static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
> +{
> +	struct iovec *iov = msg->msg_iov;
> +	char *buf;
> +	int len;
> +
> +	iov->iov_base = NULL;
> +	iov->iov_len = 0;
> +
> +	len = __rtnl_recvmsg(fd, msg, MSG_PEEK | MSG_TRUNC);
> +	if (len < 0)
> +		return len;
> +
> +	buf = malloc(len);
> +	if (!buf) {
> +		fprintf(stderr, "malloc error: not enough buffer\n");
> +		return -ENOMEM;
> +	}
> +
> +	iov->iov_base = buf;
> +	iov->iov_len = len;
> +
> +	len = __rtnl_recvmsg(fd, msg, 0);
> +	if (len < 0) {
> +		free(buf);
> +		return len;
> +	}
> +
> +	if (answer)
> +		*answer = buf;
> +	else
> +		free(buf);
> +
> +	return len;
> +}
> +


This patch brings a serious performance penalty.

ss command now uses two system calls per ~4KB worth of data

recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{NULL, 0}], msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 3328 <0.000120>
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"h\0\0\0\24\0\2\0@\342\1\0\322\0\6\0\n\1\1\0\250\253\276@&\7\370\260\200\231\16\6"..., 3328}], msg_controllen=0, msg_flags=0}, 0) = 3328 <0.000108>
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{NULL, 0}], msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 3328 <0.000086>
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"h\0\0\0\24\0\2\0@\342\1\0\322\0\6\0\n\10\2\0002A\266S&\7\370\260\200\231\16\6"..., 3328}], msg_controllen=0, msg_flags=0}, 0) = 3328 <0.000121>


So we are back to a very pessimistic situation.
Eric Dumazet Feb. 12, 2019, 11:43 p.m. UTC | #8
On 02/12/2019 03:32 PM, Eric Dumazet wrote:
> 

> 
> This patch brings a serious performance penalty.
> 
> ss command now uses two system calls per ~4KB worth of data
> 
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{NULL, 0}], msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 3328 <0.000120>
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"h\0\0\0\24\0\2\0@\342\1\0\322\0\6\0\n\1\1\0\250\253\276@&\7\370\260\200\231\16\6"..., 3328}], msg_controllen=0, msg_flags=0}, 0) = 3328 <0.000108>
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{NULL, 0}], msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 3328 <0.000086>
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"h\0\0\0\24\0\2\0@\342\1\0\322\0\6\0\n\10\2\0002A\266S&\7\370\260\200\231\16\6"..., 3328}], msg_controllen=0, msg_flags=0}, 0) = 3328 <0.000121>
> 
> 
> So we are back to a very pessimistic situation.
> 

I guess this patch will solve the issue :

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index ced33728777a17e0905e76acb904ac4709707488..309b5b3787e3d8f8c47f035d270ae2b4df01703e 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -442,6 +442,8 @@ static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
        if (len < 0)
                return len;
 
+       if (len < 32768)
+               len = 32768;
        buf = malloc(len);
        if (!buf) {
                fprintf(stderr, "malloc error: not enough buffer\n");
diff mbox series

Patch

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index be7ac86..1847c0b 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -402,6 +402,64 @@  static void rtnl_dump_error(const struct rtnl_handle *rth,
 	}
 }
 
+static int __rtnl_recvmsg(int fd, struct msghdr *msg, int flags)
+{
+	int len;
+
+	do {
+		len = recvmsg(fd, msg, flags);
+	} while (len < 0 && (errno == EINTR || errno == EAGAIN));
+
+	if (len < 0) {
+		fprintf(stderr, "netlink receive error %s (%d)\n",
+			strerror(errno), errno);
+		return -errno;
+	}
+
+	if (len == 0) {
+		fprintf(stderr, "EOF on netlink\n");
+		return -ENODATA;
+	}
+
+	return len;
+}
+
+static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
+{
+	struct iovec *iov = msg->msg_iov;
+	char *buf;
+	int len;
+
+	iov->iov_base = NULL;
+	iov->iov_len = 0;
+
+	len = __rtnl_recvmsg(fd, msg, MSG_PEEK | MSG_TRUNC);
+	if (len < 0)
+		return len;
+
+	buf = malloc(len);
+	if (!buf) {
+		fprintf(stderr, "malloc error: not enough buffer\n");
+		return -ENOMEM;
+	}
+
+	iov->iov_base = buf;
+	iov->iov_len = len;
+
+	len = __rtnl_recvmsg(fd, msg, 0);
+	if (len < 0) {
+		free(buf);
+		return len;
+	}
+
+	if (answer)
+		*answer = buf;
+	else
+		free(buf);
+
+	return len;
+}
+
 int rtnl_dump_filter_l(struct rtnl_handle *rth,
 		       const struct rtnl_dump_filter_arg *arg)
 {
@@ -413,31 +471,18 @@  int rtnl_dump_filter_l(struct rtnl_handle *rth,
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 	};
-	char buf[32768];
+	char *buf;
 	int dump_intr = 0;
 
-	iov.iov_base = buf;
 	while (1) {
 		int status;
 		const struct rtnl_dump_filter_arg *a;
 		int found_done = 0;
 		int msglen = 0;
 
-		iov.iov_len = sizeof(buf);
-		status = recvmsg(rth->fd, &msg, 0);
-
-		if (status < 0) {
-			if (errno == EINTR || errno == EAGAIN)
-				continue;
-			fprintf(stderr, "netlink receive error %s (%d)\n",
-				strerror(errno), errno);
-			return -1;
-		}
-
-		if (status == 0) {
-			fprintf(stderr, "EOF on netlink\n");
-			return -1;
-		}
+		status = rtnl_recvmsg(rth->fd, &msg, &buf);
+		if (status < 0)
+			return status;
 
 		if (rth->dump_fp)
 			fwrite(buf, 1, NLMSG_ALIGN(status), rth->dump_fp);
@@ -462,8 +507,10 @@  int rtnl_dump_filter_l(struct rtnl_handle *rth,
 
 				if (h->nlmsg_type == NLMSG_DONE) {
 					err = rtnl_dump_done(h);
-					if (err < 0)
+					if (err < 0) {
+						free(buf);
 						return -1;
+					}
 
 					found_done = 1;
 					break; /* process next filter */
@@ -471,19 +518,23 @@  int rtnl_dump_filter_l(struct rtnl_handle *rth,
 
 				if (h->nlmsg_type == NLMSG_ERROR) {
 					rtnl_dump_error(rth, h);
+					free(buf);
 					return -1;
 				}
 
 				if (!rth->dump_fp) {
 					err = a->filter(&nladdr, h, a->arg1);
-					if (err < 0)
+					if (err < 0) {
+						free(buf);
 						return err;
+					}
 				}
 
 skip_it:
 				h = NLMSG_NEXT(h, msglen);
 			}
 		}
+		free(buf);
 
 		if (found_done) {
 			if (dump_intr)
@@ -543,7 +594,7 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 	};
-	char   buf[32768] = {};
+	char *buf;
 
 	n->nlmsg_seq = seq = ++rtnl->seq;
 
@@ -556,22 +607,12 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 		return -1;
 	}
 
-	iov.iov_base = buf;
 	while (1) {
-		iov.iov_len = sizeof(buf);
-		status = recvmsg(rtnl->fd, &msg, 0);
+		status = rtnl_recvmsg(rtnl->fd, &msg, &buf);
+
+		if (status < 0)
+			return status;
 
-		if (status < 0) {
-			if (errno == EINTR || errno == EAGAIN)
-				continue;
-			fprintf(stderr, "netlink receive error %s (%d)\n",
-				strerror(errno), errno);
-			return -1;
-		}
-		if (status == 0) {
-			fprintf(stderr, "EOF on netlink\n");
-			return -1;
-		}
 		if (msg.msg_namelen != sizeof(nladdr)) {
 			fprintf(stderr,
 				"sender address length == %d\n",
@@ -585,6 +626,7 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 			if (l < 0 || len > status) {
 				if (msg.msg_flags & MSG_TRUNC) {
 					fprintf(stderr, "Truncated message\n");
+					free(buf);
 					return -1;
 				}
 				fprintf(stderr,
@@ -611,6 +653,7 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					if (answer)
 						memcpy(answer, h,
 						       MIN(maxlen, h->nlmsg_len));
+					free(buf);
 					return 0;
 				}
 
@@ -619,12 +662,14 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					rtnl_talk_error(h, err, errfn);
 
 				errno = -err->error;
+				free(buf);
 				return -1;
 			}
 
 			if (answer) {
 				memcpy(answer, h,
 				       MIN(maxlen, h->nlmsg_len));
+				free(buf);
 				return 0;
 			}
 
@@ -633,6 +678,7 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 			status -= NLMSG_ALIGN(len);
 			h = (struct nlmsghdr *)((char *)h + NLMSG_ALIGN(len));
 		}
+		free(buf);
 
 		if (msg.msg_flags & MSG_TRUNC) {
 			fprintf(stderr, "Message truncated\n");