[LEDE-DEV,ubox] Fixes log read starvation issue after threshold reached

Submitted by Ron Brash on July 11, 2017, 7:04 p.m.

Details

Message ID CAPZm7uWqvLursodLqXz5rgAXLs5DXqExojznxCQBDHqKZBwBCQ@mail.gmail.com
State New
Headers show

Commit Message

Ron Brash July 11, 2017, 7:04 p.m.
This patch fixes a logread starvation error, which occurs after many
logs are generated (around 16k if defaults are used). The log read
process seems to halt silently and yet continues running.  A restart
of the log services fixes it.

This is problematic because logs should be logged, instead of silently lost

Signed-off-by: “Ron Brash <“ron.brash@gmail.com”>
---
 log/logread.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

                ustream_consume(s, cur_len);
@@ -240,14 +242,28 @@ static void logread_fd_data_cb(struct ustream
*s, int bytes)
                uloop_end();
 }

+static void notify_fd_remove_cb();
+
 static void logread_fd_cb(struct ubus_request *req, int fd)
 {
        static struct ustream_fd test_fd;
-
+       uloop_register_notify_fd_remove(fd, notify_fd_remove_cb);
        test_fd.stream.notify_read = logread_fd_data_cb;
        ustream_fd_init(&test_fd, fd);
 }

+struct ubus_context *context;
+uint32_t ctx_id;
+struct blob_buf * bb;
+struct ubus_request request;
+
+static void notify_fd_remove_cb() {
+       ubus_lookup_id(context, "log", &ctx_id);
+       ubus_invoke_async(context, ctx_id, "read", bb->head, &request);
+       request.fd_cb = logread_fd_cb;
+       ubus_complete_request_async(context, &request);
+}
+
 int main(int argc, char **argv)
 {
        static struct ubus_request req;
@@ -362,6 +378,9 @@ int main(int argc, char **argv)
                        sender.fd = STDOUT_FILENO;
                }

+               context = ctx;
+               bb = &b;
+
                ubus_invoke_async(ctx, id, "read", b.head, &req);
                req.fd_cb = logread_fd_cb;
                ubus_complete_request_async(ctx, &req);
@@ -374,3 +393,4 @@ int main(int argc, char **argv)

        return ret;
 }
+
--
2.7.4

Comments

Jo-Philipp Wich July 12, 2017, 9:34 p.m.
Hi Ron,

thanks for your effort in sharing your fixes.

I have troubles understanding your patch, please find some comments
inline below.

~ Jo

--

On 07/11/2017 09:04 PM, Ron Brash wrote:
> This patch fixes a logread starvation error, which occurs after many
> logs are generated (around 16k if defaults are used). The log read

Do you have an easy test case for this? Is piping ~16K plaintext to the
logger enough to trigger the starvation? I'd like to understand the
problem some more before judging the patch.

> process seems to halt silently and yet continues running.  A restart
> of the log services fixes it.
> 
> This is problematic because logs should be logged, instead of silently lost
> 
> Signed-off-by: “Ron Brash <“ron.brash@gmail.com”>
> ---
>  log/logread.c | 24 ++++++++++++++++++++++--
>  1 file changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/log/logread.c b/log/logread.c
> index edac1d9..7eb028e 100644
> --- a/log/logread.c
> +++ b/log/logread.c
> @@ -230,8 +230,10 @@ static void logread_fd_data_cb(struct ustream *s,
> int bytes)
>                         break;
> 
>                 cur_len = blob_len(a) + sizeof(*a);
> -               if (len < cur_len)
> +               if (len < cur_len) {
> +                       ustream_consume(s, len);
>                         break;
> +               }
> 
>                 log_notify(a);
>                 ustream_consume(s, cur_len);
> @@ -240,14 +242,28 @@ static void logread_fd_data_cb(struct ustream
> *s, int bytes)
>                 uloop_end();
>  }
> 
> +static void notify_fd_remove_cb();

If you move the static function here you do not need the forward
declaration.

> +
>  static void logread_fd_cb(struct ubus_request *req, int fd)
>  {
>         static struct ustream_fd test_fd;
> -
> +       uloop_register_notify_fd_remove(fd, notify_fd_remove_cb);

What is "uloop_register_notify_fd_remove()" ? It is not part of libubox
as far as I can see. Is it a function introduced by some other custom
changes of yours?

>         test_fd.stream.notify_read = logread_fd_data_cb;
>         ustream_fd_init(&test_fd, fd);
>  }
> 
> +struct ubus_context *context;
> +uint32_t ctx_id;
> +struct blob_buf * bb;
> +struct ubus_request request;

The "request" and "ctx_id" variables appear to be local to
"notify_fd_remove_cb()" and should be moved inside the function.

The "context" and "bb" variables should be declared "static" if they're
outside of the function context, however I am not sure if we actually
need them because those values can be usually obtained elsewhere in the
callback.

> +
> +static void notify_fd_remove_cb() {
> +       ubus_lookup_id(context, "log", &ctx_id);
> +       ubus_invoke_async(context, ctx_id, "read", bb->head, &request);
> +       request.fd_cb = logread_fd_cb;
> +       ubus_complete_request_async(context, &request);
> +}
> +
>  int main(int argc, char **argv)
>  {
>         static struct ubus_request req;
> @@ -362,6 +378,9 @@ int main(int argc, char **argv)
>                         sender.fd = STDOUT_FILENO;
>                 }
> 
> +               context = ctx;
> +               bb = &b;
> +
>                 ubus_invoke_async(ctx, id, "read", b.head, &req);
>                 req.fd_cb = logread_fd_cb;
>                 ubus_complete_request_async(ctx, &req);
> @@ -374,3 +393,4 @@ int main(int argc, char **argv)
> 
>         return ret;
>  }
> +
> --
> 2.7.4
> 
> _______________________________________________
> Lede-dev mailing list
> Lede-dev@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/lede-dev
>
Felix Fietkau July 13, 2017, 9:53 a.m.
On 2017-07-11 21:04, Ron Brash wrote:
> This patch fixes a logread starvation error, which occurs after many
> logs are generated (around 16k if defaults are used). The log read
> process seems to halt silently and yet continues running.  A restart
> of the log services fixes it.
> 
> This is problematic because logs should be logged, instead of silently lost
> 
> Signed-off-by: “Ron Brash <“ron.brash@gmail.com”>
> ---
>  log/logread.c | 24 ++++++++++++++++++++++--
>  1 file changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/log/logread.c b/log/logread.c
> index edac1d9..7eb028e 100644
> --- a/log/logread.c
> +++ b/log/logread.c
> @@ -230,8 +230,10 @@ static void logread_fd_data_cb(struct ustream *s,
> int bytes)
>                         break;
> 
>                 cur_len = blob_len(a) + sizeof(*a);
> -               if (len < cur_len)
> +               if (len < cur_len) {
> +                       ustream_consume(s, len);
>                         break;
> +               }
This part looks wrong to me, it would probably break on incomplete reads
(when more data arrives later).

- Felix
Felix Fietkau July 13, 2017, 12:52 p.m.
On 2017-07-13 14:45, Ron Brash wrote:
> Hi Felix,
> 
> Is there a situation where an incomplete read could be forced? We just
> found that the reads would be stalled previously because events were not
> consumed and managed correctly.
> 
> Any other suggestions? :)
Try checking for s->eof.

- Felix
Petr Štetiar July 14, 2017, 7:57 a.m.
Jo-Philipp Wich <jo@mein.io> [2017-07-12 23:34:13]:

Hi,

> Do you have an easy test case for this? Is piping ~16K plaintext to the
> logger enough to trigger the starvation? I'd like to understand the
> problem some more before judging the patch.

I was trying to fix probably similar issue back in March, but the project's
deadline was approaching rather quickly so I've moved this into my TODO list
and started using syslog from Busybox...

Here's just commit message with test case as I wasn't able to fix it properly yet:

	Attempt to fix logread's stuck remote syslog or file logging on logd's restart

	I'm experiencing some strange behaviour of logread's file logging. From
	time to time I see on some machines, that the log file is not growing
	and I'm loosing all the system log messages.

	Steps to reproduce:

	1. /etc/init.d/log stop
	2. logd -S 100 &
	3. logread -f -F /tmp/messages -p /var/run/logread.1.pid -S 100 &
	4. pkill -9 logd
	5. logd -S 100 &

	Without this patch, logread will never log any new messages to
	/tmp/messages file, since the ubus async request is using the old log
	ubus `id` from [2], but logd has now new ubus log `id` from [5].

And here is one more fix which I wasn't able to test and submit yet:

	commit ecbc9e12dc298b76e2597a9bdaff6db092e3e402
	Author: Petr Štetiar <ynezz@true.cz>
	Date:   Sun Mar 19 22:15:56 2017 +0100

	    Check for log file write error and try to recover
	    
	    Signed-off-by: Petr Štetiar <ynezz@true.cz>

	diff --git a/log/logread.c b/log/logread.c
	index 1719976..589ea5f 100644
	--- a/log/logread.c
	+++ b/log/logread.c
	@@ -191,6 +191,14 @@ static int log_notify(struct blob_attr *msg)
				getcodetext(LOG_PRI(p), prioritynames),
				(blobmsg_get_u32(tb[LOG_SOURCE])) ? ("") : (" kernel:"), m);
			ret = write(sender.fd, buf, strlen(buf));
	+               if (ret < 0) {
	+                       fprintf(stderr, "log write failed: %s\n", strerror(errno));
	+                       close(sender.fd);
	+                       sender.fd = -1;
	+                       free(str);
	+                       uloop_end();
	+                       return ret;
	+               }
		}
	 
		free(str);

> > process seems to halt silently and yet continues running.  A restart
> > of the log services fixes it.

Yep.

-- ynezz

Patch hide | download patch | download mbox

diff --git a/log/logread.c b/log/logread.c
index edac1d9..7eb028e 100644
--- a/log/logread.c
+++ b/log/logread.c
@@ -230,8 +230,10 @@  static void logread_fd_data_cb(struct ustream *s,
int bytes)
                        break;

                cur_len = blob_len(a) + sizeof(*a);
-               if (len < cur_len)
+               if (len < cur_len) {
+                       ustream_consume(s, len);
                        break;
+               }

                log_notify(a);