diff mbox

[LEDE-DEV] procd: stop service using SIGKILL if SIGTERM failed to do so

Message ID 1486634573-12258-1-git-send-email-alin.nastac@gmail.com
State Accepted
Delegated to: John Crispin
Headers show

Commit Message

Alin Năstac Feb. 9, 2017, 10:02 a.m. UTC
SIGKILL is sent if instance process is still running after
<term_timeout> seconds after SIGTERM has been sent. To prevent
another daemon process being launched before old process dies,
the instance is kept until SIGCHLD confirms that service has
been stopped.

Signed-off-by: Alin Nastac <alin.nastac@gmail.com>
---
 service/instance.c | 44 +++++++++++++++++++++++++++++++++++++-------
 service/instance.h |  1 +
 service/service.c  | 26 ++++++++++++++++----------
 service/service.h  |  3 +++
 4 files changed, 57 insertions(+), 17 deletions(-)

Comments

John Crispin Feb. 9, 2017, 10:54 a.m. UTC | #1
Hi,

i know that someone else is about to send a fix for the same issue but
with a different approach of fixing it. i'd like to wait for this 2nd
patch to arrive before we decide which to merge

	John

On 09/02/2017 11:02, Alin Nastac wrote:
> SIGKILL is sent if instance process is still running after
> <term_timeout> seconds after SIGTERM has been sent. To prevent
> another daemon process being launched before old process dies,
> the instance is kept until SIGCHLD confirms that service has
> been stopped.
> 
> Signed-off-by: Alin Nastac <alin.nastac@gmail.com>
> ---
>  service/instance.c | 44 +++++++++++++++++++++++++++++++++++++-------
>  service/instance.h |  1 +
>  service/service.c  | 26 ++++++++++++++++----------
>  service/service.h  |  3 +++
>  4 files changed, 57 insertions(+), 17 deletions(-)
> 
> diff --git a/service/instance.c b/service/instance.c
> index 018db3c..4d340fd 100644
> --- a/service/instance.c
> +++ b/service/instance.c
> @@ -55,6 +55,7 @@ enum {
>  	INSTANCE_ATTR_SECCOMP,
>  	INSTANCE_ATTR_PIDFILE,
>  	INSTANCE_ATTR_RELOADSIG,
> +	INSTANCE_ATTR_TERMTIMEOUT,
>  	__INSTANCE_ATTR_MAX
>  };
>  
> @@ -79,6 +80,7 @@ static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = {
>  	[INSTANCE_ATTR_SECCOMP] = { "seccomp", BLOBMSG_TYPE_STRING },
>  	[INSTANCE_ATTR_PIDFILE] = { "pidfile", BLOBMSG_TYPE_STRING },
>  	[INSTANCE_ATTR_RELOADSIG] = { "reload_signal", BLOBMSG_TYPE_INT32 },
> +	[INSTANCE_ATTR_TERMTIMEOUT] = { "term_timeout", BLOBMSG_TYPE_INT32 },
>  };
>  
>  enum {
> @@ -389,8 +391,16 @@ instance_start(struct service_instance *in)
>  		return;
>  	}
>  
> -	if (in->proc.pending || !in->command)
> +	if (!in->command) {
> +		LOG("Not starting instance %s::%s, command not set\n", in->srv->name, in->name);
>  		return;
> +	}
> +
> +	if (in->proc.pending) {
> +		if (in->halt)
> +			in->restart = true;
> +		return;
> +	}
>  
>  	instance_free_stdio(in);
>  	if (in->_stdout.fd.fd > -2) {
> @@ -408,7 +418,7 @@ instance_start(struct service_instance *in)
>  	}
>  
>  	in->restart = false;
> -	in->halt = !in->respawn;
> +	in->halt = false;
>  
>  	if (!in->valid)
>  		return;
> @@ -494,7 +504,11 @@ instance_timeout(struct uloop_timeout *t)
>  
>  	in = container_of(t, struct service_instance, timeout);
>  
> -	if (!in->halt && (in->restart || in->respawn))
> +	if (in->halt) {
> +		LOG("Instance %s::%s pid %d not stopped on SIGTERM, sending SIGKILL instead\n",
> +				in->srv->name, in->name, in->proc.pid);
> +		kill(in->proc.pid, SIGKILL);
> +	} else if (in->restart || in->respawn)
>  		instance_start(in);
>  }
>  
> @@ -515,8 +529,19 @@ instance_exit(struct uloop_process *p, int ret)
>  		return;
>  
>  	uloop_timeout_cancel(&in->timeout);
> +	service_event("instance.stop", in->srv->name, in->name);
> +
>  	if (in->halt) {
>  		instance_removepid(in);
> +		if (in->restart)
> +			instance_start(in);
> +		else {
> +			struct service *s = in->srv;
> +
> +			avl_delete(&s->instances.avl, &in->node.avl);
> +			instance_free(in);
> +			service_stopped(s);
> +		}
>  	} else if (in->restart) {
>  		instance_start(in);
>  	} else if (in->respawn) {
> @@ -535,7 +560,6 @@ instance_exit(struct uloop_process *p, int ret)
>  			uloop_timeout_set(&in->timeout, in->respawn_timeout * 1000);
>  		}
>  	}
> -	service_event("instance.stop", in->srv->name, in->name);
>  }
>  
>  void
> @@ -546,6 +570,7 @@ instance_stop(struct service_instance *in)
>  	in->halt = true;
>  	in->restart = in->respawn = false;
>  	kill(in->proc.pid, SIGTERM);
> +	uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
>  }
>  
>  static void
> @@ -559,10 +584,10 @@ instance_restart(struct service_instance *in)
>  		return;
>  	}
>  
> -	in->halt = false;
> +	in->halt = true;
>  	in->restart = true;
>  	kill(in->proc.pid, SIGTERM);
> -	instance_removepid(in);
> +	uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
>  }
>  
>  static bool
> @@ -796,6 +821,8 @@ instance_config_parse(struct service_instance *in)
>  	if (!instance_config_parse_command(in, tb))
>  		return false;
>  
> +	if (tb[INSTANCE_ATTR_TERMTIMEOUT])
> +		in->term_timeout = blobmsg_get_u32(tb[INSTANCE_ATTR_TERMTIMEOUT]);
>  	if (tb[INSTANCE_ATTR_RESPAWN]) {
>  		int i = 0;
>  		uint32_t vals[3] = { 3600, 5, 5};
> @@ -933,8 +960,9 @@ instance_update(struct service_instance *in, struct service_instance *in_new)
>  {
>  	bool changed = instance_config_changed(in, in_new);
>  	bool running = in->proc.pending;
> +	bool stopping = in->halt;
>  
> -	if (!running) {
> +	if (!running || stopping) {
>  		instance_config_move(in, in_new);
>  		instance_start(in);
>  	} else {
> @@ -967,6 +995,7 @@ instance_init(struct service_instance *in, struct service *s, struct blob_attr *
>  	in->config = config;
>  	in->timeout.cb = instance_timeout;
>  	in->proc.cb = instance_exit;
> +	in->term_timeout = 5;
>  
>  	in->_stdout.fd.fd = -2;
>  	in->_stdout.stream.string_data = true;
> @@ -999,6 +1028,7 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
>  		blobmsg_add_u32(b, "pid", in->proc.pid);
>  	if (in->command)
>  		blobmsg_add_blob(b, in->command);
> +	blobmsg_add_u32(b, "term_timeout", in->term_timeout);
>  
>  	if (!avl_is_empty(&in->errors.avl)) {
>  		struct blobmsg_list_node *var;
> diff --git a/service/instance.h b/service/instance.h
> index 3cc2009..78999c8 100644
> --- a/service/instance.h
> +++ b/service/instance.h
> @@ -59,6 +59,7 @@ struct service_instance {
>  	char *seccomp;
>  	char *pidfile;
>  
> +	uint32_t term_timeout;
>  	uint32_t respawn_timeout;
>  	uint32_t respawn_threshold;
>  	uint32_t respawn_retry;
> diff --git a/service/service.c b/service/service.c
> index 2c73901..0584ee0 100644
> --- a/service/service.c
> +++ b/service/service.c
> @@ -59,11 +59,10 @@ service_instance_update(struct vlist_tree *tree, struct vlist_node *node_new,
>  		instance_update(in_o, in_n);
>  		instance_free(in_n);
>  	} else if (in_o) {
> -		DEBUG(2, "Free instance %s::%s\n", in_o->srv->name, in_o->name);
> +		DEBUG(2, "Stop instance %s::%s\n", in_o->srv->name, in_o->name);
>  		instance_stop(in_o);
> -		instance_free(in_o);
>  	} else if (in_n) {
> -		DEBUG(2, "Create instance %s::%s\n", in_n->srv->name, in_n->name);
> +		DEBUG(2, "Start instance %s::%s\n", in_n->srv->name, in_n->name);
>  		instance_start(in_n);
>  	}
>  	blob_buf_init(&b, 0);
> @@ -80,7 +79,7 @@ service_alloc(const char *name)
>  	strcpy(new_name, name);
>  
>  	vlist_init(&s->instances, avl_strcmp, service_instance_update);
> -	s->instances.keep_old = true;
> +	s->instances.no_delete = true;
>  	s->name = new_name;
>  	s->avl.key = s->name;
>  	INIT_LIST_HEAD(&s->validators);
> @@ -149,13 +148,8 @@ service_update(struct service *s, struct blob_attr **tb, bool add)
>  static void
>  service_delete(struct service *s)
>  {
> -	service_event("service.stop", s->name, NULL);
>  	vlist_flush_all(&s->instances);
> -	avl_delete(&services, &s->avl);
> -	trigger_del(s);
> -	free(s->trigger);
> -	free(s);
> -	service_validate_del(s);
> +	service_stopped(s);
>  }
>  
>  enum {
> @@ -606,6 +600,18 @@ service_start_early(char *name, char *cmdline)
>  	return service_handle_set(NULL, NULL, NULL, "add", b.head);
>  }
>  
> +void service_stopped(struct service *s)
> +{
> +	if (avl_is_empty(&s->instances.avl)) {
> +		service_event("service.stop", s->name, NULL);
> +		avl_delete(&services, &s->avl);
> +		trigger_del(s);
> +		free(s->trigger);
> +		free(s);
> +		service_validate_del(s);
> +	}
> +}
> +
>  void service_event(const char *type, const char *service, const char *instance)
>  {
>  	if (!ctx)
> diff --git a/service/service.h b/service/service.h
> index c3f2964..d4f0a83 100644
> --- a/service/service.h
> +++ b/service/service.h
> @@ -50,7 +50,10 @@ void service_validate_add(struct service *s, struct blob_attr *attr);
>  void service_validate_dump(struct blob_buf *b, struct service *s);
>  void service_validate_dump_all(struct blob_buf *b, char *p, char *s);
>  int service_start_early(char *name, char *cmdline);
> +void service_stopped(struct service *s);
>  void service_validate_del(struct service *s);
>  void service_event(const char *type, const char *service, const char *instance);
>  
> +
> +
>  #endif
>
Alin Năstac Feb. 9, 2017, 11:43 a.m. UTC | #2
On Thu, Feb 9, 2017 at 11:54 AM, John Crispin <john@phrozen.org> wrote:
> Hi,
>
> i know that someone else is about to send a fix for the same issue but
> with a different approach of fixing it. i'd like to wait for this 2nd
> patch to arrive before we decide which to merge
Are you sure it wasn't me? :)
You said yesterday that I should send you a patch for it.

The only other approach I could think of would involve an
instance_stop() that waits for the service instance to exit.
I initially thought of doing it like this, but decided that waiting
asynchronously for the stop event would be a far better technical solution
to the given issue, don't you agree?

>
>         John
>

Alin
John Crispin Feb. 9, 2017, 12:30 p.m. UTC | #3
On 09/02/2017 12:43, Alin Năstac wrote:
> On Thu, Feb 9, 2017 at 11:54 AM, John Crispin <john@phrozen.org> wrote:
>> Hi,
>>
>> i know that someone else is about to send a fix for the same issue but
>> with a different approach of fixing it. i'd like to wait for this 2nd
>> patch to arrive before we decide which to merge
> Are you sure it wasn't me? :)
> You said yesterday that I should send you a patch for it.
> 
> The only other approach I could think of would involve an
> instance_stop() that waits for the service instance to exit.
> I initially thought of doing it like this, but decided that waiting
> asynchronously for the stop event would be a far better technical solution
> to the given issue, don't you agree?
> 
>>
>>         John
>>
> 
> Alin
> 

Hans just told me we are talking about 2 different issues, so there is
no 2nd patch, i'll process this one as normal in that case

	John

> _______________________________________________
> Lede-dev mailing list
> Lede-dev@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/lede-dev
>
diff mbox

Patch

diff --git a/service/instance.c b/service/instance.c
index 018db3c..4d340fd 100644
--- a/service/instance.c
+++ b/service/instance.c
@@ -55,6 +55,7 @@  enum {
 	INSTANCE_ATTR_SECCOMP,
 	INSTANCE_ATTR_PIDFILE,
 	INSTANCE_ATTR_RELOADSIG,
+	INSTANCE_ATTR_TERMTIMEOUT,
 	__INSTANCE_ATTR_MAX
 };
 
@@ -79,6 +80,7 @@  static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = {
 	[INSTANCE_ATTR_SECCOMP] = { "seccomp", BLOBMSG_TYPE_STRING },
 	[INSTANCE_ATTR_PIDFILE] = { "pidfile", BLOBMSG_TYPE_STRING },
 	[INSTANCE_ATTR_RELOADSIG] = { "reload_signal", BLOBMSG_TYPE_INT32 },
+	[INSTANCE_ATTR_TERMTIMEOUT] = { "term_timeout", BLOBMSG_TYPE_INT32 },
 };
 
 enum {
@@ -389,8 +391,16 @@  instance_start(struct service_instance *in)
 		return;
 	}
 
-	if (in->proc.pending || !in->command)
+	if (!in->command) {
+		LOG("Not starting instance %s::%s, command not set\n", in->srv->name, in->name);
 		return;
+	}
+
+	if (in->proc.pending) {
+		if (in->halt)
+			in->restart = true;
+		return;
+	}
 
 	instance_free_stdio(in);
 	if (in->_stdout.fd.fd > -2) {
@@ -408,7 +418,7 @@  instance_start(struct service_instance *in)
 	}
 
 	in->restart = false;
-	in->halt = !in->respawn;
+	in->halt = false;
 
 	if (!in->valid)
 		return;
@@ -494,7 +504,11 @@  instance_timeout(struct uloop_timeout *t)
 
 	in = container_of(t, struct service_instance, timeout);
 
-	if (!in->halt && (in->restart || in->respawn))
+	if (in->halt) {
+		LOG("Instance %s::%s pid %d not stopped on SIGTERM, sending SIGKILL instead\n",
+				in->srv->name, in->name, in->proc.pid);
+		kill(in->proc.pid, SIGKILL);
+	} else if (in->restart || in->respawn)
 		instance_start(in);
 }
 
@@ -515,8 +529,19 @@  instance_exit(struct uloop_process *p, int ret)
 		return;
 
 	uloop_timeout_cancel(&in->timeout);
+	service_event("instance.stop", in->srv->name, in->name);
+
 	if (in->halt) {
 		instance_removepid(in);
+		if (in->restart)
+			instance_start(in);
+		else {
+			struct service *s = in->srv;
+
+			avl_delete(&s->instances.avl, &in->node.avl);
+			instance_free(in);
+			service_stopped(s);
+		}
 	} else if (in->restart) {
 		instance_start(in);
 	} else if (in->respawn) {
@@ -535,7 +560,6 @@  instance_exit(struct uloop_process *p, int ret)
 			uloop_timeout_set(&in->timeout, in->respawn_timeout * 1000);
 		}
 	}
-	service_event("instance.stop", in->srv->name, in->name);
 }
 
 void
@@ -546,6 +570,7 @@  instance_stop(struct service_instance *in)
 	in->halt = true;
 	in->restart = in->respawn = false;
 	kill(in->proc.pid, SIGTERM);
+	uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
 }
 
 static void
@@ -559,10 +584,10 @@  instance_restart(struct service_instance *in)
 		return;
 	}
 
-	in->halt = false;
+	in->halt = true;
 	in->restart = true;
 	kill(in->proc.pid, SIGTERM);
-	instance_removepid(in);
+	uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
 }
 
 static bool
@@ -796,6 +821,8 @@  instance_config_parse(struct service_instance *in)
 	if (!instance_config_parse_command(in, tb))
 		return false;
 
+	if (tb[INSTANCE_ATTR_TERMTIMEOUT])
+		in->term_timeout = blobmsg_get_u32(tb[INSTANCE_ATTR_TERMTIMEOUT]);
 	if (tb[INSTANCE_ATTR_RESPAWN]) {
 		int i = 0;
 		uint32_t vals[3] = { 3600, 5, 5};
@@ -933,8 +960,9 @@  instance_update(struct service_instance *in, struct service_instance *in_new)
 {
 	bool changed = instance_config_changed(in, in_new);
 	bool running = in->proc.pending;
+	bool stopping = in->halt;
 
-	if (!running) {
+	if (!running || stopping) {
 		instance_config_move(in, in_new);
 		instance_start(in);
 	} else {
@@ -967,6 +995,7 @@  instance_init(struct service_instance *in, struct service *s, struct blob_attr *
 	in->config = config;
 	in->timeout.cb = instance_timeout;
 	in->proc.cb = instance_exit;
+	in->term_timeout = 5;
 
 	in->_stdout.fd.fd = -2;
 	in->_stdout.stream.string_data = true;
@@ -999,6 +1028,7 @@  void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
 		blobmsg_add_u32(b, "pid", in->proc.pid);
 	if (in->command)
 		blobmsg_add_blob(b, in->command);
+	blobmsg_add_u32(b, "term_timeout", in->term_timeout);
 
 	if (!avl_is_empty(&in->errors.avl)) {
 		struct blobmsg_list_node *var;
diff --git a/service/instance.h b/service/instance.h
index 3cc2009..78999c8 100644
--- a/service/instance.h
+++ b/service/instance.h
@@ -59,6 +59,7 @@  struct service_instance {
 	char *seccomp;
 	char *pidfile;
 
+	uint32_t term_timeout;
 	uint32_t respawn_timeout;
 	uint32_t respawn_threshold;
 	uint32_t respawn_retry;
diff --git a/service/service.c b/service/service.c
index 2c73901..0584ee0 100644
--- a/service/service.c
+++ b/service/service.c
@@ -59,11 +59,10 @@  service_instance_update(struct vlist_tree *tree, struct vlist_node *node_new,
 		instance_update(in_o, in_n);
 		instance_free(in_n);
 	} else if (in_o) {
-		DEBUG(2, "Free instance %s::%s\n", in_o->srv->name, in_o->name);
+		DEBUG(2, "Stop instance %s::%s\n", in_o->srv->name, in_o->name);
 		instance_stop(in_o);
-		instance_free(in_o);
 	} else if (in_n) {
-		DEBUG(2, "Create instance %s::%s\n", in_n->srv->name, in_n->name);
+		DEBUG(2, "Start instance %s::%s\n", in_n->srv->name, in_n->name);
 		instance_start(in_n);
 	}
 	blob_buf_init(&b, 0);
@@ -80,7 +79,7 @@  service_alloc(const char *name)
 	strcpy(new_name, name);
 
 	vlist_init(&s->instances, avl_strcmp, service_instance_update);
-	s->instances.keep_old = true;
+	s->instances.no_delete = true;
 	s->name = new_name;
 	s->avl.key = s->name;
 	INIT_LIST_HEAD(&s->validators);
@@ -149,13 +148,8 @@  service_update(struct service *s, struct blob_attr **tb, bool add)
 static void
 service_delete(struct service *s)
 {
-	service_event("service.stop", s->name, NULL);
 	vlist_flush_all(&s->instances);
-	avl_delete(&services, &s->avl);
-	trigger_del(s);
-	free(s->trigger);
-	free(s);
-	service_validate_del(s);
+	service_stopped(s);
 }
 
 enum {
@@ -606,6 +600,18 @@  service_start_early(char *name, char *cmdline)
 	return service_handle_set(NULL, NULL, NULL, "add", b.head);
 }
 
+void service_stopped(struct service *s) /* release s once it has no instances left */
+{
+	if (avl_is_empty(&s->instances.avl)) { /* instances still present: keep the service */
+		service_event("service.stop", s->name, NULL);
+		avl_delete(&services, &s->avl);
+		trigger_del(s);
+		service_validate_del(s); /* takes s, so it has to run before s is freed */
+		free(s->trigger);
+		free(s); /* s is invalid from here on; nothing may use it afterwards */
+	}
+}
+
 void service_event(const char *type, const char *service, const char *instance)
 {
 	if (!ctx)
diff --git a/service/service.h b/service/service.h
index c3f2964..d4f0a83 100644
--- a/service/service.h
+++ b/service/service.h
@@ -50,7 +50,10 @@  void service_validate_add(struct service *s, struct blob_attr *attr);
 void service_validate_dump(struct blob_buf *b, struct service *s);
 void service_validate_dump_all(struct blob_buf *b, char *p, char *s);
 int service_start_early(char *name, char *cmdline);
+void service_stopped(struct service *s);
 void service_validate_del(struct service *s);
 void service_event(const char *type, const char *service, const char *instance);
 
+
+
 #endif