diff mbox series

[trivial,1/2] close_all_open_fd(): move to oslib-posix.c

Message ID 94fcee0d7595865b3a6fab744982ad47715e5faf.1706221377.git.mjt@tls.msk.ru
State New
Headers show
Series split out os_close_all_open_fd and use it in net/tap.c too | expand

Commit Message

Michael Tokarev Jan. 25, 2024, 10:29 p.m. UTC
Initially in async-teardown.c, but the same construct is used
elsewhere too.

Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
---
 include/sysemu/os-posix.h |  1 +
 system/async-teardown.c   | 37 +------------------------------------
 util/oslib-posix.c        | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 36 deletions(-)

Comments

Laurent Vivier Jan. 26, 2024, 7:44 a.m. UTC | #1
Le 25/01/2024 à 23:29, Michael Tokarev a écrit :
> Initially in async-teardown.c, but the same construct is used
> elsewhere too.
> 
> Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
> ---
>   include/sysemu/os-posix.h |  1 +
>   system/async-teardown.c   | 37 +------------------------------------
>   util/oslib-posix.c        | 36 ++++++++++++++++++++++++++++++++++++
>   3 files changed, 38 insertions(+), 36 deletions(-)
> 
> diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
> index dff32ae185..4c91d03f44 100644
> --- a/include/sysemu/os-posix.h
> +++ b/include/sysemu/os-posix.h
> @@ -53,6 +53,7 @@ bool os_set_runas(const char *user_id);
>   void os_set_chroot(const char *path);
>   void os_setup_post(void);
>   int os_mlock(void);
> +void os_close_all_open_fd(int minfd);
>   
>   /**
>    * qemu_alloc_stack:
> diff --git a/system/async-teardown.c b/system/async-teardown.c
> index 396963c091..41d3d94935 100644
> --- a/system/async-teardown.c
> +++ b/system/async-teardown.c
> @@ -26,40 +26,6 @@
>   
>   static pid_t the_ppid;
>   
> -/*
> - * Close all open file descriptors.
> - */
> -static void close_all_open_fd(void)
> -{
> -    struct dirent *de;
> -    int fd, dfd;
> -    DIR *dir;
> -
> -#ifdef CONFIG_CLOSE_RANGE
> -    int r = close_range(0, ~0U, 0);
> -    if (!r) {
> -        /* Success, no need to try other ways. */
> -        return;
> -    }
> -#endif
> -
> -    dir = opendir("/proc/self/fd");
> -    if (!dir) {
> -        /* If /proc is not mounted, there is nothing that can be done. */
> -        return;
> -    }
> -    /* Avoid closing the directory. */
> -    dfd = dirfd(dir);
> -
> -    for (de = readdir(dir); de; de = readdir(dir)) {
> -        fd = atoi(de->d_name);
> -        if (fd != dfd) {
> -            close(fd);
> -        }
> -    }
> -    closedir(dir);
> -}
> -
>   static void hup_handler(int signal)
>   {
>       /* Check every second if this process has been reparented. */
> @@ -85,9 +51,8 @@ static int async_teardown_fn(void *arg)
>       /*
>        * Close all file descriptors that might have been inherited from the
>        * main qemu process when doing clone, needed to make libvirt happy.
> -     * Not using close_range for increased compatibility with older kernels.
>        */
> -    close_all_open_fd();
> +    os_close_all_open_fd(0);
>   
>       /* Set up a handler for SIGHUP and unblock SIGHUP. */
>       sigaction(SIGHUP, &sa, NULL);
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 7c297003b9..09d0ce4da6 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -27,6 +27,7 @@
>    */
>   
>   #include "qemu/osdep.h"
> +#include <dirent.h>
>   #include <termios.h>
>   
>   #include <glib/gprintf.h>
> @@ -106,6 +107,41 @@ int qemu_get_thread_id(void)
>   #endif
>   }
>   
> +/*
> + * Close all open file descriptors starting with minfd and up.
> + * Not using close_range for increased compatibility with older kernels.
> + */
> +void os_close_all_open_fd(int minfd)
> +{
> +    struct dirent *de;
> +    int fd, dfd;
> +    DIR *dir;
> +
> +#ifdef CONFIG_CLOSE_RANGE
> +    int r = close_range(minfd, ~0U, 0);
> +    if (!r) {
> +        /* Success, no need to try other ways. */
> +        return;
> +    }
> +#endif
> +
> +    dir = opendir("/proc/self/fd");
> +    if (!dir) {
> +        /* If /proc is not mounted, there is nothing that can be done. */
> +        return;
> +    }
> +    /* Avoid closing the directory. */
> +    dfd = dirfd(dir);
> +
> +    for (de = readdir(dir); de; de = readdir(dir)) {
> +        fd = atoi(de->d_name);
> +        if (fd >= minfd && fd != dfd) {
> +            close(fd);
> +        }
> +    }
> +    closedir(dir);
> +}

I think the way using sysconf(_SC_OPEN_MAX) is more portable, simpler and cleaner than the one using 
/proc/self/fd.

Thanks,
Laurent
> +
>   int qemu_daemon(int nochdir, int noclose)
>   {
>       return daemon(nochdir, noclose);
Daniel P. Berrangé Jan. 26, 2024, 9:06 a.m. UTC | #2
On Fri, Jan 26, 2024 at 08:44:13AM +0100, Laurent Vivier wrote:
> Le 25/01/2024 à 23:29, Michael Tokarev a écrit :
> > Initially in async-teardown.c, but the same construct is used
> > elsewhere too.
> > 
> > Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
> > ---
> >   include/sysemu/os-posix.h |  1 +
> >   system/async-teardown.c   | 37 +------------------------------------
> >   util/oslib-posix.c        | 36 ++++++++++++++++++++++++++++++++++++
> >   3 files changed, 38 insertions(+), 36 deletions(-)
> > 
> > diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
> > index dff32ae185..4c91d03f44 100644
> > --- a/include/sysemu/os-posix.h
> > +++ b/include/sysemu/os-posix.h
> > @@ -53,6 +53,7 @@ bool os_set_runas(const char *user_id);
> >   void os_set_chroot(const char *path);
> >   void os_setup_post(void);
> >   int os_mlock(void);
> > +void os_close_all_open_fd(int minfd);
> >   /**
> >    * qemu_alloc_stack:
> > diff --git a/system/async-teardown.c b/system/async-teardown.c
> > index 396963c091..41d3d94935 100644
> > --- a/system/async-teardown.c
> > +++ b/system/async-teardown.c
> > @@ -26,40 +26,6 @@
> >   static pid_t the_ppid;
> > -/*
> > - * Close all open file descriptors.
> > - */
> > -static void close_all_open_fd(void)
> > -{
> > -    struct dirent *de;
> > -    int fd, dfd;
> > -    DIR *dir;
> > -
> > -#ifdef CONFIG_CLOSE_RANGE
> > -    int r = close_range(0, ~0U, 0);
> > -    if (!r) {
> > -        /* Success, no need to try other ways. */
> > -        return;
> > -    }
> > -#endif
> > -
> > -    dir = opendir("/proc/self/fd");
> > -    if (!dir) {
> > -        /* If /proc is not mounted, there is nothing that can be done. */
> > -        return;
> > -    }
> > -    /* Avoid closing the directory. */
> > -    dfd = dirfd(dir);
> > -
> > -    for (de = readdir(dir); de; de = readdir(dir)) {
> > -        fd = atoi(de->d_name);
> > -        if (fd != dfd) {
> > -            close(fd);
> > -        }
> > -    }
> > -    closedir(dir);
> > -}
> > -
> >   static void hup_handler(int signal)
> >   {
> >       /* Check every second if this process has been reparented. */
> > @@ -85,9 +51,8 @@ static int async_teardown_fn(void *arg)
> >       /*
> >        * Close all file descriptors that might have been inherited from the
> >        * main qemu process when doing clone, needed to make libvirt happy.
> > -     * Not using close_range for increased compatibility with older kernels.
> >        */
> > -    close_all_open_fd();
> > +    os_close_all_open_fd(0);
> >       /* Set up a handler for SIGHUP and unblock SIGHUP. */
> >       sigaction(SIGHUP, &sa, NULL);
> > diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> > index 7c297003b9..09d0ce4da6 100644
> > --- a/util/oslib-posix.c
> > +++ b/util/oslib-posix.c
> > @@ -27,6 +27,7 @@
> >    */
> >   #include "qemu/osdep.h"
> > +#include <dirent.h>
> >   #include <termios.h>
> >   #include <glib/gprintf.h>
> > @@ -106,6 +107,41 @@ int qemu_get_thread_id(void)
> >   #endif
> >   }
> > +/*
> > + * Close all open file descriptors starting with minfd and up.
> > + * Not using close_range for increased compatibility with older kernels.
> > + */
> > +void os_close_all_open_fd(int minfd)
> > +{
> > +    struct dirent *de;
> > +    int fd, dfd;
> > +    DIR *dir;
> > +
> > +#ifdef CONFIG_CLOSE_RANGE
> > +    int r = close_range(minfd, ~0U, 0);
> > +    if (!r) {
> > +        /* Success, no need to try other ways. */
> > +        return;
> > +    }
> > +#endif
> > +
> > +    dir = opendir("/proc/self/fd");
> > +    if (!dir) {
> > +        /* If /proc is not mounted, there is nothing that can be done. */
> > +        return;
> > +    }
> > +    /* Avoid closing the directory. */
> > +    dfd = dirfd(dir);
> > +
> > +    for (de = readdir(dir); de; de = readdir(dir)) {
> > +        fd = atoi(de->d_name);
> > +        if (fd >= minfd && fd != dfd) {
> > +            close(fd);
> > +        }
> > +    }
> > +    closedir(dir);
> > +}
> 
> I think the way using sysconf(_SC_OPEN_MAX) is more portable, simpler and
> cleaner than the one using /proc/self/fd.

A fallback that uses _SC_OPEN_MAX is good for portability, but it is
should not be considered a replacement for iterating over /proc/self/fd,
rather an additional fallback for non-Linux, or when /proc is not mounted.
It is not uncommon for _SC_OPEN_MAX to be *exceedingly* high

  $ podman run -it quay.io/centos/centos:stream9
  [root@4a440d62935c /]# ulimit -n
  524288

Iterating over 1/2 a million FDs is a serious performance penalty that
we don't want to have, so _SC_OPEN_MAX should always be the last resort.

With regards,
Daniel
Michael Tokarev Jan. 26, 2024, 10:45 a.m. UTC | #3
26.01.2024 12:06, Daniel P. Berrangé wrote:
> On Fri, Jan 26, 2024 at 08:44:13AM +0100, Laurent Vivier wrote:
>> Le 25/01/2024 à 23:29, Michael Tokarev a écrit :


>> I think the way using sysconf(_SC_OPEN_MAX) is more portable, simpler and
>> cleaner than the one using /proc/self/fd.
> 
> A fallback that uses _SC_OPEN_MAX is good for portability, but it is
> should not be considered a replacement for iterating over /proc/self/fd,
> rather an additional fallback for non-Linux, or when /proc is not mounted.
> It is not uncommon for _SC_OPEN_MAX to be *exceedingly* high
> 
>    $ podman run -it quay.io/centos/centos:stream9
>    [root@4a440d62935c /]# ulimit -n
>    524288
> 
> Iterating over 1/2 a million FDs is a serious performance penalty that
> we don't want to have, so _SC_OPEN_MAX should always be the last resort.

 From yesterday conversation in IRC which started this:

  <mmlb> open files          (-n) 1073741816

(it is a docker container)
They weren't able to start qemu.. :)

Sanity of such setting is questionable, but ok.

Not only linux implement close_range(2) syscall, it is also
available on some *BSDs.

And the most important point is, - we should aim at using O_CLOEXEC
everywhere, without this need to close each FD at exec time.  I think
qemu is the only software with such paranoid closing when just running
an interface setup script..

So yes, loop though all FDs is okay too as a last resort but..
For scripts in net/tap.c, this isn't necessary at all.  I want to take
a look at all open(2)/socket(2)/etc calls in qemu to ensure they're all
using O_CLOEXEC or are closed promptly, after which this code can be
removed entirely, hopefully.  Maybe this patch wont be needed after
that (so only async-teardown will need that code since it doesn't
do exec()).

Thanks,

/mjt
Daniel P. Berrangé Jan. 26, 2024, 11:01 a.m. UTC | #4
On Fri, Jan 26, 2024 at 01:45:39PM +0300, Michael Tokarev wrote:
> 26.01.2024 12:06, Daniel P. Berrangé wrote:
> > On Fri, Jan 26, 2024 at 08:44:13AM +0100, Laurent Vivier wrote:
> > > Le 25/01/2024 à 23:29, Michael Tokarev a écrit :
> 
> 
> > > I think the way using sysconf(_SC_OPEN_MAX) is more portable, simpler and
> > > cleaner than the one using /proc/self/fd.
> > 
> > A fallback that uses _SC_OPEN_MAX is good for portability, but it is
> > should not be considered a replacement for iterating over /proc/self/fd,
> > rather an additional fallback for non-Linux, or when /proc is not mounted.
> > It is not uncommon for _SC_OPEN_MAX to be *exceedingly* high
> > 
> >    $ podman run -it quay.io/centos/centos:stream9
> >    [root@4a440d62935c /]# ulimit -n
> >    524288
> > 
> > Iterating over 1/2 a million FDs is a serious performance penalty that
> > we don't want to have, so _SC_OPEN_MAX should always be the last resort.
> 
> From yesterday conversation in IRC which started this:
> 
>  <mmlb> open files          (-n) 1073741816
> 
> (it is a docker container)
> They weren't able to start qemu.. :)
> 
> Sanity of such setting is questionable, but ok.
> 
> Not only linux implement close_range(2) syscall, it is also
> available on some *BSDs.
> 
> And the most important point is, - we should aim at using O_CLOEXEC
> everywhere, without this need to close each FD at exec time.  I think
> qemu is the only software with such paranoid closing when just running
> an interface setup script..

We should try to use O_CLOEXEC everywhere, but at the same time QEMU
links to a large number of libraries, and we can't assume that they've
reliably used O_CLOEXEC. Non-QEMU owned code that is mapped in process
likely dwarfs QEMU owned code by a factor of x10.

With regards,
Daniel
Michael Tokarev Jan. 26, 2024, 12:05 p.m. UTC | #5
26.01.2024 14:01, Daniel P. Berrangé:
[]
> We should try to use O_CLOEXEC everywhere, but at the same time QEMU
> links to a large number of libraries, and we can't assume that they've
> reliably used O_CLOEXEC. Non-QEMU owned code that is mapped in process
> likely dwarfs QEMU owned code by a factor of x10.

There are quite a few points here.

As I already mentioned, qemu is one of very few software out here which
is this paranoid, - I know no other software which does this.  External
libs are being fixed too, and that's the proper place to do that.

Please note that currently we only close all files when executing scripts
to setup/teardown tap interfaces, but not, say, when spawning a process
to receive migration stream and in some other places, where such closing
might be much more important.

This close_all_open_fd() can check all FDs it finds open for O_CLOEXEC
as a debugging aid, - maybe we missed something in qemu already.  After
it's done, we'll have much better confidence already.

And something else I forgot to mention :)

/mjt
diff mbox series

Patch

diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
index dff32ae185..4c91d03f44 100644
--- a/include/sysemu/os-posix.h
+++ b/include/sysemu/os-posix.h
@@ -53,6 +53,7 @@  bool os_set_runas(const char *user_id);
 void os_set_chroot(const char *path);
 void os_setup_post(void);
 int os_mlock(void);
+void os_close_all_open_fd(int minfd);
 
 /**
  * qemu_alloc_stack:
diff --git a/system/async-teardown.c b/system/async-teardown.c
index 396963c091..41d3d94935 100644
--- a/system/async-teardown.c
+++ b/system/async-teardown.c
@@ -26,40 +26,6 @@ 
 
 static pid_t the_ppid;
 
-/*
- * Close all open file descriptors.
- */
-static void close_all_open_fd(void)
-{
-    struct dirent *de;
-    int fd, dfd;
-    DIR *dir;
-
-#ifdef CONFIG_CLOSE_RANGE
-    int r = close_range(0, ~0U, 0);
-    if (!r) {
-        /* Success, no need to try other ways. */
-        return;
-    }
-#endif
-
-    dir = opendir("/proc/self/fd");
-    if (!dir) {
-        /* If /proc is not mounted, there is nothing that can be done. */
-        return;
-    }
-    /* Avoid closing the directory. */
-    dfd = dirfd(dir);
-
-    for (de = readdir(dir); de; de = readdir(dir)) {
-        fd = atoi(de->d_name);
-        if (fd != dfd) {
-            close(fd);
-        }
-    }
-    closedir(dir);
-}
-
 static void hup_handler(int signal)
 {
     /* Check every second if this process has been reparented. */
@@ -85,9 +51,8 @@  static int async_teardown_fn(void *arg)
     /*
      * Close all file descriptors that might have been inherited from the
      * main qemu process when doing clone, needed to make libvirt happy.
-     * Not using close_range for increased compatibility with older kernels.
      */
-    close_all_open_fd();
+    os_close_all_open_fd(0);
 
     /* Set up a handler for SIGHUP and unblock SIGHUP. */
     sigaction(SIGHUP, &sa, NULL);
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 7c297003b9..09d0ce4da6 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -27,6 +27,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include <dirent.h>
 #include <termios.h>
 
 #include <glib/gprintf.h>
@@ -106,6 +107,41 @@  int qemu_get_thread_id(void)
 #endif
 }
 
+/*
+ * Close all open file descriptors starting with minfd and up.
+ * Not using close_range for increased compatibility with older kernels.
+ */
+void os_close_all_open_fd(int minfd)
+{
+    struct dirent *de;
+    int fd, dfd;
+    DIR *dir;
+
+#ifdef CONFIG_CLOSE_RANGE
+    int r = close_range(minfd, ~0U, 0);
+    if (!r) {
+        /* Success, no need to try other ways. */
+        return;
+    }
+#endif
+
+    dir = opendir("/proc/self/fd");
+    if (!dir) {
+        /* If /proc is not mounted, there is nothing that can be done. */
+        return;
+    }
+    /* Avoid closing the directory. */
+    dfd = dirfd(dir);
+
+    for (de = readdir(dir); de; de = readdir(dir)) {
+        fd = atoi(de->d_name);
+        if (fd >= minfd && fd != dfd) {
+            close(fd);
+        }
+    }
+    closedir(dir);
+}
+
 int qemu_daemon(int nochdir, int noclose)
 {
     return daemon(nochdir, noclose);