Message ID | 20230201211055.649442-4-peterx@redhat.com |
---|---|
State | New |
Headers | show |
Series | util/userfaultfd: Support /dev/userfaultfd | expand |
Peter Xu <peterx@redhat.com> wrote: > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the > system call if either it's not there or doesn't have enough permission. > > Firstly, as long as the app has permission to access /dev/userfaultfd, it > always have the ability to trap kernel faults which QEMU mostly wants. > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be > forbidden, so it can be the major way to use postcopy in a restricted > environment with strict seccomp setup. > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > Signed-off-by: Peter Xu <peterx@redhat.com> Hi Can we change this code to not use the global variable. > --- > util/trace-events | 1 + > util/userfaultfd.c | 37 +++++++++++++++++++++++++++++++++++++ > 2 files changed, 38 insertions(+) > > diff --git a/util/trace-events b/util/trace-events > index c8f53d7d9f..16f78d8fe5 100644 > --- a/util/trace-events > +++ b/util/trace-events > @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz > qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" > > #userfaultfd.c > +uffd_detect_open_mode(int mode) "%d" > uffd_query_features_nosys(int err) "errno: %i" > uffd_query_features_api_failed(int err) "errno: %i" > uffd_create_fd_nosys(int err) "errno: %i" > diff --git a/util/userfaultfd.c b/util/userfaultfd.c > index 9845a2ec81..7dceab51d6 100644 > --- a/util/userfaultfd.c > +++ b/util/userfaultfd.c > @@ -18,10 +18,47 @@ > #include <poll.h> > #include <sys/syscall.h> > #include <sys/ioctl.h> > +#include <fcntl.h> > + > +typedef enum { > + UFFD_UNINITIALIZED = 0, > + UFFD_USE_DEV_PATH, > + UFFD_USE_SYSCALL, > +} uffd_open_mode; > + > +static int uffd_dev; > + > +static uffd_open_mode uffd_detect_open_mode(void) > +{ > + static uffd_open_mode open_mode; > + > + if (open_mode == UFFD_UNINITIALIZED) { > + /* > + * Make /dev/userfaultfd the default approach because it has better > + * permission controls, meanwhile allows kernel faults without any > + * privilege requirement (e.g. SYS_CAP_PTRACE). > + */ > + uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); > + if (uffd_dev >= 0) { > + open_mode = UFFD_USE_DEV_PATH; > + } else { > + /* Fallback to the system call */ > + open_mode = UFFD_USE_SYSCALL; > + } > + trace_uffd_detect_open_mode(open_mode); > + } > + > + return open_mode; > +} > > int uffd_open(int flags) > { > #if defined(__linux__) && defined(__NR_userfaultfd) > + if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) { > + assert(uffd_dev >= 0); > + return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags); > + } > + > return syscall(__NR_userfaultfd, flags); > #else > return -EINVAL; static int open_userfaultd(void) { /* * Make /dev/userfaultfd the default approach because it has better * permission controls, meanwhile allows kernel faults without any * privilege requirement (e.g. SYS_CAP_PTRACE). */ int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); if (uffd >= 0) { return uffd; } return -1; } int uffd_open(int flags) { #if defined(__linux__) && defined(__NR_userfaultfd) static int uffd = -2; if (uffd == -2) { uffd = open_userfaultd(); } if (uffd >= 0) { return ioctl(uffd, USERFAULTFD_IOC_NEW, flags); } return syscall(__NR_userfaultfd, flags); #else return -EINVAL; 27 lines vs 42 No need for enum type No need for global variable What do you think? Later, Juan.
On Thu, Feb 02, 2023 at 11:52:21AM +0100, Juan Quintela wrote: > Peter Xu <peterx@redhat.com> wrote: > > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the > > system call if either it's not there or doesn't have enough permission. > > > > Firstly, as long as the app has permission to access /dev/userfaultfd, it > > always have the ability to trap kernel faults which QEMU mostly wants. > > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be > > forbidden, so it can be the major way to use postcopy in a restricted > > environment with strict seccomp setup. > > > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > Hi Hi, Juan, > > Can we change this code to not use the global variable. > > > --- > > util/trace-events | 1 + > > util/userfaultfd.c | 37 +++++++++++++++++++++++++++++++++++++ > > 2 files changed, 38 insertions(+) > > > > diff --git a/util/trace-events b/util/trace-events > > index c8f53d7d9f..16f78d8fe5 100644 > > --- a/util/trace-events > > +++ b/util/trace-events > > @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz > > qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" > > > > #userfaultfd.c > > +uffd_detect_open_mode(int mode) "%d" > > uffd_query_features_nosys(int err) "errno: %i" > > uffd_query_features_api_failed(int err) "errno: %i" > > uffd_create_fd_nosys(int err) "errno: %i" > > diff --git a/util/userfaultfd.c b/util/userfaultfd.c > > index 9845a2ec81..7dceab51d6 100644 > > --- a/util/userfaultfd.c > > +++ b/util/userfaultfd.c > > @@ -18,10 +18,47 @@ > > #include <poll.h> > > #include <sys/syscall.h> > > #include <sys/ioctl.h> > > +#include <fcntl.h> > > + > > +typedef enum { > > + UFFD_UNINITIALIZED = 0, > > + UFFD_USE_DEV_PATH, > > + UFFD_USE_SYSCALL, > > +} uffd_open_mode; > > + > > +static int uffd_dev; > > + > > +static uffd_open_mode uffd_detect_open_mode(void) > > +{ > > + static uffd_open_mode open_mode; > > + > > + if (open_mode == UFFD_UNINITIALIZED) { > > + /* > > + * Make /dev/userfaultfd the default approach because it has better > > + * permission controls, meanwhile allows kernel faults without any > > + * privilege requirement (e.g. SYS_CAP_PTRACE). > > + */ > > + uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); > > + if (uffd_dev >= 0) { > > + open_mode = UFFD_USE_DEV_PATH; > > + } else { > > + /* Fallback to the system call */ > > + open_mode = UFFD_USE_SYSCALL; > > + } > > + trace_uffd_detect_open_mode(open_mode); > > + } > > + > > + return open_mode; > > +} > > > > int uffd_open(int flags) > > { > > #if defined(__linux__) && defined(__NR_userfaultfd) > > + if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) { > > + assert(uffd_dev >= 0); > > + return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags); > > + } > > + > > return syscall(__NR_userfaultfd, flags); > > #else > > return -EINVAL; > > static int open_userfaultd(void) > { > /* > * Make /dev/userfaultfd the default approach because it has better > * permission controls, meanwhile allows kernel faults without any > * privilege requirement (e.g. SYS_CAP_PTRACE). > */ > int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); > if (uffd >= 0) { > return uffd; > } > return -1; > } > > int uffd_open(int flags) > { > #if defined(__linux__) && defined(__NR_userfaultfd) > static int uffd = -2; > if (uffd == -2) { > uffd = open_userfaultd(); > } > if (uffd >= 0) { > return ioctl(uffd, USERFAULTFD_IOC_NEW, flags); > } > return syscall(__NR_userfaultfd, flags); > #else > return -EINVAL; > > 27 lines vs 42 > > No need for enum type > No need for global variable > > What do you think? Yes, as I used to reply to Phil I think it can be simplified. I did this major for (1) better readability, and (2) being crystal clear on which way we used to open /dev/userfaultfd, then guarantee we're keeping using it. so at least I prefer keeping things like trace_uffd_detect_open_mode(). I also plan to add another mode when fd-mode is there even if it'll reuse the same USERFAULTFD_IOC_NEW; they can be useful information when a failure happens. Though if you insist, I can switch to the simple version too.
Peter Xu <peterx@redhat.com> wrote: > On Thu, Feb 02, 2023 at 11:52:21AM +0100, Juan Quintela wrote: >> Peter Xu <peterx@redhat.com> wrote: >> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the >> > system call if either it's not there or doesn't have enough permission. >> > >> > Firstly, as long as the app has permission to access /dev/userfaultfd, it >> > always have the ability to trap kernel faults which QEMU mostly wants. >> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be >> > forbidden, so it can be the major way to use postcopy in a restricted >> > environment with strict seccomp setup. >> > >> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> >> > Signed-off-by: Peter Xu <peterx@redhat.com> >> >> >> Hi > > Hi, Juan, >> static int open_userfaultd(void) >> { >> /* >> * Make /dev/userfaultfd the default approach because it has better >> * permission controls, meanwhile allows kernel faults without any >> * privilege requirement (e.g. SYS_CAP_PTRACE). >> */ >> int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); >> if (uffd >= 0) { >> return uffd; >> } >> return -1; >> } >> >> int uffd_open(int flags) >> { >> #if defined(__linux__) && defined(__NR_userfaultfd) Just an incise, checkpatch don't liue that you use __linux__ This file is compiled under CONFIG_LINUX, so you can drop it. >> static int uffd = -2; >> if (uffd == -2) { >> uffd = open_userfaultd(); >> } >> if (uffd >= 0) { >> return ioctl(uffd, USERFAULTFD_IOC_NEW, flags); >> } >> return syscall(__NR_userfaultfd, flags); >> #else >> return -EINVAL; >> >> 27 lines vs 42 >> >> No need for enum type >> No need for global variable >> >> What do you think? > > Yes, as I used to reply to Phil I think it can be simplified. I did this > major for (1) better readability, and (2) being crystal clear on which way > we used to open /dev/userfaultfd, then guarantee we're keeping using it. so > at least I prefer keeping things like trace_uffd_detect_open_mode(). The trace is ok for me. I just forgot to copy it on the rework, sorry. > I also plan to add another mode when fd-mode is there even if it'll reuse > the same USERFAULTFD_IOC_NEW; they can be useful information when a failure > happens. The other fd mode will change the uffd. What I *kind* of object is: - Using a global variable when it is not needed i.e. for me using a global variable means that anything else is worse. Not the case IMHO. - Call uffd_open_mode() for every call, when we know that it can change, it is going to return always the same value, so cache it. > Though if you insist, I can switch to the simple version too. I always told that the person who did the patch has the last word on style. I preffer my version, but it is up to you to take it or not. Later, Juan.
On Fri, Feb 03, 2023 at 10:01:04PM +0100, Juan Quintela wrote: > Peter Xu <peterx@redhat.com> wrote: > > On Thu, Feb 02, 2023 at 11:52:21AM +0100, Juan Quintela wrote: > >> Peter Xu <peterx@redhat.com> wrote: > >> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the > >> > system call if either it's not there or doesn't have enough permission. > >> > > >> > Firstly, as long as the app has permission to access /dev/userfaultfd, it > >> > always have the ability to trap kernel faults which QEMU mostly wants. > >> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be > >> > forbidden, so it can be the major way to use postcopy in a restricted > >> > environment with strict seccomp setup. > >> > > >> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > >> > Signed-off-by: Peter Xu <peterx@redhat.com> > >> > >> > >> Hi > > > > Hi, Juan, > > > >> static int open_userfaultd(void) > >> { > >> /* > >> * Make /dev/userfaultfd the default approach because it has better > >> * permission controls, meanwhile allows kernel faults without any > >> * privilege requirement (e.g. SYS_CAP_PTRACE). > >> */ > >> int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); > >> if (uffd >= 0) { > >> return uffd; > >> } > >> return -1; > >> } > >> > >> int uffd_open(int flags) > >> { > >> #if defined(__linux__) && defined(__NR_userfaultfd) > > Just an incise, checkpatch don't liue that you use __linux__ > > This file is compiled under CONFIG_LINUX, so you can drop it. Yes indeed. I'll drop it. > > >> static int uffd = -2; > >> if (uffd == -2) { > >> uffd = open_userfaultd(); > >> } > >> if (uffd >= 0) { > >> return ioctl(uffd, USERFAULTFD_IOC_NEW, flags); > >> } > >> return syscall(__NR_userfaultfd, flags); > >> #else > >> return -EINVAL; > >> > >> 27 lines vs 42 > >> > >> No need for enum type > >> No need for global variable > >> > >> What do you think? > > > > Yes, as I used to reply to Phil I think it can be simplified. I did this > > major for (1) better readability, and (2) being crystal clear on which way > > we used to open /dev/userfaultfd, then guarantee we're keeping using it. so > > at least I prefer keeping things like trace_uffd_detect_open_mode(). > > The trace is ok for me. I just forgot to copy it on the rework, sorry. > > > I also plan to add another mode when fd-mode is there even if it'll reuse > > the same USERFAULTFD_IOC_NEW; they can be useful information when a failure > > happens. > > The other fd mode will change the uffd. > > What I *kind* of object is: > - Using a global variable when it is not needed > i.e. for me using a global variable means that anything else is worse. > Not the case IMHO. IMHO globals are evil when they're used in multiple places; that's bad to readability. Here it's not the case because it's set once and for all. I wanted to have an easy and clear way to peek what's the mode chosen even without tracing enabled (e.g. from a dump or a live process). > - Call uffd_open_mode() for every call, when we know that it can change, > it is going to return always the same value, so cache it. uffd_detect_open_mode() caches the result already? Or maybe you meant something else? > > > Though if you insist, I can switch to the simple version too. > > I always told that the person who did the patch has the last word on > style. I preffer my version, but it is up to you to take it or not. Thanks,
Peter Xu <peterx@redhat.com> wrote: > On Fri, Feb 03, 2023 at 10:01:04PM +0100, Juan Quintela wrote: >> Peter Xu <peterx@redhat.com> wrote: >> > On Thu, Feb 02, 2023 at 11:52:21AM +0100, Juan Quintela wrote: >> >> Peter Xu <peterx@redhat.com> wrote: >> >> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the >> >> > system call if either it's not there or doesn't have enough permission. >> >> > >> >> > Firstly, as long as the app has permission to access /dev/userfaultfd, it >> >> > always have the ability to trap kernel faults which QEMU mostly wants. >> >> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be >> >> > forbidden, so it can be the major way to use postcopy in a restricted >> >> > environment with strict seccomp setup. >> >> > >> >> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> >> >> > Signed-off-by: Peter Xu <peterx@redhat.com> >> >> >> >> >> >> Hi >> > >> > Hi, Juan, >> >> >> >> static int open_userfaultd(void) >> >> { >> >> /* >> >> * Make /dev/userfaultfd the default approach because it has better >> >> * permission controls, meanwhile allows kernel faults without any >> >> * privilege requirement (e.g. SYS_CAP_PTRACE). >> >> */ >> >> int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); >> >> if (uffd >= 0) { >> >> return uffd; >> >> } >> >> return -1; >> >> } >> >> >> >> int uffd_open(int flags) >> >> { >> >> #if defined(__linux__) && defined(__NR_userfaultfd) >> >> Just an incise, checkpatch don't liue that you use __linux__ >> >> This file is compiled under CONFIG_LINUX, so you can drop it. > > Yes indeed. I'll drop it. > >> >> >> static int uffd = -2; >> >> if (uffd == -2) { >> >> uffd = open_userfaultd(); >> >> } >> >> if (uffd >= 0) { >> >> return ioctl(uffd, USERFAULTFD_IOC_NEW, flags); >> >> } >> >> return syscall(__NR_userfaultfd, flags); >> >> #else >> >> return -EINVAL; >> >> >> >> 27 lines vs 42 >> >> >> >> No need for enum type >> >> No need for global variable >> >> >> >> What do you think? >> > >> > Yes, as I used to reply to Phil I think it can be simplified. I did this >> > major for (1) better readability, and (2) being crystal clear on which way >> > we used to open /dev/userfaultfd, then guarantee we're keeping using it. so >> > at least I prefer keeping things like trace_uffd_detect_open_mode(). >> >> The trace is ok for me. I just forgot to copy it on the rework, sorry. >> >> > I also plan to add another mode when fd-mode is there even if it'll reuse >> > the same USERFAULTFD_IOC_NEW; they can be useful information when a failure >> > happens. >> >> The other fd mode will change the uffd. >> >> What I *kind* of object is: >> - Using a global variable when it is not needed >> i.e. for me using a global variable means that anything else is worse. >> Not the case IMHO. > > IMHO globals are evil when they're used in multiple places; that's bad to > readability. Here it's not the case because it's set once and for > all. That is part of the problem. int foo; I need to search all the code to see where it is used. int bar(...) { static int foo; .... } I am really sure that: - foo value is preserved between calls - it is not used anywhere else without a single grep through the code. > I > wanted to have an easy and clear way to peek what's the mode chosen even > without tracing enabled (e.g. from a dump or a live process). I haven't thought about this. But you want something different than this? (fada)$ cat /tmp/kk.c int bar(void) { static int foo = 42; return foo++; } int main(void) { int a = 7 + 1; return a + bar(); } (fada)$ gcc -Wall /tmp/kk.c -o /tmp/kkk -g (fada)$ gdb /tmp/kkk (gdb) b main Breakpoint 1 at 0x401123: file /tmp/kk.c, line 10. (gdb) p bar::foo $1 = 42 (gdb) And yes, I have to search how this is done O:-) >> - Call uffd_open_mode() for every call, when we know that it can change, >> it is going to return always the same value, so cache it. > > uffd_detect_open_mode() caches the result already? Or maybe you meant > something else? What I did. Only call the equilavent function once. You are calling it every time that uffd_open() is called. > >> >> > Though if you insist, I can switch to the simple version too. >> >> I always told that the person who did the patch has the last word on >> style. I preffer my version, but it is up to you to take it or not. > > Thanks, Later, Juan.
diff --git a/util/trace-events b/util/trace-events index c8f53d7d9f..16f78d8fe5 100644 --- a/util/trace-events +++ b/util/trace-events @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" #userfaultfd.c +uffd_detect_open_mode(int mode) "%d" uffd_query_features_nosys(int err) "errno: %i" uffd_query_features_api_failed(int err) "errno: %i" uffd_create_fd_nosys(int err) "errno: %i" diff --git a/util/userfaultfd.c b/util/userfaultfd.c index 9845a2ec81..7dceab51d6 100644 --- a/util/userfaultfd.c +++ b/util/userfaultfd.c @@ -18,10 +18,47 @@ #include <poll.h> #include <sys/syscall.h> #include <sys/ioctl.h> +#include <fcntl.h> + +typedef enum { + UFFD_UNINITIALIZED = 0, + UFFD_USE_DEV_PATH, + UFFD_USE_SYSCALL, +} uffd_open_mode; + +static int uffd_dev; + +static uffd_open_mode uffd_detect_open_mode(void) +{ + static uffd_open_mode open_mode; + + if (open_mode == UFFD_UNINITIALIZED) { + /* + * Make /dev/userfaultfd the default approach because it has better + * permission controls, meanwhile allows kernel faults without any + * privilege requirement (e.g. SYS_CAP_PTRACE). + */ + uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (uffd_dev >= 0) { + open_mode = UFFD_USE_DEV_PATH; + } else { + /* Fallback to the system call */ + open_mode = UFFD_USE_SYSCALL; + } + trace_uffd_detect_open_mode(open_mode); + } + + return open_mode; +} int uffd_open(int flags) { #if defined(__linux__) && defined(__NR_userfaultfd) + if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) { + assert(uffd_dev >= 0); + return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags); + } + return syscall(__NR_userfaultfd, flags); #else return -EINVAL;