diff mbox series

[v2,3/3] util/userfaultfd: Support /dev/userfaultfd

Message ID 20230201211055.649442-4-peterx@redhat.com
State New
Headers show
Series util/userfaultfd: Support /dev/userfaultfd | expand

Commit Message

Peter Xu Feb. 1, 2023, 9:10 p.m. UTC
Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
system call if either it's not there or doesn't have enough permission.

Firstly, as long as the app has permission to access /dev/userfaultfd, it
always have the ability to trap kernel faults which QEMU mostly wants.
Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
forbidden, so it can be the major way to use postcopy in a restricted
environment with strict seccomp setup.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 util/trace-events  |  1 +
 util/userfaultfd.c | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

Comments

Juan Quintela Feb. 2, 2023, 10:52 a.m. UTC | #1
Peter Xu <peterx@redhat.com> wrote:
> Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
> system call if either it's not there or doesn't have enough permission.
>
> Firstly, as long as the app has permission to access /dev/userfaultfd, it
> always have the ability to trap kernel faults which QEMU mostly wants.
> Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
> forbidden, so it can be the major way to use postcopy in a restricted
> environment with strict seccomp setup.
>
> Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> Signed-off-by: Peter Xu <peterx@redhat.com>


Hi

Can we change this code to not use the global variable.

> ---
>  util/trace-events  |  1 +
>  util/userfaultfd.c | 37 +++++++++++++++++++++++++++++++++++++
>  2 files changed, 38 insertions(+)
>
> diff --git a/util/trace-events b/util/trace-events
> index c8f53d7d9f..16f78d8fe5 100644
> --- a/util/trace-events
> +++ b/util/trace-events
> @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
>  qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
>  
>  #userfaultfd.c
> +uffd_detect_open_mode(int mode) "%d"
>  uffd_query_features_nosys(int err) "errno: %i"
>  uffd_query_features_api_failed(int err) "errno: %i"
>  uffd_create_fd_nosys(int err) "errno: %i"
> diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> index 9845a2ec81..7dceab51d6 100644
> --- a/util/userfaultfd.c
> +++ b/util/userfaultfd.c
> @@ -18,10 +18,47 @@
>  #include <poll.h>
>  #include <sys/syscall.h>
>  #include <sys/ioctl.h>
> +#include <fcntl.h>
> +
> +typedef enum {
> +    UFFD_UNINITIALIZED = 0,
> +    UFFD_USE_DEV_PATH,
> +    UFFD_USE_SYSCALL,
> +} uffd_open_mode;
> +
> +static int uffd_dev;
> +
> +static uffd_open_mode uffd_detect_open_mode(void)
> +{
> +    static uffd_open_mode open_mode;
> +
> +    if (open_mode == UFFD_UNINITIALIZED) {
> +        /*
> +         * Make /dev/userfaultfd the default approach because it has better
> +         * permission controls, meanwhile allows kernel faults without any
> +         * privilege requirement (e.g. SYS_CAP_PTRACE).
> +         */
> +        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> +        if (uffd_dev >= 0) {
> +            open_mode = UFFD_USE_DEV_PATH;
> +        } else {
> +            /* Fallback to the system call */
> +            open_mode = UFFD_USE_SYSCALL;
> +        }
> +        trace_uffd_detect_open_mode(open_mode);
> +    }
> +
> +    return open_mode;
> +}
>  
>  int uffd_open(int flags)
>  {
>  #if defined(__linux__) && defined(__NR_userfaultfd)
> +    if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) {
> +        assert(uffd_dev >= 0);
> +        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
> +    }
> +
>      return syscall(__NR_userfaultfd, flags);
>  #else
>      return -EINVAL;

static int open_userfaultd(void)
{
    /*
     * Make /dev/userfaultfd the default approach because it has better
     * permission controls, meanwhile allows kernel faults without any
     * privilege requirement (e.g. SYS_CAP_PTRACE).
     */
     int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
     if (uffd >= 0) {
            return uffd;
     }
     return -1;
}

int uffd_open(int flags)
{
#if defined(__linux__) && defined(__NR_userfaultfd)
    static int uffd = -2;
    if (uffd == -2) {
        uffd = open_userfaultd();
    }
    if (uffd >= 0) {
        return ioctl(uffd, USERFAULTFD_IOC_NEW, flags);
    }
    return syscall(__NR_userfaultfd, flags);
#else
     return -EINVAL;

27 lines vs 42

No need for enum type
No need for global variable

What do you think?

Later, Juan.
Peter Xu Feb. 2, 2023, 8:41 p.m. UTC | #2
On Thu, Feb 02, 2023 at 11:52:21AM +0100, Juan Quintela wrote:
> Peter Xu <peterx@redhat.com> wrote:
> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
> > system call if either it's not there or doesn't have enough permission.
> >
> > Firstly, as long as the app has permission to access /dev/userfaultfd, it
> > always have the ability to trap kernel faults which QEMU mostly wants.
> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
> > forbidden, so it can be the major way to use postcopy in a restricted
> > environment with strict seccomp setup.
> >
> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> 
> Hi

Hi, Juan,

> 
> Can we change this code to not use the global variable.
> 
> > ---
> >  util/trace-events  |  1 +
> >  util/userfaultfd.c | 37 +++++++++++++++++++++++++++++++++++++
> >  2 files changed, 38 insertions(+)
> >
> > diff --git a/util/trace-events b/util/trace-events
> > index c8f53d7d9f..16f78d8fe5 100644
> > --- a/util/trace-events
> > +++ b/util/trace-events
> > @@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
> >  qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
> >  
> >  #userfaultfd.c
> > +uffd_detect_open_mode(int mode) "%d"
> >  uffd_query_features_nosys(int err) "errno: %i"
> >  uffd_query_features_api_failed(int err) "errno: %i"
> >  uffd_create_fd_nosys(int err) "errno: %i"
> > diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> > index 9845a2ec81..7dceab51d6 100644
> > --- a/util/userfaultfd.c
> > +++ b/util/userfaultfd.c
> > @@ -18,10 +18,47 @@
> >  #include <poll.h>
> >  #include <sys/syscall.h>
> >  #include <sys/ioctl.h>
> > +#include <fcntl.h>
> > +
> > +typedef enum {
> > +    UFFD_UNINITIALIZED = 0,
> > +    UFFD_USE_DEV_PATH,
> > +    UFFD_USE_SYSCALL,
> > +} uffd_open_mode;
> > +
> > +static int uffd_dev;
> > +
> > +static uffd_open_mode uffd_detect_open_mode(void)
> > +{
> > +    static uffd_open_mode open_mode;
> > +
> > +    if (open_mode == UFFD_UNINITIALIZED) {
> > +        /*
> > +         * Make /dev/userfaultfd the default approach because it has better
> > +         * permission controls, meanwhile allows kernel faults without any
> > +         * privilege requirement (e.g. SYS_CAP_PTRACE).
> > +         */
> > +        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> > +        if (uffd_dev >= 0) {
> > +            open_mode = UFFD_USE_DEV_PATH;
> > +        } else {
> > +            /* Fallback to the system call */
> > +            open_mode = UFFD_USE_SYSCALL;
> > +        }
> > +        trace_uffd_detect_open_mode(open_mode);
> > +    }
> > +
> > +    return open_mode;
> > +}
> >  
> >  int uffd_open(int flags)
> >  {
> >  #if defined(__linux__) && defined(__NR_userfaultfd)
> > +    if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) {
> > +        assert(uffd_dev >= 0);
> > +        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
> > +    }
> > +
> >      return syscall(__NR_userfaultfd, flags);
> >  #else
> >      return -EINVAL;
> 
> static int open_userfaultd(void)
> {
>     /*
>      * Make /dev/userfaultfd the default approach because it has better
>      * permission controls, meanwhile allows kernel faults without any
>      * privilege requirement (e.g. SYS_CAP_PTRACE).
>      */
>      int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
>      if (uffd >= 0) {
>             return uffd;
>      }
>      return -1;
> }
> 
> int uffd_open(int flags)
> {
> #if defined(__linux__) && defined(__NR_userfaultfd)
>     static int uffd = -2;
>     if (uffd == -2) {
>         uffd = open_userfaultd();
>     }
>     if (uffd >= 0) {
>         return ioctl(uffd, USERFAULTFD_IOC_NEW, flags);
>     }
>     return syscall(__NR_userfaultfd, flags);
> #else
>      return -EINVAL;
> 
> 27 lines vs 42
> 
> No need for enum type
> No need for global variable
> 
> What do you think?

Yes, as I used to reply to Phil I think it can be simplified.  I did this
major for (1) better readability, and (2) being crystal clear on which way
we used to open /dev/userfaultfd, then guarantee we're keeping using it. so
at least I prefer keeping things like trace_uffd_detect_open_mode().

I also plan to add another mode when fd-mode is there even if it'll reuse
the same USERFAULTFD_IOC_NEW; they can be useful information when a failure
happens.

Though if you insist, I can switch to the simple version too.
Juan Quintela Feb. 3, 2023, 9:01 p.m. UTC | #3
Peter Xu <peterx@redhat.com> wrote:
> On Thu, Feb 02, 2023 at 11:52:21AM +0100, Juan Quintela wrote:
>> Peter Xu <peterx@redhat.com> wrote:
>> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
>> > system call if either it's not there or doesn't have enough permission.
>> >
>> > Firstly, as long as the app has permission to access /dev/userfaultfd, it
>> > always have the ability to trap kernel faults which QEMU mostly wants.
>> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
>> > forbidden, so it can be the major way to use postcopy in a restricted
>> > environment with strict seccomp setup.
>> >
>> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
>> > Signed-off-by: Peter Xu <peterx@redhat.com>
>> 
>> 
>> Hi
>
> Hi, Juan,


>> static int open_userfaultd(void)
>> {
>>     /*
>>      * Make /dev/userfaultfd the default approach because it has better
>>      * permission controls, meanwhile allows kernel faults without any
>>      * privilege requirement (e.g. SYS_CAP_PTRACE).
>>      */
>>      int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
>>      if (uffd >= 0) {
>>             return uffd;
>>      }
>>      return -1;
>> }
>> 
>> int uffd_open(int flags)
>> {
>> #if defined(__linux__) && defined(__NR_userfaultfd)

Just an incise, checkpatch don't liue that you use __linux__

This file is compiled under CONFIG_LINUX, so you can drop it.

>>     static int uffd = -2;
>>     if (uffd == -2) {
>>         uffd = open_userfaultd();
>>     }
>>     if (uffd >= 0) {
>>         return ioctl(uffd, USERFAULTFD_IOC_NEW, flags);
>>     }
>>     return syscall(__NR_userfaultfd, flags);
>> #else
>>      return -EINVAL;
>> 
>> 27 lines vs 42
>> 
>> No need for enum type
>> No need for global variable
>> 
>> What do you think?
>
> Yes, as I used to reply to Phil I think it can be simplified.  I did this
> major for (1) better readability, and (2) being crystal clear on which way
> we used to open /dev/userfaultfd, then guarantee we're keeping using it. so
> at least I prefer keeping things like trace_uffd_detect_open_mode().

The trace is ok for me.  I just forgot to copy it on the rework, sorry.

> I also plan to add another mode when fd-mode is there even if it'll reuse
> the same USERFAULTFD_IOC_NEW; they can be useful information when a failure
> happens.

The other fd mode will change the uffd.

What I *kind* of object is:
- Using a global variable when it is not needed
  i.e. for me using a global variable means that anything else is worse.
  Not the case IMHO.
- Call uffd_open_mode() for every call, when we know that it can change,
  it is going to return always the same value, so cache it.

> Though if you insist, I can switch to the simple version too.

I always told that the person who did the patch has the last word on
style.  I preffer my version, but it is up to you to take it or not.

Later, Juan.
Peter Xu Feb. 6, 2023, 9:31 p.m. UTC | #4
On Fri, Feb 03, 2023 at 10:01:04PM +0100, Juan Quintela wrote:
> Peter Xu <peterx@redhat.com> wrote:
> > On Thu, Feb 02, 2023 at 11:52:21AM +0100, Juan Quintela wrote:
> >> Peter Xu <peterx@redhat.com> wrote:
> >> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
> >> > system call if either it's not there or doesn't have enough permission.
> >> >
> >> > Firstly, as long as the app has permission to access /dev/userfaultfd, it
> >> > always have the ability to trap kernel faults which QEMU mostly wants.
> >> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
> >> > forbidden, so it can be the major way to use postcopy in a restricted
> >> > environment with strict seccomp setup.
> >> >
> >> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> >> > Signed-off-by: Peter Xu <peterx@redhat.com>
> >> 
> >> 
> >> Hi
> >
> > Hi, Juan,
> 
> 
> >> static int open_userfaultd(void)
> >> {
> >>     /*
> >>      * Make /dev/userfaultfd the default approach because it has better
> >>      * permission controls, meanwhile allows kernel faults without any
> >>      * privilege requirement (e.g. SYS_CAP_PTRACE).
> >>      */
> >>      int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> >>      if (uffd >= 0) {
> >>             return uffd;
> >>      }
> >>      return -1;
> >> }
> >> 
> >> int uffd_open(int flags)
> >> {
> >> #if defined(__linux__) && defined(__NR_userfaultfd)
> 
> Just an incise, checkpatch don't liue that you use __linux__
> 
> This file is compiled under CONFIG_LINUX, so you can drop it.

Yes indeed.  I'll drop it.

> 
> >>     static int uffd = -2;
> >>     if (uffd == -2) {
> >>         uffd = open_userfaultd();
> >>     }
> >>     if (uffd >= 0) {
> >>         return ioctl(uffd, USERFAULTFD_IOC_NEW, flags);
> >>     }
> >>     return syscall(__NR_userfaultfd, flags);
> >> #else
> >>      return -EINVAL;
> >> 
> >> 27 lines vs 42
> >> 
> >> No need for enum type
> >> No need for global variable
> >> 
> >> What do you think?
> >
> > Yes, as I used to reply to Phil I think it can be simplified.  I did this
> > major for (1) better readability, and (2) being crystal clear on which way
> > we used to open /dev/userfaultfd, then guarantee we're keeping using it. so
> > at least I prefer keeping things like trace_uffd_detect_open_mode().
> 
> The trace is ok for me.  I just forgot to copy it on the rework, sorry.
> 
> > I also plan to add another mode when fd-mode is there even if it'll reuse
> > the same USERFAULTFD_IOC_NEW; they can be useful information when a failure
> > happens.
> 
> The other fd mode will change the uffd.
> 
> What I *kind* of object is:
> - Using a global variable when it is not needed
>   i.e. for me using a global variable means that anything else is worse.
>   Not the case IMHO.

IMHO globals are evil when they're used in multiple places; that's bad to
readability.  Here it's not the case because it's set once and for all.  I
wanted to have an easy and clear way to peek what's the mode chosen even
without tracing enabled (e.g. from a dump or a live process).

> - Call uffd_open_mode() for every call, when we know that it can change,
>   it is going to return always the same value, so cache it.

uffd_detect_open_mode() caches the result already?  Or maybe you meant
something else?

> 
> > Though if you insist, I can switch to the simple version too.
> 
> I always told that the person who did the patch has the last word on
> style.  I preffer my version, but it is up to you to take it or not.

Thanks,
Juan Quintela Feb. 7, 2023, 12:11 a.m. UTC | #5
Peter Xu <peterx@redhat.com> wrote:
> On Fri, Feb 03, 2023 at 10:01:04PM +0100, Juan Quintela wrote:
>> Peter Xu <peterx@redhat.com> wrote:
>> > On Thu, Feb 02, 2023 at 11:52:21AM +0100, Juan Quintela wrote:
>> >> Peter Xu <peterx@redhat.com> wrote:
>> >> > Teach QEMU to use /dev/userfaultfd when it existed and fallback to the
>> >> > system call if either it's not there or doesn't have enough permission.
>> >> >
>> >> > Firstly, as long as the app has permission to access /dev/userfaultfd, it
>> >> > always have the ability to trap kernel faults which QEMU mostly wants.
>> >> > Meanwhile, in some context (e.g. containers) the userfaultfd syscall can be
>> >> > forbidden, so it can be the major way to use postcopy in a restricted
>> >> > environment with strict seccomp setup.
>> >> >
>> >> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
>> >> > Signed-off-by: Peter Xu <peterx@redhat.com>
>> >> 
>> >> 
>> >> Hi
>> >
>> > Hi, Juan,
>> 
>> 
>> >> static int open_userfaultd(void)
>> >> {
>> >>     /*
>> >>      * Make /dev/userfaultfd the default approach because it has better
>> >>      * permission controls, meanwhile allows kernel faults without any
>> >>      * privilege requirement (e.g. SYS_CAP_PTRACE).
>> >>      */
>> >>      int uffd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
>> >>      if (uffd >= 0) {
>> >>             return uffd;
>> >>      }
>> >>      return -1;
>> >> }
>> >> 
>> >> int uffd_open(int flags)
>> >> {
>> >> #if defined(__linux__) && defined(__NR_userfaultfd)
>> 
>> Just an incise, checkpatch don't liue that you use __linux__
>> 
>> This file is compiled under CONFIG_LINUX, so you can drop it.
>
> Yes indeed.  I'll drop it.
>
>> 
>> >>     static int uffd = -2;
>> >>     if (uffd == -2) {
>> >>         uffd = open_userfaultd();
>> >>     }
>> >>     if (uffd >= 0) {
>> >>         return ioctl(uffd, USERFAULTFD_IOC_NEW, flags);
>> >>     }
>> >>     return syscall(__NR_userfaultfd, flags);
>> >> #else
>> >>      return -EINVAL;
>> >> 
>> >> 27 lines vs 42
>> >> 
>> >> No need for enum type
>> >> No need for global variable
>> >> 
>> >> What do you think?
>> >
>> > Yes, as I used to reply to Phil I think it can be simplified.  I did this
>> > major for (1) better readability, and (2) being crystal clear on which way
>> > we used to open /dev/userfaultfd, then guarantee we're keeping using it. so
>> > at least I prefer keeping things like trace_uffd_detect_open_mode().
>> 
>> The trace is ok for me.  I just forgot to copy it on the rework, sorry.
>> 
>> > I also plan to add another mode when fd-mode is there even if it'll reuse
>> > the same USERFAULTFD_IOC_NEW; they can be useful information when a failure
>> > happens.
>> 
>> The other fd mode will change the uffd.
>> 
>> What I *kind* of object is:
>> - Using a global variable when it is not needed
>>   i.e. for me using a global variable means that anything else is worse.
>>   Not the case IMHO.
>
> IMHO globals are evil when they're used in multiple places; that's bad to
> readability.  Here it's not the case because it's set once and for
> all.

That is part of the problem.

int foo;

I need to search all the code to see where it is used.

int bar(...)
{
    static int foo;
    ....
}

I am really sure that:
- foo value is preserved between calls
- it is not used anywhere else

without a single grep through the code.

> I
> wanted to have an easy and clear way to peek what's the mode chosen even
> without tracing enabled (e.g. from a dump or a live process).

I haven't thought about this.  But you want something different than this?

(fada)$ cat /tmp/kk.c

int bar(void)
{
	static int foo = 42;
	return foo++;
}

int main(void)
{
	int a = 7 + 1;
	return a + bar();
}
(fada)$ gcc -Wall /tmp/kk.c -o /tmp/kkk -g
(fada)$ gdb /tmp/kkk
(gdb) b main
Breakpoint 1 at 0x401123: file /tmp/kk.c, line 10.
(gdb) p bar::foo
$1 = 42
(gdb) 

And yes, I have to search how this is done O:-)

>> - Call uffd_open_mode() for every call, when we know that it can change,
>>   it is going to return always the same value, so cache it.
>
> uffd_detect_open_mode() caches the result already?  Or maybe you meant
> something else?

What I did.  Only call the equilavent function once.  You are calling it
every time that uffd_open() is called.

>
>> 
>> > Though if you insist, I can switch to the simple version too.
>> 
>> I always told that the person who did the patch has the last word on
>> style.  I preffer my version, but it is up to you to take it or not.
>
> Thanks,

Later, Juan.
diff mbox series

Patch

diff --git a/util/trace-events b/util/trace-events
index c8f53d7d9f..16f78d8fe5 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -93,6 +93,7 @@  qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
 qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
 
 #userfaultfd.c
+uffd_detect_open_mode(int mode) "%d"
 uffd_query_features_nosys(int err) "errno: %i"
 uffd_query_features_api_failed(int err) "errno: %i"
 uffd_create_fd_nosys(int err) "errno: %i"
diff --git a/util/userfaultfd.c b/util/userfaultfd.c
index 9845a2ec81..7dceab51d6 100644
--- a/util/userfaultfd.c
+++ b/util/userfaultfd.c
@@ -18,10 +18,47 @@ 
 #include <poll.h>
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
+#include <fcntl.h>
+
+typedef enum {
+    UFFD_UNINITIALIZED = 0,
+    UFFD_USE_DEV_PATH,
+    UFFD_USE_SYSCALL,
+} uffd_open_mode;
+
+static int uffd_dev;
+
+static uffd_open_mode uffd_detect_open_mode(void)
+{
+    static uffd_open_mode open_mode;
+
+    if (open_mode == UFFD_UNINITIALIZED) {
+        /*
+         * Make /dev/userfaultfd the default approach because it has better
+         * permission controls, meanwhile allows kernel faults without any
+         * privilege requirement (e.g. SYS_CAP_PTRACE).
+         */
+        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
+        if (uffd_dev >= 0) {
+            open_mode = UFFD_USE_DEV_PATH;
+        } else {
+            /* Fallback to the system call */
+            open_mode = UFFD_USE_SYSCALL;
+        }
+        trace_uffd_detect_open_mode(open_mode);
+    }
+
+    return open_mode;
+}
 
 int uffd_open(int flags)
 {
 #if defined(__linux__) && defined(__NR_userfaultfd)
+    if (uffd_detect_open_mode() == UFFD_USE_DEV_PATH) {
+        assert(uffd_dev >= 0);
+        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
+    }
+
     return syscall(__NR_userfaultfd, flags);
 #else
     return -EINVAL;