
[v4,14/29] libvhost-user+postcopy: Register new regions with the ufd

Message ID 20180308195811.24894-15-dgilbert@redhat.com
State New
Series postcopy+vhost-user/shared ram

Commit Message

Dr. David Alan Gilbert March 8, 2018, 7:57 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

When new regions are sent to the client using SET_MEM_TABLE, register
them with the userfaultfd.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
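
[The code in this patch assumes an earlier patch in the series has already
opened dev->postcopy_ufd. For readers unfamiliar with the kernel API, this
is roughly how a userfaultfd is opened and handshaked - an illustrative
sketch, not code from the series:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/userfaultfd.h>

    static int open_userfault_fd(void)
    {
        int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
        if (ufd < 0) {
            perror("userfaultfd");
            return -1;
        }

        /* Handshake: agree on an API version before any UFFDIO_REGISTER */
        struct uffdio_api api = { .api = UFFD_API, .features = 0 };
        if (ioctl(ufd, UFFDIO_API, &api)) {
            perror("UFFDIO_API");
            close(ufd);
            return -1;
        }
        return ufd;
    }
]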

Comments

Peter Xu March 12, 2018, 10:20 a.m. UTC | #1
On Thu, Mar 08, 2018 at 07:57:56PM +0000, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> When new regions are sent to the client using SET_MEM_TABLE, register
> them with the userfaultfd.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  contrib/libvhost-user/libvhost-user.c | 34 ++++++++++++++++++++++++++++++++++
>  1 file changed, 34 insertions(+)
> 
> diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> index 4922b2c722..a18bc74a7c 100644
> --- a/contrib/libvhost-user/libvhost-user.c
> +++ b/contrib/libvhost-user/libvhost-user.c
> @@ -494,6 +494,40 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
>          close(vmsg->fds[i]);
>      }
>  
> +    /* TODO: Get address back to QEMU */
> +    for (i = 0; i < dev->nregions; i++) {
> +        VuDevRegion *dev_region = &dev->regions[i];
> +#ifdef UFFDIO_REGISTER
> +        /* We should already have an open ufd. Mark each memory
> +         * range as ufd.
> +         * Note: Do we need any madvises? Well it's not been accessed
> +         * yet, still probably need no THP to be safe, discard to be safe?
> +         */
> +        struct uffdio_register reg_struct;
> +        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
> +        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;

Do we really care about the page faults between offset zero and mmap_offset?
I'm wondering whether we should add that mmap_offset to range.start
instead of range.len.

Also, I see that in current vu_set_mem_table_exec():

        /* We don't use offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages.  */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_READ | PROT_WRITE, MAP_SHARED,
                         vmsg->fds[i], 0);

So is adding the mmap_offset meant to make sure we use huge pages?
Does it?  Put another way, how can we be sure that size+mmap_offset is
page aligned?
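
[For illustration, moving the mmap_offset into range.start as suggested
above would look something like this - a sketch, not code from the patch,
and it assumes mmap_offset itself is page aligned, which is the same
alignment question:

    struct uffdio_register reg_struct;
    /* Skip the first mmap_offset bytes instead of extending the
     * length to cover them. */
    reg_struct.range.start = (uintptr_t)dev_region->mmap_addr +
                             dev_region->mmap_offset;
    reg_struct.range.len   = dev_region->size;
    reg_struct.mode        = UFFDIO_REGISTER_MODE_MISSING;
]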

Thanks,

> +        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
> +
> +        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
> +            vu_panic(dev, "%s: Failed to userfault region %d "
> +                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
> +                     __func__, i,
> +                     dev_region->mmap_addr,
> +                     dev_region->size, dev_region->mmap_offset,
> +                     dev->postcopy_ufd, strerror(errno));
> +            return false;
> +        }
> +        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
> +            vu_panic(dev, "%s Region (%d) doesn't support COPY",
> +                     __func__, i);
> +            return false;
> +        }
> +        DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
> +                __func__, i, reg_struct.range.start, reg_struct.range.len);
> +        /* TODO: Stash 'zero' support flags somewhere */
> +#endif
> +    }
> +
>      return false;
>  }
>  
> -- 
> 2.14.3
>
Dr. David Alan Gilbert March 12, 2018, 1:23 p.m. UTC | #2
* Peter Xu (peterx@redhat.com) wrote:
> On Thu, Mar 08, 2018 at 07:57:56PM +0000, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > When new regions are sent to the client using SET_MEM_TABLE, register
> > them with the userfaultfd.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  contrib/libvhost-user/libvhost-user.c | 34 ++++++++++++++++++++++++++++++++++
> >  1 file changed, 34 insertions(+)
> > 
> > diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> > index 4922b2c722..a18bc74a7c 100644
> > --- a/contrib/libvhost-user/libvhost-user.c
> > +++ b/contrib/libvhost-user/libvhost-user.c
> > @@ -494,6 +494,40 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
> >          close(vmsg->fds[i]);
> >      }
> >  
> > +    /* TODO: Get address back to QEMU */
> > +    for (i = 0; i < dev->nregions; i++) {
> > +        VuDevRegion *dev_region = &dev->regions[i];
> > +#ifdef UFFDIO_REGISTER
> > +        /* We should already have an open ufd. Mark each memory
> > +         * range as ufd.
> > +         * Note: Do we need any madvises? Well it's not been accessed
> > +         * yet, still probably need no THP to be safe, discard to be safe?
> > +         */
> > +        struct uffdio_register reg_struct;
> > +        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
> > +        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
> 
> Do we really care about the page faults between offset zero and mmap_offset?

No, but if we saw them we'd think it meant something had gone wrong,
so it's good to trap them.

> I'm wondering whether we should add that mmap_offset to range.start
> instead of range.len.
> 
> Also, I see that in current vu_set_mem_table_exec():
> 
>         /* We don't use offset argument of mmap() since the
>          * mapped address has to be page aligned, and we use huge
>          * pages.  */
>         mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
>                          PROT_READ | PROT_WRITE, MAP_SHARED,
>                          vmsg->fds[i], 0);
> 
> So is adding the mmap_offset meant to make sure we use huge pages?
> Does it?  Put another way, how can we be sure that size+mmap_offset is
> page aligned?

If you look at the non-postcopy vu_set_mem_table_exec() you'll see that
the code and comment come from there; but as you say, it's something we
could probably simplify now.

The problem used to be that, before we did the merging as part of this
series (0026 vhost Huge page align and merge), we could end up with
mappings passed from QEMU that covered small ranges of memory that
weren't aligned to a huge page boundary, and thus the mmap would fail.
With the merging code that's no longer true, so we could simplify as
you say; although this way it's a smaller change from the existing
code.
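
[As a rough sketch of what that align-and-merge rounding does - a
hypothetical, simplified helper, not the actual code of that patch:

    #include <stdint.h>

    #define HPAGE_SIZE 0x200000ULL   /* 2MB, for illustration */

    /* Expand a RAM range outwards to huge-page boundaries before it is
     * sent to the client, so the client's mmap() and UFFDIO_REGISTER
     * always see aligned ranges. */
    static void hpage_align(uint64_t *start, uint64_t *size)
    {
        uint64_t end = *start + *size;

        *start &= ~(HPAGE_SIZE - 1);                        /* round down */
        end = (end + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1);   /* round up */
        *size = end - *start;
    }
]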

Dave

> Thanks,
> 
> > +        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
> > +
> > +        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
> > +            vu_panic(dev, "%s: Failed to userfault region %d "
> > +                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
> > +                     __func__, i,
> > +                     dev_region->mmap_addr,
> > +                     dev_region->size, dev_region->mmap_offset,
> > +                     dev->postcopy_ufd, strerror(errno));
> > +            return false;
> > +        }
> > +        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
> > +            vu_panic(dev, "%s Region (%d) doesn't support COPY",
> > +                     __func__, i);
> > +            return false;
> > +        }
> > +        DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
> > +                __func__, i, reg_struct.range.start, reg_struct.range.len);
> > +        /* TODO: Stash 'zero' support flags somewhere */
> > +#endif
> > +    }
> > +
> >      return false;
> >  }
> >  
> > -- 
> > 2.14.3
> > 
> 
> -- 
> Peter Xu
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Marc-André Lureau March 12, 2018, 2:40 p.m. UTC | #3
On Thu, Mar 8, 2018 at 8:57 PM, Dr. David Alan Gilbert (git)
<dgilbert@redhat.com> wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
>
> When new regions are sent to the client using SET_MEM_TABLE, register
> them with the userfaultfd.
>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>


> ---
>  contrib/libvhost-user/libvhost-user.c | 34 ++++++++++++++++++++++++++++++++++
>  1 file changed, 34 insertions(+)
>
> diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> index 4922b2c722..a18bc74a7c 100644
> --- a/contrib/libvhost-user/libvhost-user.c
> +++ b/contrib/libvhost-user/libvhost-user.c
> @@ -494,6 +494,40 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
>          close(vmsg->fds[i]);
>      }
>
> +    /* TODO: Get address back to QEMU */
> +    for (i = 0; i < dev->nregions; i++) {
> +        VuDevRegion *dev_region = &dev->regions[i];
> +#ifdef UFFDIO_REGISTER
> +        /* We should already have an open ufd. Mark each memory
> +         * range as ufd.
> +         * Note: Do we need any madvises? Well it's not been accessed
> +         * yet, still probably need no THP to be safe, discard to be safe?
> +         */
> +        struct uffdio_register reg_struct;
> +        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
> +        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
> +        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
> +
> +        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
> +            vu_panic(dev, "%s: Failed to userfault region %d "
> +                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
> +                     __func__, i,
> +                     dev_region->mmap_addr,
> +                     dev_region->size, dev_region->mmap_offset,
> +                     dev->postcopy_ufd, strerror(errno));
> +            return false;
> +        }
> +        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
> +            vu_panic(dev, "%s Region (%d) doesn't support COPY",
> +                     __func__, i);
> +            return false;
> +        }
> +        DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
> +                __func__, i, reg_struct.range.start, reg_struct.range.len);
> +        /* TODO: Stash 'zero' support flags somewhere */
> +#endif
> +    }
> +
>      return false;
>  }
>
> --
> 2.14.3
>
>
Peter Xu March 13, 2018, 8:28 a.m. UTC | #4
On Mon, Mar 12, 2018 at 01:23:21PM +0000, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Thu, Mar 08, 2018 at 07:57:56PM +0000, Dr. David Alan Gilbert (git) wrote:
> > > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > > 
> > > When new regions are sent to the client using SET_MEM_TABLE, register
> > > them with the userfaultfd.
> > > 
> > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > ---
> > >  contrib/libvhost-user/libvhost-user.c | 34 ++++++++++++++++++++++++++++++++++
> > >  1 file changed, 34 insertions(+)
> > > 
> > > diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> > > index 4922b2c722..a18bc74a7c 100644
> > > --- a/contrib/libvhost-user/libvhost-user.c
> > > +++ b/contrib/libvhost-user/libvhost-user.c
> > > @@ -494,6 +494,40 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
> > >          close(vmsg->fds[i]);
> > >      }
> > >  
> > > +    /* TODO: Get address back to QEMU */
> > > +    for (i = 0; i < dev->nregions; i++) {
> > > +        VuDevRegion *dev_region = &dev->regions[i];
> > > +#ifdef UFFDIO_REGISTER
> > > +        /* We should already have an open ufd. Mark each memory
> > > +         * range as ufd.
> > > +         * Note: Do we need any madvises? Well it's not been accessed
> > > +         * yet, still probably need no THP to be safe, discard to be safe?
> > > +         */
> > > +        struct uffdio_register reg_struct;
> > > +        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
> > > +        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
> > 
> > Do we really care about the page faults between offset zero and mmap_offset?
> 
> No, but if we saw them we'd think it meant something had gone wrong,
> so it's good to trap them.

I'm fine with that, especially since this is currently only used in
the test code.  However it's still a bit confusing to me, especially
since current QEMU won't really handle such a page fault (and it seems
it should never happen).  Maybe at least a comment would help,
explaining why we explicitly extend the range we listen on, just like
the code below does for the mapping, though for a different reason.

> 
> > I'm wondering whether we should add that mmap_offset to range.start
> > instead of range.len.
> > 
> > Also, I see that in current vu_set_mem_table_exec():
> > 
> >         /* We don't use offset argument of mmap() since the
> >          * mapped address has to be page aligned, and we use huge
> >          * pages.  */
> >         mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
> >                          PROT_READ | PROT_WRITE, MAP_SHARED,
> >                          vmsg->fds[i], 0);
> > 
> > So is adding the mmap_offset meant to make sure we use huge pages?
> > Does it?  Put another way, how can we be sure that size+mmap_offset is
> > page aligned?
> 
> If you look at the non-postcopy vu_set_mem_table_exec() you'll see that
> the code and comment come from there; but as you say, it's something we
> could probably simplify now.
> 
> The problem used to be that, before we did the merging as part of this
> series (0026 vhost Huge page align and merge), we could end up with
> mappings passed from QEMU that covered small ranges of memory that
> weren't aligned to a huge page boundary, and thus the mmap would fail.
> With the merging code that's no longer true, so we could simplify as
> you say; although this way it's a smaller change from the existing
> code.

I was thinking: what if the memory section were split, e.g., as below:

- range A: [0x0,      0x10):     non-RAM range, size     0x10
- range B: [0x10,     0x1ffff0):     RAM range, size 0x1fffe0
- range C: [0x1ffff0, 0x200000): non-RAM range, size     0x10

Ranges A+B+C cover a 2M page, while the vhost-user master should only
send range B to the client.  Then even size+mmap_offset (which is
0x1fffe0+0x10=0x1ffff0) wouldn't be aligned to the 2M boundary.
If the previous mmap() could fail, would this fail too?
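
[A quick arithmetic check of those numbers - an illustrative snippet,
not from the series:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t hpage  = 0x200000;   /* 2MB huge page */
        const uint64_t size   = 0x1fffe0;   /* range B */
        const uint64_t offset = 0x10;       /* mmap_offset */

        /* 0x1fffe0 + 0x10 = 0x1ffff0, which is not a multiple of 2MB,
         * so an unrounded range would indeed be misaligned. */
        printf("aligned: %s\n",
               (size + offset) % hpage == 0 ? "yes" : "no");  /* "no" */
        return 0;
    }
]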

To be sure, this question isn't directly related to the current code -
it's just something I'm not sure about, so it's not a blocker for the
current patch.  Thanks,
Dr. David Alan Gilbert March 15, 2018, 9:41 a.m. UTC | #5
* Peter Xu (peterx@redhat.com) wrote:
> On Mon, Mar 12, 2018 at 01:23:21PM +0000, Dr. David Alan Gilbert wrote:
> > * Peter Xu (peterx@redhat.com) wrote:
> > > On Thu, Mar 08, 2018 at 07:57:56PM +0000, Dr. David Alan Gilbert (git) wrote:
> > > > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > > > 
> > > > When new regions are sent to the client using SET_MEM_TABLE, register
> > > > them with the userfaultfd.
> > > > 
> > > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > > ---
> > > >  contrib/libvhost-user/libvhost-user.c | 34 ++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 34 insertions(+)
> > > > 
> > > > diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> > > > index 4922b2c722..a18bc74a7c 100644
> > > > --- a/contrib/libvhost-user/libvhost-user.c
> > > > +++ b/contrib/libvhost-user/libvhost-user.c
> > > > @@ -494,6 +494,40 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
> > > >          close(vmsg->fds[i]);
> > > >      }
> > > >  
> > > > +    /* TODO: Get address back to QEMU */
> > > > +    for (i = 0; i < dev->nregions; i++) {
> > > > +        VuDevRegion *dev_region = &dev->regions[i];
> > > > +#ifdef UFFDIO_REGISTER
> > > > +        /* We should already have an open ufd. Mark each memory
> > > > +         * range as ufd.
> > > > +         * Note: Do we need any madvises? Well it's not been accessed
> > > > +         * yet, still probably need no THP to be safe, discard to be safe?
> > > > +         */
> > > > +        struct uffdio_register reg_struct;
> > > > +        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
> > > > +        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
> > > 
> > > Do we really care about the page faults between offset zero and mmap_offset?
> > 
> > No, but if we saw them we'd think it meant something had gone wrong,
> > so it's good to trap them.
> 
> I'm fine with that, especially since this is currently only used in
> the test code.  However it's still a bit confusing to me, especially
> since current QEMU won't really handle such a page fault (and it seems
> it should never happen).  Maybe at least a comment would help,
> explaining why we explicitly extend the range we listen on, just like
> the code below does for the mapping, though for a different reason.
> 
> > 
> > > I'm wondering whether we should add that mmap_offset to range.start
> > > instead of range.len.
> > > 
> > > Also, I see that in current vu_set_mem_table_exec():
> > > 
> > >         /* We don't use offset argument of mmap() since the
> > >          * mapped address has to be page aligned, and we use huge
> > >          * pages.  */
> > >         mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
> > >                          PROT_READ | PROT_WRITE, MAP_SHARED,
> > >                          vmsg->fds[i], 0);
> > > 
> > > So is adding the mmap_offset meant to make sure we use huge pages?
> > > Does it?  Put another way, how can we be sure that size+mmap_offset is
> > > page aligned?
> > 
> > If you look at the non-postcopy vu_set_mem_table_exec() you'll see that
> > the code and comment come from there; but as you say, it's something we
> > could probably simplify now.
> > 
> > The problem used to be that, before we did the merging as part of this
> > series (0026 vhost Huge page align and merge), we could end up with
> > mappings passed from QEMU that covered small ranges of memory that
> > weren't aligned to a huge page boundary, and thus the mmap would fail.
> > With the merging code that's no longer true, so we could simplify as
> > you say; although this way it's a smaller change from the existing
> > code.
> 
> I was thinking: what if the memory section were split, e.g., as below:
> 
> - range A: [0x0,      0x10):     non-RAM range, size     0x10
> - range B: [0x10,     0x1ffff0):     RAM range, size 0x1fffe0
> - range C: [0x1ffff0, 0x200000): non-RAM range, size     0x10
> 
> Ranges A+B+C cover a 2M page, while the vhost-user master should only
> send range B to the client.  Then even size+mmap_offset (which is
> 0x1fffe0+0x10=0x1ffff0) wouldn't be aligned to the 2M boundary.
> If the previous mmap() could fail, would this fail too?

I think 'vhost: Huge page align and merge' will round (B) to page
boundaries, so that the range sent to the slave will be aligned.

Dave

> To be sure, this question isn't directly related to the current code -
> it's just something I'm not sure about, so it's not a blocker for the
> current patch.  Thanks,
> 
> -- 
> Peter Xu
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu March 16, 2018, 4:18 a.m. UTC | #6
On Thu, Mar 15, 2018 at 09:41:00AM +0000, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Mon, Mar 12, 2018 at 01:23:21PM +0000, Dr. David Alan Gilbert wrote:
> > > * Peter Xu (peterx@redhat.com) wrote:
> > > > On Thu, Mar 08, 2018 at 07:57:56PM +0000, Dr. David Alan Gilbert (git) wrote:
> > > > > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > > > > 
> > > > > When new regions are sent to the client using SET_MEM_TABLE, register
> > > > > them with the userfaultfd.
> > > > > 
> > > > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > > > ---
> > > > >  contrib/libvhost-user/libvhost-user.c | 34 ++++++++++++++++++++++++++++++++++
> > > > >  1 file changed, 34 insertions(+)
> > > > > 
> > > > > diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> > > > > index 4922b2c722..a18bc74a7c 100644
> > > > > --- a/contrib/libvhost-user/libvhost-user.c
> > > > > +++ b/contrib/libvhost-user/libvhost-user.c
> > > > > @@ -494,6 +494,40 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
> > > > >          close(vmsg->fds[i]);
> > > > >      }
> > > > >  
> > > > > +    /* TODO: Get address back to QEMU */
> > > > > +    for (i = 0; i < dev->nregions; i++) {
> > > > > +        VuDevRegion *dev_region = &dev->regions[i];
> > > > > +#ifdef UFFDIO_REGISTER
> > > > > +        /* We should already have an open ufd. Mark each memory
> > > > > +         * range as ufd.
> > > > > +         * Note: Do we need any madvises? Well it's not been accessed
> > > > > +         * yet, still probably need no THP to be safe, discard to be safe?
> > > > > +         */
> > > > > +        struct uffdio_register reg_struct;
> > > > > +        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
> > > > > +        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
> > > > 
> > > > Do we really care about the page faults between offset zero and mmap_offset?
> > > 
> > > No, but if we saw them we'd think it meant something had gone wrong,
> > > so it's good to trap them.
> > 
> > I'm fine with that, especially since this is currently only used in
> > the test code.  However it's still a bit confusing to me, especially
> > since current QEMU won't really handle such a page fault (and it seems
> > it should never happen).  Maybe at least a comment would help,
> > explaining why we explicitly extend the range we listen on, just like
> > the code below does for the mapping, though for a different reason.
> > 
> > > 
> > > > I'm wondering whether we should add that mmap_offset to range.start
> > > > instead of range.len.
> > > > 
> > > > Also, I see that in current vu_set_mem_table_exec():
> > > > 
> > > >         /* We don't use offset argument of mmap() since the
> > > >          * mapped address has to be page aligned, and we use huge
> > > >          * pages.  */
> > > >         mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
> > > >                          PROT_READ | PROT_WRITE, MAP_SHARED,
> > > >                          vmsg->fds[i], 0);
> > > > 
> > > > So is adding the mmap_offset meant to make sure we use huge pages?
> > > > Does it?  Put another way, how can we be sure that size+mmap_offset is
> > > > page aligned?
> > > 
> > > If you look at the non-postcopy vu_set_mem_table_exec() you'll see that
> > > the code and comment come from there; but as you say, it's something we
> > > could probably simplify now.
> > > 
> > > The problem used to be that, before we did the merging as part of this
> > > series (0026 vhost Huge page align and merge), we could end up with
> > > mappings passed from QEMU that covered small ranges of memory that
> > > weren't aligned to a huge page boundary, and thus the mmap would fail.
> > > With the merging code that's no longer true, so we could simplify as
> > > you say; although this way it's a smaller change from the existing
> > > code.
> > 
> > I was thinking: what if the memory section were split, e.g., as below:
> > 
> > - range A: [0x0,      0x10):     non-RAM range, size     0x10
> > - range B: [0x10,     0x1ffff0):     RAM range, size 0x1fffe0
> > - range C: [0x1ffff0, 0x200000): non-RAM range, size     0x10
> > 
> > Ranges A+B+C cover a 2M page, while the vhost-user master should only
> > send range B to the client.  Then even size+mmap_offset (which is
> > 0x1fffe0+0x10=0x1ffff0) wouldn't be aligned to the 2M boundary.
> > If the previous mmap() could fail, would this fail too?
> 
> I think 'vhost: Huge page align and merge' will round (B) to page
> boundaries, so that the range sent to the slave will be aligned.

Ah, I see what you meant.  Yes, that seems to work; and yes, the old
code doesn't hurt either, even with some useless extra pages mapped.
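
[For completeness: once a range is registered like this, whichever
process holds the ufd (in this design ultimately QEMU's postcopy code,
once the fd has been passed back) resolves faults with UFFDIO_COPY.
A minimal sketch of that loop, with illustrative names only:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/userfaultfd.h>

    /* Read one fault event and resolve it by atomically copying in the
     * page contents (fetched elsewhere, e.g. from the migration
     * stream); UFFDIO_COPY also wakes the faulting thread. */
    static int handle_one_fault(int ufd, void *page, size_t pagesize)
    {
        struct uffd_msg msg;

        if (read(ufd, &msg, sizeof(msg)) != sizeof(msg) ||
            msg.event != UFFD_EVENT_PAGEFAULT) {
            return -1;
        }

        struct uffdio_copy copy = {
            .dst = msg.arg.pagefault.address & ~(uint64_t)(pagesize - 1),
            .src = (uintptr_t)page,
            .len = pagesize,
            .mode = 0,
        };
        return ioctl(ufd, UFFDIO_COPY, &copy);
    }
]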

Patch

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 4922b2c722..a18bc74a7c 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -494,6 +494,40 @@  vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
         close(vmsg->fds[i]);
     }
 
+    /* TODO: Get address back to QEMU */
+    for (i = 0; i < dev->nregions; i++) {
+        VuDevRegion *dev_region = &dev->regions[i];
+#ifdef UFFDIO_REGISTER
+        /* We should already have an open ufd. Mark each memory
+         * range as ufd.
+         * Note: Do we need any madvises? Well it's not been accessed
+         * yet, still probably need no THP to be safe, discard to be safe?
+         */
+        struct uffdio_register reg_struct;
+        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
+        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
+        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
+            vu_panic(dev, "%s: Failed to userfault region %d "
+                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
+                     __func__, i,
+                     dev_region->mmap_addr,
+                     dev_region->size, dev_region->mmap_offset,
+                     dev->postcopy_ufd, strerror(errno));
+            return false;
+        }
+        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
+            vu_panic(dev, "%s Region (%d) doesn't support COPY",
+                     __func__, i);
+            return false;
+        }
+        DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
+                __func__, i, reg_struct.range.start, reg_struct.range.len);
+        /* TODO: Stash 'zero' support flags somewhere */
+#endif
+    }
+
     return false;
 }