
[v9,Kernel,2/5] vfio iommu: Add ioctl defination to get dirty pages bitmap.

Message ID 1573578220-7530-3-git-send-email-kwankhede@nvidia.com
State New
Series Add KABIs to support migration for VFIO devices

Commit Message

Kirti Wankhede Nov. 12, 2019, 5:03 p.m. UTC
All pages pinned by vendor driver through vfio_pin_pages API should be
considered as dirty during migration. IOMMU container maintains a list of
all such pinned pages. Added an ioctl defination to get bitmap of such
pinned pages for requested IO virtual address range.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
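
For orientation, below is a rough userspace sketch of how a migration tool
might drive the proposed ioctl. The structure layout and ioctl number are
copied from this patch; the container fd handling and the 4K page-size
assumption are illustrative only, since the page-size question is still open
in the review discussion that follows.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Mirrors the UAPI proposed in this patch (not yet in linux/vfio.h). */
struct vfio_iommu_type1_dirty_bitmap {
        uint32_t argsz;
        uint32_t flags;
        uint64_t iova;          /* IO virtual address */
        uint64_t size;          /* Size of iova range */
        uint64_t bitmap_size;   /* in bytes */
        void    *bitmap;        /* one bit per page */
};
#define VFIO_IOMMU_GET_DIRTY_BITMAP     _IO(VFIO_TYPE, VFIO_BASE + 17)

/* Query dirty pages for [iova, iova + size) on an open container fd,
 * assuming 4K tracking granularity purely for illustration. */
static int get_dirty_bitmap(int container_fd, uint64_t iova, uint64_t size)
{
        struct vfio_iommu_type1_dirty_bitmap db = { .argsz = sizeof(db) };
        uint64_t npages = size / 4096;

        db.iova = iova;
        db.size = size;
        db.bitmap_size = (npages + 7) / 8;
        db.bitmap = calloc(1, db.bitmap_size);
        if (!db.bitmap)
                return -1;

        if (ioctl(container_fd, VFIO_IOMMU_GET_DIRTY_BITMAP, &db)) {
                perror("VFIO_IOMMU_GET_DIRTY_BITMAP");
                free(db.bitmap);
                return -1;
        }

        for (uint64_t i = 0; i < npages; i++)
                if (((uint8_t *)db.bitmap)[i / 8] & (1 << (i % 8)))
                        printf("dirty page at iova 0x%llx\n",
                               (unsigned long long)(iova + i * 4096));

        free(db.bitmap);
        return 0;
}
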

Comments

Alex Williamson Nov. 12, 2019, 10:30 p.m. UTC | #1
On Tue, 12 Nov 2019 22:33:37 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> All pages pinned by vendor driver through vfio_pin_pages API should be
> considered as dirty during migration. IOMMU container maintains a list of
> all such pinned pages. Added an ioctl defination to get bitmap of such

definition

> pinned pages for requested IO virtual address range.

Additionally, all mapped pages are considered dirty when physically
mapped through to an IOMMU, modulo we discussed devices opting in to
per page pinning to indicate finer granularity with a TBD mechanism to
figure out if any non-opt-in devices remain.

> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
>  1 file changed, 23 insertions(+)
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 35b09427ad9f..6fd3822aa610 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
>  #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
>  #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
>  
> +/**
> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
> + *                                     struct vfio_iommu_type1_dirty_bitmap)
> + *
> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
> + * Get dirty pages bitmap of given IO virtual addresses range using
> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
> + * bitmap and should set size of allocated memory in bitmap_size field.
> + * One bit is used to represent per page consecutively starting from iova
> + * offset. Bit set indicates page at that offset from iova is dirty.
> + */
> +struct vfio_iommu_type1_dirty_bitmap {
> +	__u32        argsz;
> +	__u32        flags;
> +	__u64        iova;                      /* IO virtual address */
> +	__u64        size;                      /* Size of iova range */
> +	__u64        bitmap_size;               /* in bytes */

This seems redundant.  We can calculate the size of the bitmap based on
the iova size.

> +	void __user *bitmap;                    /* one bit per page */

Should we define that as a __u64* to (a) help with the size
calculation, and (b) assure that we can use 8-byte ops on it?

However, who defines page size?  Is it necessarily the processor page
size?  A physical IOMMU may support page sizes other than the CPU page
size.  It might be more important to indicate the expected page size
than the bitmap size.  Thanks,

Alex

> +};
> +
> +#define VFIO_IOMMU_GET_DIRTY_BITMAP             _IO(VFIO_TYPE, VFIO_BASE + 17)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>  
>  /*
Kirti Wankhede Nov. 13, 2019, 7:37 p.m. UTC | #2
On 11/13/2019 4:00 AM, Alex Williamson wrote:
> On Tue, 12 Nov 2019 22:33:37 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> All pages pinned by vendor driver through vfio_pin_pages API should be
>> considered as dirty during migration. IOMMU container maintains a list of
>> all such pinned pages. Added an ioctl defination to get bitmap of such
> 
> definition
> 
>> pinned pages for requested IO virtual address range.
> 
> Additionally, all mapped pages are considered dirty when physically
> mapped through to an IOMMU, modulo we discussed devices opting in to
> per page pinning to indicate finer granularity with a TBD mechanism to
> figure out if any non-opt-in devices remain.
> 

You mean, in case of device direct assignment (device pass through)?

>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
>>   1 file changed, 23 insertions(+)
>>
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 35b09427ad9f..6fd3822aa610 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
>>   #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
>>   #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
>>   
>> +/**
>> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
>> + *                                     struct vfio_iommu_type1_dirty_bitmap)
>> + *
>> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
>> + * Get dirty pages bitmap of given IO virtual addresses range using
>> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
>> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
>> + * bitmap and should set size of allocated memory in bitmap_size field.
>> + * One bit is used to represent per page consecutively starting from iova
>> + * offset. Bit set indicates page at that offset from iova is dirty.
>> + */
>> +struct vfio_iommu_type1_dirty_bitmap {
>> +	__u32        argsz;
>> +	__u32        flags;
>> +	__u64        iova;                      /* IO virtual address */
>> +	__u64        size;                      /* Size of iova range */
>> +	__u64        bitmap_size;               /* in bytes */
> 
> This seems redundant.  We can calculate the size of the bitmap based on
> the iova size.
>

But in kernel space, we need to validate the size of memory allocated by 
user instead of assuming user is always correct, right?

>> +	void __user *bitmap;                    /* one bit per page */
> 
> Should we define that as a __u64* to (a) help with the size
> calculation, and (b) assure that we can use 8-byte ops on it?
> 
> However, who defines page size?  Is it necessarily the processor page
> size?  A physical IOMMU may support page sizes other than the CPU page
> size.  It might be more important to indicate the expected page size
> than the bitmap size.  Thanks,
>

I see in QEMU and in vfio_iommu_type1 module, page sizes considered for 
mapping are CPU page size, 4K. Do we still need to have such argument?

Thanks,
Kirti

> Alex
> 
>> +};
>> +
>> +#define VFIO_IOMMU_GET_DIRTY_BITMAP             _IO(VFIO_TYPE, VFIO_BASE + 17)
>> +
>>   /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>>   
>>   /*
>
Alex Williamson Nov. 13, 2019, 8:07 p.m. UTC | #3
On Thu, 14 Nov 2019 01:07:21 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/13/2019 4:00 AM, Alex Williamson wrote:
> > On Tue, 12 Nov 2019 22:33:37 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> All pages pinned by vendor driver through vfio_pin_pages API should be
> >> considered as dirty during migration. IOMMU container maintains a list of
> >> all such pinned pages. Added an ioctl defination to get bitmap of such  
> > 
> > definition
> >   
> >> pinned pages for requested IO virtual address range.  
> > 
> > Additionally, all mapped pages are considered dirty when physically
> > mapped through to an IOMMU, modulo we discussed devices opting in to
> > per page pinning to indicate finer granularity with a TBD mechanism to
> > figure out if any non-opt-in devices remain.
> >   
> 
> You mean, in case of device direct assignment (device pass through)?

Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
pinned and mapped, then the correct dirty page set is all mapped pages.
We discussed using the vpfn list as a mechanism for vendor drivers to
reduce their migration footprint, but we also discussed that we would
need a way to determine that all participants in the container have
explicitly pinned their working pages or else we must consider the
entire potential working set as dirty.

> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
> >>   1 file changed, 23 insertions(+)
> >>
> >> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >> index 35b09427ad9f..6fd3822aa610 100644
> >> --- a/include/uapi/linux/vfio.h
> >> +++ b/include/uapi/linux/vfio.h
> >> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
> >>   #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
> >>   #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
> >>   
> >> +/**
> >> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
> >> + *                                     struct vfio_iommu_type1_dirty_bitmap)
> >> + *
> >> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
> >> + * Get dirty pages bitmap of given IO virtual addresses range using
> >> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
> >> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
> >> + * bitmap and should set size of allocated memory in bitmap_size field.
> >> + * One bit is used to represent per page consecutively starting from iova
> >> + * offset. Bit set indicates page at that offset from iova is dirty.
> >> + */
> >> +struct vfio_iommu_type1_dirty_bitmap {
> >> +	__u32        argsz;
> >> +	__u32        flags;
> >> +	__u64        iova;                      /* IO virtual address */
> >> +	__u64        size;                      /* Size of iova range */
> >> +	__u64        bitmap_size;               /* in bytes */  
> > 
> > This seems redundant.  We can calculate the size of the bitmap based on
> > the iova size.
> >  
> 
> But in kernel space, we need to validate the size of memory allocated by 
> user instead of assuming user is always correct, right?

What does it buy us for the user to tell us the size?  They could be
wrong, they could be malicious.  The argsz field on the ioctl is mostly
for the handshake that the user is competent, we should get faults from
the copy-user operation if it's incorrect.
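
For illustration, the calculation being suggested could look roughly like
this in the kernel; the helper name is made up and the range is assumed to
be a multiple of the tracking page size:

/* Sketch: derive the bitmap size from the requested iova range instead
 * of trusting a user-supplied length. */
static u64 dirty_bitmap_bytes(u64 size, u64 pgsize)
{
        return DIV_ROUND_UP(size / pgsize, BITS_PER_BYTE);
}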
 
> >> +	void __user *bitmap;                    /* one bit per page */  
> > 
> > Should we define that as a __u64* to (a) help with the size
> > calculation, and (b) assure that we can use 8-byte ops on it?
> > 
> > However, who defines page size?  Is it necessarily the processor page
> > size?  A physical IOMMU may support page sizes other than the CPU page
> > size.  It might be more important to indicate the expected page size
> > than the bitmap size.  Thanks,
> >  
> 
> I see in QEMU and in vfio_iommu_type1 module, page sizes considered for 
> mapping are CPU page size, 4K. Do we still need to have such argument?

That assumption exists for backwards compatibility prior to supporting
the iova_pgsizes field in vfio_iommu_type1_info.  AFAIK the current
interface has no page size assumptions and we should not add any.
Thanks,

Alex
Kirti Wankhede Nov. 14, 2019, 6:56 p.m. UTC | #4
On 11/14/2019 1:37 AM, Alex Williamson wrote:
> On Thu, 14 Nov 2019 01:07:21 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 11/13/2019 4:00 AM, Alex Williamson wrote:
>>> On Tue, 12 Nov 2019 22:33:37 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>    
>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
>>>> considered as dirty during migration. IOMMU container maintains a list of
>>>> all such pinned pages. Added an ioctl defination to get bitmap of such
>>>
>>> definition
>>>    
>>>> pinned pages for requested IO virtual address range.
>>>
>>> Additionally, all mapped pages are considered dirty when physically
>>> mapped through to an IOMMU, modulo we discussed devices opting in to
>>> per page pinning to indicate finer granularity with a TBD mechanism to
>>> figure out if any non-opt-in devices remain.
>>>    
>>
>> You mean, in case of device direct assignment (device pass through)?
> 
> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> pinned and mapped, then the correct dirty page set is all mapped pages.
> We discussed using the vpfn list as a mechanism for vendor drivers to
> reduce their migration footprint, but we also discussed that we would
> need a way to determine that all participants in the container have
> explicitly pinned their working pages or else we must consider the
> entire potential working set as dirty.
> 

How can vendor driver tell this capability to iommu module? Any suggestions?

>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>> ---
>>>>    include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
>>>>    1 file changed, 23 insertions(+)
>>>>
>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>>> index 35b09427ad9f..6fd3822aa610 100644
>>>> --- a/include/uapi/linux/vfio.h
>>>> +++ b/include/uapi/linux/vfio.h
>>>> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
>>>>    #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
>>>>    #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
>>>>    
>>>> +/**
>>>> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
>>>> + *                                     struct vfio_iommu_type1_dirty_bitmap)
>>>> + *
>>>> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
>>>> + * Get dirty pages bitmap of given IO virtual addresses range using
>>>> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
>>>> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
>>>> + * bitmap and should set size of allocated memory in bitmap_size field.
>>>> + * One bit is used to represent per page consecutively starting from iova
>>>> + * offset. Bit set indicates page at that offset from iova is dirty.
>>>> + */
>>>> +struct vfio_iommu_type1_dirty_bitmap {
>>>> +	__u32        argsz;
>>>> +	__u32        flags;
>>>> +	__u64        iova;                      /* IO virtual address */
>>>> +	__u64        size;                      /* Size of iova range */
>>>> +	__u64        bitmap_size;               /* in bytes */
>>>
>>> This seems redundant.  We can calculate the size of the bitmap based on
>>> the iova size.
>>>   
>>
>> But in kernel space, we need to validate the size of memory allocated by
>> user instead of assuming user is always correct, right?
> 
> What does it buy us for the user to tell us the size?  They could be
> wrong, they could be malicious.  The argsz field on the ioctl is mostly
> for the handshake that the user is competent, we should get faults from
> the copy-user operation if it's incorrect.
>

It is to mainly fail safe.

>>>> +	void __user *bitmap;                    /* one bit per page */
>>>
>>> Should we define that as a __u64* to (a) help with the size
>>> calculation, and (b) assure that we can use 8-byte ops on it?
>>>
>>> However, who defines page size?  Is it necessarily the processor page
>>> size?  A physical IOMMU may support page sizes other than the CPU page
>>> size.  It might be more important to indicate the expected page size
>>> than the bitmap size.  Thanks,
>>>   
>>
>> I see in QEMU and in vfio_iommu_type1 module, page sizes considered for
>> mapping are CPU page size, 4K. Do we still need to have such argument?
> 
> That assumption exists for backwards compatibility prior to supporting
> the iova_pgsizes field in vfio_iommu_type1_info.  AFAIK the current
> interface has no page size assumptions and we should not add any.

So userspace has iova_pgsizes information, which can be input to this 
ioctl. Bitmap should be considering smallest page size. Does that makes 
sense?


Thanks,
Kirti

> Thanks,
> 
> Alex
>
Alex Williamson Nov. 14, 2019, 9:06 p.m. UTC | #5
On Fri, 15 Nov 2019 00:26:07 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/14/2019 1:37 AM, Alex Williamson wrote:
> > On Thu, 14 Nov 2019 01:07:21 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 11/13/2019 4:00 AM, Alex Williamson wrote:  
> >>> On Tue, 12 Nov 2019 22:33:37 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>      
> >>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> >>>> considered as dirty during migration. IOMMU container maintains a list of
> >>>> all such pinned pages. Added an ioctl defination to get bitmap of such  
> >>>
> >>> definition
> >>>      
> >>>> pinned pages for requested IO virtual address range.  
> >>>
> >>> Additionally, all mapped pages are considered dirty when physically
> >>> mapped through to an IOMMU, modulo we discussed devices opting in to
> >>> per page pinning to indicate finer granularity with a TBD mechanism to
> >>> figure out if any non-opt-in devices remain.
> >>>      
> >>
> >> You mean, in case of device direct assignment (device pass through)?  
> > 
> > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> > pinned and mapped, then the correct dirty page set is all mapped pages.
> > We discussed using the vpfn list as a mechanism for vendor drivers to
> > reduce their migration footprint, but we also discussed that we would
> > need a way to determine that all participants in the container have
> > explicitly pinned their working pages or else we must consider the
> > entire potential working set as dirty.
> >   
> 
> How can vendor driver tell this capability to iommu module? Any suggestions?

I think it does so by pinning pages.  Is it acceptable that if the
vendor driver pins any pages, then from that point forward we consider
the IOMMU group dirty page scope to be limited to pinned pages?  There
are complications around non-singleton IOMMU groups, but I think we're
already leaning towards that being a non-worthwhile problem to solve.
So if we require that only singleton IOMMU groups can pin pages and we
pass the IOMMU group as a parameter to
vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
flag on its local vfio_group struct to indicate dirty page scope is
limited to pinned pages.  We might want to keep a flag on the
vfio_iommu struct to indicate if all of the vfio_groups for each
vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
pinned pages as an optimization to avoid walking lists too often.  Then
we could test if vfio_iommu.domain_list is not empty and this new flag
does not limit the dirty page scope, then everything within each
vfio_dma is considered dirty.
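
A rough sketch of the test described above, with the flag name invented
purely for illustration (it is not part of this series):

/* Illustrative only: how the type1 backend might decide whether a
 * vfio_dma's whole range must be reported dirty.  The flag would live on
 * the backend's private vfio_iommu struct, set only when every group in
 * the container has limited its scope by pinning pages. */
static bool vfio_dirty_scope_is_all_mapped(struct vfio_iommu *iommu)
{
        /* An IOMMU-backed domain exists and at least one group has not
         * opted in to pinned-page-only dirty tracking. */
        return !list_empty(&iommu->domain_list) &&
               !iommu->pinned_page_dirty_scope;
}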
 
> >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>> ---
> >>>>    include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
> >>>>    1 file changed, 23 insertions(+)
> >>>>
> >>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >>>> index 35b09427ad9f..6fd3822aa610 100644
> >>>> --- a/include/uapi/linux/vfio.h
> >>>> +++ b/include/uapi/linux/vfio.h
> >>>> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
> >>>>    #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
> >>>>    #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
> >>>>    
> >>>> +/**
> >>>> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
> >>>> + *                                     struct vfio_iommu_type1_dirty_bitmap)
> >>>> + *
> >>>> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
> >>>> + * Get dirty pages bitmap of given IO virtual addresses range using
> >>>> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
> >>>> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
> >>>> + * bitmap and should set size of allocated memory in bitmap_size field.
> >>>> + * One bit is used to represent per page consecutively starting from iova
> >>>> + * offset. Bit set indicates page at that offset from iova is dirty.
> >>>> + */
> >>>> +struct vfio_iommu_type1_dirty_bitmap {
> >>>> +	__u32        argsz;
> >>>> +	__u32        flags;
> >>>> +	__u64        iova;                      /* IO virtual address */
> >>>> +	__u64        size;                      /* Size of iova range */
> >>>> +	__u64        bitmap_size;               /* in bytes */  
> >>>
> >>> This seems redundant.  We can calculate the size of the bitmap based on
> >>> the iova size.
> >>>     
> >>
> >> But in kernel space, we need to validate the size of memory allocated by
> >> user instead of assuming user is always correct, right?  
> > 
> > What does it buy us for the user to tell us the size?  They could be
> > wrong, they could be malicious.  The argsz field on the ioctl is mostly
> > for the handshake that the user is competent, we should get faults from
> > the copy-user operation if it's incorrect.
> >  
> 
> It is to mainly fail safe.
> 
> >>>> +	void __user *bitmap;                    /* one bit per page */  
> >>>
> >>> Should we define that as a __u64* to (a) help with the size
> >>> calculation, and (b) assure that we can use 8-byte ops on it?
> >>>
> >>> However, who defines page size?  Is it necessarily the processor page
> >>> size?  A physical IOMMU may support page sizes other than the CPU page
> >>> size.  It might be more important to indicate the expected page size
> >>> than the bitmap size.  Thanks,
> >>>     
> >>
> >> I see in QEMU and in vfio_iommu_type1 module, page sizes considered for
> >> mapping are CPU page size, 4K. Do we still need to have such argument?  
> > 
> > That assumption exists for backwards compatibility prior to supporting
> > the iova_pgsizes field in vfio_iommu_type1_info.  AFAIK the current
> > interface has no page size assumptions and we should not add any.  
> 
> So userspace has iova_pgsizes information, which can be input to this 
> ioctl. Bitmap should be considering smallest page size. Does that makes 
> sense?

I'm not sure.  I thought I had an argument that the iova_pgsize could
indicate support for sizes smaller than the processor page size, which
would make the user responsible for using a different base for their
page size, but vfio_pgsize_bitmap() already masks out sub-page sizes.
Clearly the vendor driver is pinning based on processor sized pages,
but that's independent of an IOMMU and not part of a user ABI.

I'm tempted to say your bitmap_size field has a use here, but it seems
to fail in validating the user page size at the low extremes.  For
example if we have a single page mapping, the user can specify the iova
size as 4K (for example), but the minimum bitmap_size they can indicate
is 1 byte, would we therefore assume the user's bitmap page size is 512
bytes (ie. they provided us with 8 bits to describe a 4K range)?  We'd
need to be careful to specify that the minimum iova_pgsize indicated
page size is our lower bound as well.  But then what do we do if the
user provides us with a smaller buffer than we expect?  For example, a
128MB iova range and only an 8-byte buffer.  Do we go ahead and assume
a 2MB page size and fill the bitmap accordingly or do we generate an
error?  If the latter, might we support that at some point in time and
is it sufficient to let the user perform trial and error to test if that
exists?  Thanks,

Alex
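
To make the arithmetic in the example above concrete (sketch only):

/* Implied page size if the bitmap length were allowed to define it:
 * bits available = bitmap_size * 8, so page size = size / bits.
 * 4K range,   1-byte bitmap ->  4096 / 8         =  512-byte "pages"
 * 128M range, 8-byte bitmap -> (128 << 20) / 64  =   2M "pages"      */
static u64 implied_pgsize(u64 size, u64 bitmap_size)
{
        return size / (bitmap_size * 8);
}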
Yan Zhao Nov. 15, 2019, 2:40 a.m. UTC | #6
On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
> On Fri, 15 Nov 2019 00:26:07 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 11/14/2019 1:37 AM, Alex Williamson wrote:
> > > On Thu, 14 Nov 2019 01:07:21 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > >> On 11/13/2019 4:00 AM, Alex Williamson wrote:  
> > >>> On Tue, 12 Nov 2019 22:33:37 +0530
> > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>      
> > >>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> > >>>> considered as dirty during migration. IOMMU container maintains a list of
> > >>>> all such pinned pages. Added an ioctl defination to get bitmap of such  
> > >>>
> > >>> definition
> > >>>      
> > >>>> pinned pages for requested IO virtual address range.  
> > >>>
> > >>> Additionally, all mapped pages are considered dirty when physically
> > >>> mapped through to an IOMMU, modulo we discussed devices opting in to
> > >>> per page pinning to indicate finer granularity with a TBD mechanism to
> > >>> figure out if any non-opt-in devices remain.
> > >>>      
> > >>
> > >> You mean, in case of device direct assignment (device pass through)?  
> > > 
> > > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> > > pinned and mapped, then the correct dirty page set is all mapped pages.
> > > We discussed using the vpfn list as a mechanism for vendor drivers to
> > > reduce their migration footprint, but we also discussed that we would
> > > need a way to determine that all participants in the container have
> > > explicitly pinned their working pages or else we must consider the
> > > entire potential working set as dirty.
> > >   
> > 
> > How can vendor driver tell this capability to iommu module? Any suggestions?
> 
> I think it does so by pinning pages.  Is it acceptable that if the
> vendor driver pins any pages, then from that point forward we consider
> the IOMMU group dirty page scope to be limited to pinned pages?  There
> are complications around non-singleton IOMMU groups, but I think we're
> already leaning towards that being a non-worthwhile problem to solve.
> So if we require that only singleton IOMMU groups can pin pages and we
> pass the IOMMU group as a parameter to
> vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> flag on its local vfio_group struct to indicate dirty page scope is
> limited to pinned pages.  We might want to keep a flag on the
> vfio_iommu struct to indicate if all of the vfio_groups for each
> vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> pinned pages as an optimization to avoid walking lists too often.  Then
> we could test if vfio_iommu.domain_list is not empty and this new flag
> does not limit the dirty page scope, then everything within each
> vfio_dma is considered dirty.
>

hi Alex
could you help clarify whether my understandings below are right?
In future,
1. for mdev and for passthrough device withoug hardware ability to track
dirty pages, the vendor driver has to explicitly call
vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page set.

2. for those devices with hardware ability to track dirty pages, will still
provide a callback to vendor driver to get dirty pages. (as for those devices,
it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages())

3. for devices relying on dirty bit info in physical IOMMU, there
will be a callback to physical IOMMU driver to get dirty page set from
vfio.

Thanks
Yan

> > >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >>>> ---
> > >>>>    include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
> > >>>>    1 file changed, 23 insertions(+)
> > >>>>
> > >>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > >>>> index 35b09427ad9f..6fd3822aa610 100644
> > >>>> --- a/include/uapi/linux/vfio.h
> > >>>> +++ b/include/uapi/linux/vfio.h
> > >>>> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
> > >>>>    #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
> > >>>>    #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
> > >>>>    
> > >>>> +/**
> > >>>> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
> > >>>> + *                                     struct vfio_iommu_type1_dirty_bitmap)
> > >>>> + *
> > >>>> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
> > >>>> + * Get dirty pages bitmap of given IO virtual addresses range using
> > >>>> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
> > >>>> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
> > >>>> + * bitmap and should set size of allocated memory in bitmap_size field.
> > >>>> + * One bit is used to represent per page consecutively starting from iova
> > >>>> + * offset. Bit set indicates page at that offset from iova is dirty.
> > >>>> + */
> > >>>> +struct vfio_iommu_type1_dirty_bitmap {
> > >>>> +	__u32        argsz;
> > >>>> +	__u32        flags;
> > >>>> +	__u64        iova;                      /* IO virtual address */
> > >>>> +	__u64        size;                      /* Size of iova range */
> > >>>> +	__u64        bitmap_size;               /* in bytes */  
> > >>>
> > >>> This seems redundant.  We can calculate the size of the bitmap based on
> > >>> the iova size.
> > >>>     
> > >>
> > >> But in kernel space, we need to validate the size of memory allocated by
> > >> user instead of assuming user is always correct, right?  
> > > 
> > > What does it buy us for the user to tell us the size?  They could be
> > > wrong, they could be malicious.  The argsz field on the ioctl is mostly
> > > for the handshake that the user is competent, we should get faults from
> > > the copy-user operation if it's incorrect.
> > >  
> > 
> > It is to mainly fail safe.
> > 
> > >>>> +	void __user *bitmap;                    /* one bit per page */  
> > >>>
> > >>> Should we define that as a __u64* to (a) help with the size
> > >>> calculation, and (b) assure that we can use 8-byte ops on it?
> > >>>
> > >>> However, who defines page size?  Is it necessarily the processor page
> > >>> size?  A physical IOMMU may support page sizes other than the CPU page
> > >>> size.  It might be more important to indicate the expected page size
> > >>> than the bitmap size.  Thanks,
> > >>>     
> > >>
> > >> I see in QEMU and in vfio_iommu_type1 module, page sizes considered for
> > >> mapping are CPU page size, 4K. Do we still need to have such argument?  
> > > 
> > > That assumption exists for backwards compatibility prior to supporting
> > > the iova_pgsizes field in vfio_iommu_type1_info.  AFAIK the current
> > > interface has no page size assumptions and we should not add any.  
> > 
> > So userspace has iova_pgsizes information, which can be input to this 
> > ioctl. Bitmap should be considering smallest page size. Does that makes 
> > sense?
> 
> I'm not sure.  I thought I had an argument that the iova_pgsize could
> indicate support for sizes smaller than the processor page size, which
> would make the user responsible for using a different base for their
> page size, but vfio_pgsize_bitmap() already masks out sub-page sizes.
> Clearly the vendor driver is pinning based on processor sized pages,
> but that's independent of an IOMMU and not part of a user ABI.
> 
> I'm tempted to say your bitmap_size field has a use here, but it seems
> to fail in validating the user page size at the low extremes.  For
> example if we have a single page mapping, the user can specify the iova
> size as 4K (for example), but the minimum bitmap_size they can indicate
> is 1 byte, would we therefore assume the user's bitmap page size is 512
> bytes (ie. they provided us with 8 bits to describe a 4K range)?  We'd
> need to be careful to specify that the minimum iova_pgsize indicated
> page size is our lower bound as well.  But then what do we do if the
> user provides us with a smaller buffer than we expect?  For example, a
> 128MB iova range and only an 8-byte buffer.  Do we go ahead and assume
> a 2MB page size and fill the bitmap accordingly or do we generate an
> error?  If the latter, might we support that at some point in time and
> is it sufficient to let the user perform trial and error to test if that
> exists?  Thanks,
> 
> Alex
>
Alex Williamson Nov. 15, 2019, 3:21 a.m. UTC | #7
On Thu, 14 Nov 2019 21:40:35 -0500
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
> > On Fri, 15 Nov 2019 00:26:07 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> > > On 11/14/2019 1:37 AM, Alex Williamson wrote:  
> > > > On Thu, 14 Nov 2019 01:07:21 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >     
> > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote:    
> > > >>> On Tue, 12 Nov 2019 22:33:37 +0530
> > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >>>        
> > > >>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> > > >>>> considered as dirty during migration. IOMMU container maintains a list of
> > > >>>> all such pinned pages. Added an ioctl defination to get bitmap of such    
> > > >>>
> > > >>> definition
> > > >>>        
> > > >>>> pinned pages for requested IO virtual address range.    
> > > >>>
> > > >>> Additionally, all mapped pages are considered dirty when physically
> > > >>> mapped through to an IOMMU, modulo we discussed devices opting in to
> > > >>> per page pinning to indicate finer granularity with a TBD mechanism to
> > > >>> figure out if any non-opt-in devices remain.
> > > >>>        
> > > >>
> > > >> You mean, in case of device direct assignment (device pass through)?    
> > > > 
> > > > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> > > > pinned and mapped, then the correct dirty page set is all mapped pages.
> > > > We discussed using the vpfn list as a mechanism for vendor drivers to
> > > > reduce their migration footprint, but we also discussed that we would
> > > > need a way to determine that all participants in the container have
> > > > explicitly pinned their working pages or else we must consider the
> > > > entire potential working set as dirty.
> > > >     
> > > 
> > > How can vendor driver tell this capability to iommu module? Any suggestions?  
> > 
> > I think it does so by pinning pages.  Is it acceptable that if the
> > vendor driver pins any pages, then from that point forward we consider
> > the IOMMU group dirty page scope to be limited to pinned pages?  There
> > are complications around non-singleton IOMMU groups, but I think we're
> > already leaning towards that being a non-worthwhile problem to solve.
> > So if we require that only singleton IOMMU groups can pin pages and we
> > pass the IOMMU group as a parameter to
> > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> > flag on its local vfio_group struct to indicate dirty page scope is
> > limited to pinned pages.  We might want to keep a flag on the
> > vfio_iommu struct to indicate if all of the vfio_groups for each
> > vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> > pinned pages as an optimization to avoid walking lists too often.  Then
> > we could test if vfio_iommu.domain_list is not empty and this new flag
> > does not limit the dirty page scope, then everything within each
> > vfio_dma is considered dirty.
> >  
> 
> hi Alex
> could you help clarify whether my understandings below are right?
> In future,
> 1. for mdev and for passthrough device withoug hardware ability to track
> dirty pages, the vendor driver has to explicitly call
> vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page set.

For non-IOMMU backed mdevs without hardware dirty page tracking,
there's no change to the vendor driver currently.  Pages pinned by the
vendor driver are marked as dirty.

For any IOMMU backed device, mdev or direct assignment, all mapped
memory would be considered dirty unless there are explicit calls to pin
pages on top of the IOMMU page pinning and mapping.  These would likely
be enabled only when the device is in the _SAVING device_state.

> 2. for those devices with hardware ability to track dirty pages, will still
> provide a callback to vendor driver to get dirty pages. (as for those devices,
> it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages())
>
> 3. for devices relying on dirty bit info in physical IOMMU, there
> will be a callback to physical IOMMU driver to get dirty page set from
> vfio.

The proposal here does not cover exactly how these would be
implemented, it only establishes the container as the point of user
interaction with the dirty bitmap and hopefully allows us to maintain
that interface regardless of whether we have dirty tracking at the
device or the system IOMMU.  Ideally devices with dirty tracking would
make use of page pinning and we'd extend the interface to allow vendor
drivers the ability to indicate the clean/dirty state of those pinned
pages.  For system IOMMU dirty page tracking, that potentially might
mean that we support IOMMU page faults and the container manages those
faults such that the container is the central record of dirty pages.
Until these interfaces are designed, we can only speculate, but the
goal is to design a user interface compatible with how those features
might evolve.  If you identify something that can't work, please raise
the issue.  Thanks,

Alex
Tian, Kevin Nov. 15, 2019, 5:10 a.m. UTC | #8
> From: Alex Williamson
> Sent: Friday, November 15, 2019 11:22 AM
> 
> On Thu, 14 Nov 2019 21:40:35 -0500
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
> > > On Fri, 15 Nov 2019 00:26:07 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >
> > > > On 11/14/2019 1:37 AM, Alex Williamson wrote:
> > > > > On Thu, 14 Nov 2019 01:07:21 +0530
> > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >
> > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote:
> > > > >>> On Tue, 12 Nov 2019 22:33:37 +0530
> > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >>>
> > > > >>>> All pages pinned by vendor driver through vfio_pin_pages API
> should be
> > > > >>>> considered as dirty during migration. IOMMU container
> maintains a list of
> > > > >>>> all such pinned pages. Added an ioctl defination to get bitmap of
> such
> > > > >>>
> > > > >>> definition
> > > > >>>
> > > > >>>> pinned pages for requested IO virtual address range.
> > > > >>>
> > > > >>> Additionally, all mapped pages are considered dirty when
> physically
> > > > >>> mapped through to an IOMMU, modulo we discussed devices
> opting in to
> > > > >>> per page pinning to indicate finer granularity with a TBD
> mechanism to
> > > > >>> figure out if any non-opt-in devices remain.
> > > > >>>
> > > > >>
> > > > >> You mean, in case of device direct assignment (device pass
> through)?
> > > > >
> > > > > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are
> fully
> > > > > pinned and mapped, then the correct dirty page set is all mapped
> pages.
> > > > > We discussed using the vpfn list as a mechanism for vendor drivers
> to
> > > > > reduce their migration footprint, but we also discussed that we
> would
> > > > > need a way to determine that all participants in the container have
> > > > > explicitly pinned their working pages or else we must consider the
> > > > > entire potential working set as dirty.
> > > > >
> > > >
> > > > How can vendor driver tell this capability to iommu module? Any
> suggestions?
> > >
> > > I think it does so by pinning pages.  Is it acceptable that if the
> > > vendor driver pins any pages, then from that point forward we consider
> > > the IOMMU group dirty page scope to be limited to pinned pages?
> There
> > > are complications around non-singleton IOMMU groups, but I think
> we're
> > > already leaning towards that being a non-worthwhile problem to solve.
> > > So if we require that only singleton IOMMU groups can pin pages and
> we
> > > pass the IOMMU group as a parameter to
> > > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> > > flag on its local vfio_group struct to indicate dirty page scope is
> > > limited to pinned pages.  We might want to keep a flag on the
> > > vfio_iommu struct to indicate if all of the vfio_groups for each
> > > vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> > > pinned pages as an optimization to avoid walking lists too often.  Then
> > > we could test if vfio_iommu.domain_list is not empty and this new flag
> > > does not limit the dirty page scope, then everything within each
> > > vfio_dma is considered dirty.
> > >
> >
> > hi Alex
> > could you help clarify whether my understandings below are right?
> > In future,
> > 1. for mdev and for passthrough device withoug hardware ability to track
> > dirty pages, the vendor driver has to explicitly call
> > vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page set.
> 
> For non-IOMMU backed mdevs without hardware dirty page tracking,
> there's no change to the vendor driver currently.  Pages pinned by the
> vendor driver are marked as dirty.

What about the vendor driver can figure out, in software means, which
pinned pages are actually dirty? In that case, would a separate mark_dirty
interface make more sense? Or introduce read/write flag to the pin_pages
interface similar to DMA API? Existing drivers always set both r/w flags but
just in case then a specific driver may set read-only or write-only...

> 
> For any IOMMU backed device, mdev or direct assignment, all mapped
> memory would be considered dirty unless there are explicit calls to pin
> pages on top of the IOMMU page pinning and mapping.  These would likely
> be enabled only when the device is in the _SAVING device_state.
> 
> > 2. for those devices with hardware ability to track dirty pages, will still
> > provide a callback to vendor driver to get dirty pages. (as for those
> devices,
> > it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages())
> >
> > 3. for devices relying on dirty bit info in physical IOMMU, there
> > will be a callback to physical IOMMU driver to get dirty page set from
> > vfio.
> 
> The proposal here does not cover exactly how these would be
> implemented, it only establishes the container as the point of user
> interaction with the dirty bitmap and hopefully allows us to maintain
> that interface regardless of whether we have dirty tracking at the
> device or the system IOMMU.  Ideally devices with dirty tracking would
> make use of page pinning and we'd extend the interface to allow vendor
> drivers the ability to indicate the clean/dirty state of those pinned

I don't think "dirty tracking" == "page pinning". It's possible that a device
support tracking/logging dirty page info into a driver-registered buffer, 
then the host vendor driver doesn't need to mediate fast-path operations. 
In such case, the entire guest memory is always pinned and we just need 
a log-sync like interface for vendor driver to fill dirty bitmap.
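
Purely as an illustration of the shape such a log-sync callback could take
(no such interface exists in this series; name and parameters are made up):

/* Hypothetical: the container asks the vendor driver to merge the dirty
 * pages it has logged for [iova, iova + size) into the shared bitmap. */
int (*dirty_log_sync)(struct mdev_device *mdev, u64 iova, u64 size,
                      unsigned long *bitmap);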

> pages.  For system IOMMU dirty page tracking, that potentially might
> mean that we support IOMMU page faults and the container manages
> those
> faults such that the container is the central record of dirty pages.

IOMMU dirty-bit is not equivalent to IOMMU page fault. The latter
is much more complex which requires support both in IOMMU and in
device. Here similar to above device-dirty-tracking case, we just need a
log-sync interface calling into iommu driver to get dirty info filled for
requested address range.

> Until these interfaces are designed, we can only speculate, but the
> goal is to design a user interface compatible with how those features
> might evolve.  If you identify something that can't work, please raise
> the issue.  Thanks,
> 
> Alex

Here is the desired scheme in my mind. Feel free to correct me. :-)

1. iommu log-buf callback is preferred if underlying IOMMU reports
such capability. The iommu driver walks IOMMU page table to find
dirty pages for requested address range;
2. otherwise vendor driver log-buf callback is preferred if the vendor
driver reports such capability when registering mdev types. The
vendor driver calls device-specific interface to fill dirty info;
3. otherwise pages pined by vfio_pin_pages (with WRITE flag) are
considered dirty. This covers normal mediated devices or using
fast-path mediation for migrating passthrough device;
4. otherwise all mapped pages are considered dirty;

Currently we're working on 1) based on VT-d rev3.0. I know some
vendors implement 2) in their own code base. 3) has real usages 
already. 4) is the fall-back.
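
Condensing the scheme above into a sketch, with every helper name made up
for illustration:

/* Hypothetical dispatch following the preference order above. */
static int vfio_collect_dirty(struct vfio_iommu *iommu, u64 iova, u64 size,
                              unsigned long *bitmap)
{
        if (iommu_dirty_log_supported(iommu))                       /* 1 */
                return iommu_dirty_log_sync(iommu, iova, size, bitmap);
        if (vendor_dirty_log_supported(iommu))                      /* 2 */
                return vendor_dirty_log_sync(iommu, iova, size, bitmap);
        if (iommu->pinned_page_dirty_scope)                         /* 3 */
                return vfio_bitmap_from_vpfn_list(iommu, iova, size, bitmap);
        bitmap_fill(bitmap, size >> PAGE_SHIFT);                    /* 4 */
        return 0;
}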

Alex, are you willing to have all the interfaces ready in one batch,
or support them based on available usages? I'm fine with either
way, but even just doing 3/4 in this series, I'd prefer to having
above scheme included in the code comment, to give the whole 
picture of all possible situations. :-)

Thanks
Kevin
Alex Williamson Nov. 19, 2019, 11:16 p.m. UTC | #9
On Fri, 15 Nov 2019 05:10:53 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson
> > Sent: Friday, November 15, 2019 11:22 AM
> > 
> > On Thu, 14 Nov 2019 21:40:35 -0500
> > Yan Zhao <yan.y.zhao@intel.com> wrote:
> >   
> > > On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:  
> > > > On Fri, 15 Nov 2019 00:26:07 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >  
> > > > > On 11/14/2019 1:37 AM, Alex Williamson wrote:  
> > > > > > On Thu, 14 Nov 2019 01:07:21 +0530
> > > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > >  
> > > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote:  
> > > > > >>> On Tue, 12 Nov 2019 22:33:37 +0530
> > > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > >>>  
> > > > > >>>> All pages pinned by vendor driver through vfio_pin_pages API  
> > should be  
> > > > > >>>> considered as dirty during migration. IOMMU container  
> > maintains a list of  
> > > > > >>>> all such pinned pages. Added an ioctl defination to get bitmap of  
> > such  
> > > > > >>>
> > > > > >>> definition
> > > > > >>>  
> > > > > >>>> pinned pages for requested IO virtual address range.  
> > > > > >>>
> > > > > >>> Additionally, all mapped pages are considered dirty when  
> > physically  
> > > > > >>> mapped through to an IOMMU, modulo we discussed devices  
> > opting in to  
> > > > > >>> per page pinning to indicate finer granularity with a TBD  
> > mechanism to  
> > > > > >>> figure out if any non-opt-in devices remain.
> > > > > >>>  
> > > > > >>
> > > > > >> You mean, in case of device direct assignment (device pass  
> > through)?  
> > > > > >
> > > > > > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are  
> > fully  
> > > > > > pinned and mapped, then the correct dirty page set is all mapped  
> > pages.  
> > > > > > We discussed using the vpfn list as a mechanism for vendor drivers  
> > to  
> > > > > > reduce their migration footprint, but we also discussed that we  
> > would  
> > > > > > need a way to determine that all participants in the container have
> > > > > > explicitly pinned their working pages or else we must consider the
> > > > > > entire potential working set as dirty.
> > > > > >  
> > > > >
> > > > > How can vendor driver tell this capability to iommu module? Any  
> > suggestions?  
> > > >
> > > > I think it does so by pinning pages.  Is it acceptable that if the
> > > > vendor driver pins any pages, then from that point forward we consider
> > > > the IOMMU group dirty page scope to be limited to pinned pages?  
> > There  
> > > > are complications around non-singleton IOMMU groups, but I think  
> > we're  
> > > > already leaning towards that being a non-worthwhile problem to solve.
> > > > So if we require that only singleton IOMMU groups can pin pages and  
> > we  
> > > > pass the IOMMU group as a parameter to
> > > > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> > > > flag on its local vfio_group struct to indicate dirty page scope is
> > > > limited to pinned pages.  We might want to keep a flag on the
> > > > vfio_iommu struct to indicate if all of the vfio_groups for each
> > > > vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> > > > pinned pages as an optimization to avoid walking lists too often.  Then
> > > > we could test if vfio_iommu.domain_list is not empty and this new flag
> > > > does not limit the dirty page scope, then everything within each
> > > > vfio_dma is considered dirty.
> > > >  
> > >
> > > hi Alex
> > > could you help clarify whether my understandings below are right?
> > > In future,
> > > 1. for mdev and for passthrough device withoug hardware ability to track
> > > dirty pages, the vendor driver has to explicitly call
> > > vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page set.  
> > 
> > For non-IOMMU backed mdevs without hardware dirty page tracking,
> > there's no change to the vendor driver currently.  Pages pinned by the
> > vendor driver are marked as dirty.  
> 
> What about the vendor driver can figure out, in software means, which
> pinned pages are actually dirty? In that case, would a separate mark_dirty
> interface make more sense? Or introduce read/write flag to the pin_pages
> interface similar to DMA API? Existing drivers always set both r/w flags but
> just in case then a specific driver may set read-only or write-only...

You're jumping ahead to 2. below, where my reply is that we need to
extend the interface to allow the vendor driver to manipulate
clean/dirty state.  I don't know exactly what those interfaces should
look like, but yes, something should exist to allow that control.  If
the default is to mark pinned pages dirty, then we might need a
separate pin_pages_clean callback.
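
A purely hypothetical sketch of such a callback; the parameter list simply
mirrors the existing pin_pages() for illustration and nothing here is a
settled interface:

struct vfio_iommu_driver_ops {
        /* ...existing callbacks unchanged... */

        /* Hypothetical: like pin_pages(), but pinned pages start out clean;
         * the vendor driver would mark them dirty later once it knows the
         * device has written them. */
        int     (*pin_pages_clean)(void *iommu_data, unsigned long *user_pfn,
                                   int npage, int prot,
                                   unsigned long *phys_pfn);
};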

> > For any IOMMU backed device, mdev or direct assignment, all mapped
> > memory would be considered dirty unless there are explicit calls to pin
> > pages on top of the IOMMU page pinning and mapping.  These would likely
> > be enabled only when the device is in the _SAVING device_state.
> >   
> > > 2. for those devices with hardware ability to track dirty pages, will still
> > > provide a callback to vendor driver to get dirty pages. (as for those  
> > devices,  
> > > it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages())
> > >
> > > 3. for devices relying on dirty bit info in physical IOMMU, there
> > > will be a callback to physical IOMMU driver to get dirty page set from
> > > vfio.  
> > 
> > The proposal here does not cover exactly how these would be
> > implemented, it only establishes the container as the point of user
> > interaction with the dirty bitmap and hopefully allows us to maintain
> > that interface regardless of whether we have dirty tracking at the
> > device or the system IOMMU.  Ideally devices with dirty tracking would
> > make use of page pinning and we'd extend the interface to allow vendor
> > drivers the ability to indicate the clean/dirty state of those pinned  
> 
> I don't think "dirty tracking" == "page pinning". It's possible that a device
> support tracking/logging dirty page info into a driver-registered buffer, 
> then the host vendor driver doesn't need to mediate fast-path operations. 
> In such case, the entire guest memory is always pinned and we just need 
> a log-sync like interface for vendor driver to fill dirty bitmap.

An mdev device only has access to the pages that have been pinned on
behalf of the device, so just as we assume that any page pinned and
mapped through the IOMMU might be dirtied by a device, we can by
default assume that an page pinned for an mdev device is dirty.  This
maps fairly well to our existing mdev devices that don't seem to have
finer granularity dirty page tracking.  As I state above though, I
certainly don't expect this to be the final extent of dirty page
tracking.  I'm reluctant to commit to a log-sync interface though as
that seems to put the responsibility on the container to poll every
attached device whereas I was rather hoping that making the container
the central interface for dirty tracking would have devices marking
dirty pages in the container asynchronous to the user polling.

> > pages.  For system IOMMU dirty page tracking, that potentially might
> > mean that we support IOMMU page faults and the container manages
> > those
> > faults such that the container is the central record of dirty pages.  
> 
> IOMMU dirty-bit is not equivalent to IOMMU page fault. The latter
> is much more complex which requires support both in IOMMU and in
> device. Here similar to above device-dirty-tracking case, we just need a
> log-sync interface calling into iommu driver to get dirty info filled for
> requested address range.
> 
> > Until these interfaces are designed, we can only speculate, but the
> > goal is to design a user interface compatible with how those features
> > might evolve.  If you identify something that can't work, please raise
> > the issue.  Thanks,
> > 
> > Alex  
> 
> Here is the desired scheme in my mind. Feel free to correct me. :-)
> 
> 1. iommu log-buf callback is preferred if underlying IOMMU reports
> such capability. The iommu driver walks IOMMU page table to find
> dirty pages for requested address range;
> 2. otherwise vendor driver log-buf callback is preferred if the vendor
> driver reports such capability when registering mdev types. The
> vendor driver calls device-specific interface to fill dirty info;
> 3. otherwise pages pined by vfio_pin_pages (with WRITE flag) are
> considered dirty. This covers normal mediated devices or using
> fast-path mediation for migrating passthrough device;
> 4. otherwise all mapped pages are considered dirty;
> 
> Currently we're working on 1) based on VT-d rev3.0. I know some
> vendors implement 2) in their own code base. 3) has real usages 
> already. 4) is the fall-back.
> 
> Alex, are you willing to have all the interfaces ready in one batch,
> or support them based on available usages? I'm fine with either
> way, but even just doing 3/4 in this series, I'd prefer to having
> above scheme included in the code comment, to give the whole 
> picture of all possible situations. :-)

My intention was to cover 3 & 4 given the current state of device and
IOMMU dirty tracking.  I expect the user interface to remain unchanged
for 1 & 2 but the API between vfio, the vendor drivers, and the IOMMU
is internal to the kernel and more flexible.  Thanks,

Alex
Tian, Kevin Nov. 20, 2019, 1:04 a.m. UTC | #10
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, November 20, 2019 7:17 AM
> 
> On Fri, 15 Nov 2019 05:10:53 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson
> > > Sent: Friday, November 15, 2019 11:22 AM
> > >
> > > On Thu, 14 Nov 2019 21:40:35 -0500
> > > Yan Zhao <yan.y.zhao@intel.com> wrote:
> > >
> > > > On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
> > > > > On Fri, 15 Nov 2019 00:26:07 +0530
> > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >
> > > > > > On 11/14/2019 1:37 AM, Alex Williamson wrote:
> > > > > > > On Thu, 14 Nov 2019 01:07:21 +0530
> > > > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > > >
> > > > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote:
> > > > > > >>> On Tue, 12 Nov 2019 22:33:37 +0530
> > > > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > > >>>
> > > > > > >>>> All pages pinned by vendor driver through vfio_pin_pages
> API
> > > should be
> > > > > > >>>> considered as dirty during migration. IOMMU container
> > > maintains a list of
> > > > > > >>>> all such pinned pages. Added an ioctl defination to get
> bitmap of
> > > such
> > > > > > >>>
> > > > > > >>> definition
> > > > > > >>>
> > > > > > >>>> pinned pages for requested IO virtual address range.
> > > > > > >>>
> > > > > > >>> Additionally, all mapped pages are considered dirty when
> > > physically
> > > > > > >>> mapped through to an IOMMU, modulo we discussed devices
> > > opting in to
> > > > > > >>> per page pinning to indicate finer granularity with a TBD
> > > mechanism to
> > > > > > >>> figure out if any non-opt-in devices remain.
> > > > > > >>>
> > > > > > >>
> > > > > > >> You mean, in case of device direct assignment (device pass
> > > through)?
> > > > > > >
> > > > > > > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are
> > > fully
> > > > > > > pinned and mapped, then the correct dirty page set is all
> mapped
> > > pages.
> > > > > > > We discussed using the vpfn list as a mechanism for vendor
> drivers
> > > to
> > > > > > > reduce their migration footprint, but we also discussed that we
> > > would
> > > > > > > need a way to determine that all participants in the container
> have
> > > > > > > explicitly pinned their working pages or else we must consider
> the
> > > > > > > entire potential working set as dirty.
> > > > > > >
> > > > > >
> > > > > > How can vendor driver tell this capability to iommu module? Any
> > > suggestions?
> > > > >
> > > > > I think it does so by pinning pages.  Is it acceptable that if the
> > > > > vendor driver pins any pages, then from that point forward we
> consider
> > > > > the IOMMU group dirty page scope to be limited to pinned pages?
> > > There
> > > > > are complications around non-singleton IOMMU groups, but I think
> > > we're
> > > > > already leaning towards that being a non-worthwhile problem to
> solve.
> > > > > So if we require that only singleton IOMMU groups can pin pages
> and
> > > we
> > > > > pass the IOMMU group as a parameter to
> > > > > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can
> set a
> > > > > flag on its local vfio_group struct to indicate dirty page scope is
> > > > > limited to pinned pages.  We might want to keep a flag on the
> > > > > vfio_iommu struct to indicate if all of the vfio_groups for each
> > > > > vfio_domain in the vfio_iommu.domain_list dirty page scope limited
> to
> > > > > pinned pages as an optimization to avoid walking lists too often.
> Then
> > > > > we could test if vfio_iommu.domain_list is not empty and this new
> flag
> > > > > does not limit the dirty page scope, then everything within each
> > > > > vfio_dma is considered dirty.
> > > > >
> > > >
> > > > hi Alex
> > > > could you help clarify whether my understandings below are right?
> > > > In future,
> > > > 1. for mdev and for passthrough device withoug hardware ability to
> track
> > > > dirty pages, the vendor driver has to explicitly call
> > > > vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page
> set.
> > >
> > > For non-IOMMU backed mdevs without hardware dirty page tracking,
> > > there's no change to the vendor driver currently.  Pages pinned by the
> > > vendor driver are marked as dirty.
> >
> > What about the vendor driver can figure out, in software means, which
> > pinned pages are actually dirty? In that case, would a separate mark_dirty
> > interface make more sense? Or introduce read/write flag to the
> pin_pages
> > interface similar to DMA API? Existing drivers always set both r/w flags
> but
> > just in case then a specific driver may set read-only or write-only...
> 
> You're jumping ahead to 2. below, where my reply is that we need to

They are different. 2) is about hardware support, where the device may
collect dirty page info in a log buffer and report it to the driver when
the latter requests it. Here I'm talking about a software approach, i.e.
when the vendor driver intercepts certain guest operations, it may figure
out whether the captured DMA pages are used for write or read. The
hardware approach is log-based, which needs a driver callback so the
container can notify the vendor driver to report. The software approach
is trap-based, which needs a VFIO API to update dirty status synchronously.
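
As a very rough illustration, the trap-based side could be a synchronous
kernel-internal call along these lines; the name and signature are purely
hypothetical, nothing like this exists in the posted series:

/*
 * Hypothetical helper for the trap-based (software) case: the vendor
 * driver marks pages dirty in the container as it intercepts guest
 * operations.  Name and signature are illustrative only.
 */
int vfio_dma_mark_dirty(struct device *dev, dma_addr_t iova,
			unsigned long npages);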

> extend the interface to allow the vendor driver to manipulate
> clean/dirty state.  I don't know exactly what those interfaces should
> look like, but yes, something should exist to allow that control.  If
> the default is to mark pinned pages dirty, then we might need a
> separate pin_pages_clean callback.
> 
> > > For any IOMMU backed device, mdev or direct assignment, all mapped
> > > memory would be considered dirty unless there are explicit calls to pin
> > > pages on top of the IOMMU page pinning and mapping.  These would
> likely
> > > be enabled only when the device is in the _SAVING device_state.
> > >
> > > > 2. for those devices with hardware ability to track dirty pages, will still
> > > > provide a callback to vendor driver to get dirty pages. (as for those
> > > devices,
> > > > it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages())
> > > >
> > > > 3. for devices relying on dirty bit info in physical IOMMU, there
> > > > will be a callback to physical IOMMU driver to get dirty page set from
> > > > vfio.
> > >
> > > The proposal here does not cover exactly how these would be
> > > implemented, it only establishes the container as the point of user
> > > interaction with the dirty bitmap and hopefully allows us to maintain
> > > that interface regardless of whether we have dirty tracking at the
> > > device or the system IOMMU.  Ideally devices with dirty tracking would
> > > make use of page pinning and we'd extend the interface to allow
> vendor
> > > drivers the ability to indicate the clean/dirty state of those pinned
> >
> > I don't think "dirty tracking" == "page pinning". It's possible that a device
> > support tracking/logging dirty page info into a driver-registered buffer,
> > then the host vendor driver doesn't need to mediate fast-path operations.
> > In such case, the entire guest memory is always pinned and we just need
> > a log-sync like interface for vendor driver to fill dirty bitmap.
> 
> An mdev device only has access to the pages that have been pinned on
> behalf of the device, so just as we assume that any page pinned and
> mapped through the IOMMU might be dirtied by a device, we can by
> default assume that an page pinned for an mdev device is dirty.  This
> maps fairly well to our existing mdev devices that don't seem to have
> finer granularity dirty page tracking.  As I state above though, I
> certainly don't expect this to be the final extent of dirty page
> tracking.  I'm reluctant to commit to a log-sync interface though as
> that seems to put the responsibility on the container to poll every
> attached device whereas I was rather hoping that making the container
> the central interface for dirty tracking would have devices marking
> dirty pages in the container asynchronous to the user polling.

Having the device mark dirty pages asynchronously only applies to the
software approach, which tracks dirty pages by mediating fast-path
guest operations. In case the device logs dirty info into a buffer,
we need a log-sync interface so the vendor driver can be notified to
collect the hardware-logged information. Whether to have the vendor driver
directly update the container's dirty bitmap, or have it call mark_dirty
for every recorded dirty page, is just an implementation choice and you
make the decision. :-) This is actually similar to IOMMU dirty-bit
collection, where we also need an interface to notify the IOMMU driver
to collect its dirty bits.
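
A log-sync style callback, by contrast, might be shaped roughly as below;
again, the struct and member names are invented for illustration:

/*
 * Hypothetical log-sync callback: the container asks the vendor driver
 * (or IOMMU driver) to flush hardware-logged dirty info into a bitmap
 * covering the requested IOVA range.  Illustrative only.
 */
struct vfio_dirty_log_ops {
	int (*log_sync)(void *device_data, dma_addr_t iova, size_t size,
			unsigned long *bitmap);
};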

> 
> > > pages.  For system IOMMU dirty page tracking, that potentially might
> > > mean that we support IOMMU page faults and the container manages
> > > those
> > > faults such that the container is the central record of dirty pages.
> >
> > IOMMU dirty-bit is not equivalent to IOMMU page fault. The latter
> > is much more complex which requires support both in IOMMU and in
> > device. Here similar to above device-dirty-tracking case, we just need a
> > log-sync interface calling into iommu driver to get dirty info filled for
> > requested address range.
> >
> > > Until these interfaces are designed, we can only speculate, but the
> > > goal is to design a user interface compatible with how those features
> > > might evolve.  If you identify something that can't work, please raise
> > > the issue.  Thanks,
> > >
> > > Alex
> >
> > Here is the desired scheme in my mind. Feel free to correct me. :-)
> >
> > 1. iommu log-buf callback is preferred if underlying IOMMU reports
> > such capability. The iommu driver walks IOMMU page table to find
> > dirty pages for requested address range;
> > 2. otherwise vendor driver log-buf callback is preferred if the vendor
> > driver reports such capability when registering mdev types. The
> > vendor driver calls device-specific interface to fill dirty info;
> > 3. otherwise pages pined by vfio_pin_pages (with WRITE flag) are
> > considered dirty. This covers normal mediated devices or using
> > fast-path mediation for migrating passthrough device;
> > 4. otherwise all mapped pages are considered dirty;
> >
> > Currently we're working on 1) based on VT-d rev3.0. I know some
> > vendors implement 2) in their own code base. 3) has real usages
> > already. 4) is the fall-back.
> >
> > Alex, are you willing to have all the interfaces ready in one batch,
> > or support them based on available usages? I'm fine with either
> > way, but even just doing 3/4 in this series, I'd prefer to having
> > above scheme included in the code comment, to give the whole
> > picture of all possible situations. :-)
> 
> My intention was to cover 3 & 4 given the current state of device and
> IOMMU dirty tracking.  I expect the user interface to remain unchanged
> for 1 & 2 but the API between vfio, the vendor drivers, and the IOMMU
> is internal to the kernel and more flexible.  Thanks,
> 

Sure. I also don't expect any change to the user space API when extending
to 1 and 2; here our discussion is purely about kernel-internal APIs.

Thanks
Kevin
Yan Zhao Nov. 20, 2019, 1:51 a.m. UTC | #11
On Fri, Nov 15, 2019 at 11:21:33AM +0800, Alex Williamson wrote:
> On Thu, 14 Nov 2019 21:40:35 -0500
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
> > > On Fri, 15 Nov 2019 00:26:07 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > > > On 11/14/2019 1:37 AM, Alex Williamson wrote:  
> > > > > On Thu, 14 Nov 2019 01:07:21 +0530
> > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >     
> > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote:    
> > > > >>> On Tue, 12 Nov 2019 22:33:37 +0530
> > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >>>        
> > > > >>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> > > > >>>> considered as dirty during migration. IOMMU container maintains a list of
> > > > >>>> all such pinned pages. Added an ioctl defination to get bitmap of such    
> > > > >>>
> > > > >>> definition
> > > > >>>        
> > > > >>>> pinned pages for requested IO virtual address range.    
> > > > >>>
> > > > >>> Additionally, all mapped pages are considered dirty when physically
> > > > >>> mapped through to an IOMMU, modulo we discussed devices opting in to
> > > > >>> per page pinning to indicate finer granularity with a TBD mechanism to
> > > > >>> figure out if any non-opt-in devices remain.
> > > > >>>        
> > > > >>
> > > > >> You mean, in case of device direct assignment (device pass through)?    
> > > > > 
> > > > > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> > > > > pinned and mapped, then the correct dirty page set is all mapped pages.
> > > > > We discussed using the vpfn list as a mechanism for vendor drivers to
> > > > > reduce their migration footprint, but we also discussed that we would
> > > > > need a way to determine that all participants in the container have
> > > > > explicitly pinned their working pages or else we must consider the
> > > > > entire potential working set as dirty.
> > > > >     
> > > > 
> > > > How can vendor driver tell this capability to iommu module? Any suggestions?  
> > > 
> > > I think it does so by pinning pages.  Is it acceptable that if the
> > > vendor driver pins any pages, then from that point forward we consider
> > > the IOMMU group dirty page scope to be limited to pinned pages?  There
> > > are complications around non-singleton IOMMU groups, but I think we're
> > > already leaning towards that being a non-worthwhile problem to solve.
> > > So if we require that only singleton IOMMU groups can pin pages and we
> > > pass the IOMMU group as a parameter to
> > > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> > > flag on its local vfio_group struct to indicate dirty page scope is
> > > limited to pinned pages.  We might want to keep a flag on the
> > > vfio_iommu struct to indicate if all of the vfio_groups for each
> > > vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> > > pinned pages as an optimization to avoid walking lists too often.  Then
> > > we could test if vfio_iommu.domain_list is not empty and this new flag
> > > does not limit the dirty page scope, then everything within each
> > > vfio_dma is considered dirty.
> > >  
> > 
> > hi Alex
> > could you help clarify whether my understandings below are right?
> > In future,
> > 1. for mdev and for passthrough device withoug hardware ability to track
> > dirty pages, the vendor driver has to explicitly call
> > vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page set.
> 
> For non-IOMMU backed mdevs without hardware dirty page tracking,
> there's no change to the vendor driver currently.  Pages pinned by the
> vendor driver are marked as dirty.
> 
> For any IOMMU backed device, mdev or direct assignment, all mapped
> memory would be considered dirty unless there are explicit calls to pin
> pages on top of the IOMMU page pinning and mapping.  These would likely
> be enabled only when the device is in the _SAVING device_state.
> 
> > 2. for those devices with hardware ability to track dirty pages, will still
> > provide a callback to vendor driver to get dirty pages. (as for those devices,
> > it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages())
> >
> > 3. for devices relying on dirty bit info in physical IOMMU, there
> > will be a callback to physical IOMMU driver to get dirty page set from
> > vfio.
> 
> The proposal here does not cover exactly how these would be
> implemented, it only establishes the container as the point of user
> interaction with the dirty bitmap and hopefully allows us to maintain
> that interface regardless of whether we have dirty tracking at the
> device or the system IOMMU.  Ideally devices with dirty tracking would
> make use of page pinning and we'd extend the interface to allow vendor
> drivers the ability to indicate the clean/dirty state of those pinned
> pages.  For system IOMMU dirty page tracking, that potentially might
> mean that we support IOMMU page faults and the container manages those
> faults such that the container is the central record of dirty pages.
> Until these interfaces are designed, we can only speculate, but the
> goal is to design a user interface compatible with how those features
> might evolve.  If you identify something that can't work, please raise
> the issue.  Thanks,
> 
> Alex
>
hi Alex
I think there are two downsides to centralizing dirty page tracking in the
vfio container.
1. The vendor driver has to report dirty pages to the vfio container
immediately after it detects a dirty page.
It loses the freedom to record dirty pages in whatever way it likes and
query them on receiving a log_sync call,
e.g. it could record dirty page info in its internal hardware buffer or
record it in the hardware IOMMU and query that by itself.

2. The vfio container, if based on pin_pages, only knows which pages are
dirty or not dirty, but doesn't know any incremental information. That's
why Kirti's QEMU implementation only queries dirty pages after stopping the
device, right? But in that case QEMU migration may generate a wrong
downtime expectation and cause a longer downtime. E.g. before stopping the
device, QEMU thinks there's only 100M of data left and can limit migration
downtime to a certain value, but after stopping the device and querying
dirty pages again, it finds out there are actually 1000M more...

Those are my concerns with it.

Thanks
Yan
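
To make the downtime concern above concrete, a minimal sketch of the
estimate involved; the bandwidth and data sizes are invented for
illustration:

#include <stdio.h>

int main(void)
{
	double bandwidth_mb_s = 1000.0;		/* assumed migration bandwidth */
	double believed_left_mb = 100.0;	/* data QEMU thinks remains */
	double actual_left_mb = 100.0 + 1000.0;	/* found after stopping device */

	/* The downtime estimate is simply remaining data over bandwidth. */
	printf("expected downtime: %.2f s\n", believed_left_mb / bandwidth_mb_s);
	printf("actual downtime:   %.2f s\n", actual_left_mb / bandwidth_mb_s);
	return 0;
}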
Yan Zhao Nov. 26, 2019, 12:57 a.m. UTC | #12
On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
> On Fri, 15 Nov 2019 00:26:07 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 11/14/2019 1:37 AM, Alex Williamson wrote:
> > > On Thu, 14 Nov 2019 01:07:21 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > >> On 11/13/2019 4:00 AM, Alex Williamson wrote:  
> > >>> On Tue, 12 Nov 2019 22:33:37 +0530
> > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>      
> > >>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> > >>>> considered as dirty during migration. IOMMU container maintains a list of
> > >>>> all such pinned pages. Added an ioctl defination to get bitmap of such  
> > >>>
> > >>> definition
> > >>>      
> > >>>> pinned pages for requested IO virtual address range.  
> > >>>
> > >>> Additionally, all mapped pages are considered dirty when physically
> > >>> mapped through to an IOMMU, modulo we discussed devices opting in to
> > >>> per page pinning to indicate finer granularity with a TBD mechanism to
> > >>> figure out if any non-opt-in devices remain.
> > >>>      
> > >>
> > >> You mean, in case of device direct assignment (device pass through)?  
> > > 
> > > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> > > pinned and mapped, then the correct dirty page set is all mapped pages.
> > > We discussed using the vpfn list as a mechanism for vendor drivers to
> > > reduce their migration footprint, but we also discussed that we would
> > > need a way to determine that all participants in the container have
> > > explicitly pinned their working pages or else we must consider the
> > > entire potential working set as dirty.
> > >   
> > 
> > How can vendor driver tell this capability to iommu module? Any suggestions?
> 
> I think it does so by pinning pages.  Is it acceptable that if the
> vendor driver pins any pages, then from that point forward we consider
> the IOMMU group dirty page scope to be limited to pinned pages?  There
we should also be aware that the dirty page scope is pinned pages + unpinned
pages, which means that once a page is pinned, it should be regarded as dirty
no matter whether it's unpinned later. Only after log_sync is called and the
dirty info is retrieved should its dirty state be cleared.

> are complications around non-singleton IOMMU groups, but I think we're
> already leaning towards that being a non-worthwhile problem to solve.
> So if we require that only singleton IOMMU groups can pin pages and we
> pass the IOMMU group as a parameter to
> vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> flag on its local vfio_group struct to indicate dirty page scope is
> limited to pinned pages.  We might want to keep a flag on the
> vfio_iommu struct to indicate if all of the vfio_groups for each
> vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> pinned pages as an optimization to avoid walking lists too often.  Then
> we could test if vfio_iommu.domain_list is not empty and this new flag
> does not limit the dirty page scope, then everything within each
> vfio_dma is considered dirty.
>  
> > >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >>>> ---
> > >>>>    include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
> > >>>>    1 file changed, 23 insertions(+)
> > >>>>
> > >>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > >>>> index 35b09427ad9f..6fd3822aa610 100644
> > >>>> --- a/include/uapi/linux/vfio.h
> > >>>> +++ b/include/uapi/linux/vfio.h
> > >>>> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
> > >>>>    #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
> > >>>>    #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
> > >>>>    
> > >>>> +/**
> > >>>> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
> > >>>> + *                                     struct vfio_iommu_type1_dirty_bitmap)
> > >>>> + *
> > >>>> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
> > >>>> + * Get dirty pages bitmap of given IO virtual addresses range using
> > >>>> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
> > >>>> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
> > >>>> + * bitmap and should set size of allocated memory in bitmap_size field.
> > >>>> + * One bit is used to represent per page consecutively starting from iova
> > >>>> + * offset. Bit set indicates page at that offset from iova is dirty.
> > >>>> + */
> > >>>> +struct vfio_iommu_type1_dirty_bitmap {
> > >>>> +	__u32        argsz;
> > >>>> +	__u32        flags;
> > >>>> +	__u64        iova;                      /* IO virtual address */
> > >>>> +	__u64        size;                      /* Size of iova range */
> > >>>> +	__u64        bitmap_size;               /* in bytes */  
> > >>>
> > >>> This seems redundant.  We can calculate the size of the bitmap based on
> > >>> the iova size.
> > >>>     
> > >>
> > >> But in kernel space, we need to validate the size of memory allocated by
> > >> user instead of assuming user is always correct, right?  
> > > 
> > > What does it buy us for the user to tell us the size?  They could be
> > > wrong, they could be malicious.  The argsz field on the ioctl is mostly
> > > for the handshake that the user is competent, we should get faults from
> > > the copy-user operation if it's incorrect.
> > >  
> > 
> > It is to mainly fail safe.
> > 
> > >>>> +	void __user *bitmap;                    /* one bit per page */  
> > >>>
> > >>> Should we define that as a __u64* to (a) help with the size
> > >>> calculation, and (b) assure that we can use 8-byte ops on it?
> > >>>
> > >>> However, who defines page size?  Is it necessarily the processor page
> > >>> size?  A physical IOMMU may support page sizes other than the CPU page
> > >>> size.  It might be more important to indicate the expected page size
> > >>> than the bitmap size.  Thanks,
> > >>>     
> > >>
> > >> I see in QEMU and in vfio_iommu_type1 module, page sizes considered for
> > >> mapping are CPU page size, 4K. Do we still need to have such argument?  
> > > 
> > > That assumption exists for backwards compatibility prior to supporting
> > > the iova_pgsizes field in vfio_iommu_type1_info.  AFAIK the current
> > > interface has no page size assumptions and we should not add any.  
> > 
> > So userspace has iova_pgsizes information, which can be input to this 
> > ioctl. Bitmap should be considering smallest page size. Does that makes 
> > sense?
> 
> I'm not sure.  I thought I had an argument that the iova_pgsize could
> indicate support for sizes smaller than the processor page size, which
> would make the user responsible for using a different base for their
> page size, but vfio_pgsize_bitmap() already masks out sub-page sizes.
> Clearly the vendor driver is pinning based on processor sized pages,
> but that's independent of an IOMMU and not part of a user ABI.
> 
> I'm tempted to say your bitmap_size field has a use here, but it seems
> to fail in validating the user page size at the low extremes.  For
> example if we have a single page mapping, the user can specify the iova
> size as 4K (for example), but the minimum bitmap_size they can indicate
> is 1 byte, would we therefore assume the user's bitmap page size is 512
> bytes (ie. they provided us with 8 bits to describe a 4K range)?  We'd
> need to be careful to specify that the minimum iova_pgsize indicated
> page size is our lower bound as well.  But then what do we do if the
> user provides us with a smaller buffer than we expect?  For example, a
> 128MB iova range and only an 8-byte buffer.  Do we go ahead and assume
> a 2MB page size and fill the bitmap accordingly or do we generate an
> error?  If the latter, might we support that at some point in time and
> is it sufficient to let the user perform trial and error to test if that
> exists?  Thanks,
> 
> Alex
>
Alex Williamson Dec. 3, 2019, 6:04 p.m. UTC | #13
On Mon, 25 Nov 2019 19:57:39 -0500
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
> > On Fri, 15 Nov 2019 00:26:07 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> > > On 11/14/2019 1:37 AM, Alex Williamson wrote:  
> > > > On Thu, 14 Nov 2019 01:07:21 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >     
> > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote:    
> > > >>> On Tue, 12 Nov 2019 22:33:37 +0530
> > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >>>        
> > > >>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> > > >>>> considered as dirty during migration. IOMMU container maintains a list of
> > > >>>> all such pinned pages. Added an ioctl defination to get bitmap of such    
> > > >>>
> > > >>> definition
> > > >>>        
> > > >>>> pinned pages for requested IO virtual address range.    
> > > >>>
> > > >>> Additionally, all mapped pages are considered dirty when physically
> > > >>> mapped through to an IOMMU, modulo we discussed devices opting in to
> > > >>> per page pinning to indicate finer granularity with a TBD mechanism to
> > > >>> figure out if any non-opt-in devices remain.
> > > >>>        
> > > >>
> > > >> You mean, in case of device direct assignment (device pass through)?    
> > > > 
> > > > Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> > > > pinned and mapped, then the correct dirty page set is all mapped pages.
> > > > We discussed using the vpfn list as a mechanism for vendor drivers to
> > > > reduce their migration footprint, but we also discussed that we would
> > > > need a way to determine that all participants in the container have
> > > > explicitly pinned their working pages or else we must consider the
> > > > entire potential working set as dirty.
> > > >     
> > > 
> > > How can vendor driver tell this capability to iommu module? Any suggestions?  
> > 
> > I think it does so by pinning pages.  Is it acceptable that if the
> > vendor driver pins any pages, then from that point forward we consider
> > the IOMMU group dirty page scope to be limited to pinned pages?  There  
> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
> which means ever since a page is pinned, it should be regarded as dirty
> no matter whether it's unpinned later. only after log_sync is called and
> dirty info retrieved, its dirty state should be cleared.

Yes, good point.  We can't just remove a vpfn when a page is unpinned
or else we'd lose information that the page potentially had been
dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
list and both the currently pinned vpfns and the dirty vpfns are walked
on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
The container would need to know that dirty tracking is enabled and
only manage the dirty vpfns list when necessary.  Thanks,

Alex
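
A minimal sketch of the bookkeeping described above; every struct, field
and helper name here is invented for illustration and only mirrors the
idea, it is not the actual type1 code:

/* Illustrative only: keep dirtied-while-pinned information after unpin. */
static void example_unpin_vpfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
{
	if (dma->dirty_tracking)
		list_move(&vpfn->node, &dma->dirty_vpfn_list);
	else
		example_free_vpfn(dma, vpfn);	/* hypothetical helper */
}

static void example_log_sync(struct vfio_dma *dma, unsigned long *bitmap)
{
	struct vfio_pfn *vpfn, *tmp;

	/* Pages currently pinned are always reported dirty... */
	list_for_each_entry(vpfn, &dma->pinned_vpfn_list, node)
		set_bit((vpfn->iova - dma->iova) >> PAGE_SHIFT, bitmap);

	/* ...as are pages pinned at any point since the last sync. */
	list_for_each_entry_safe(vpfn, tmp, &dma->dirty_vpfn_list, node) {
		set_bit((vpfn->iova - dma->iova) >> PAGE_SHIFT, bitmap);
		list_del(&vpfn->node);
		kfree(vpfn);
	}
}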
 
> > are complications around non-singleton IOMMU groups, but I think we're
> > already leaning towards that being a non-worthwhile problem to solve.
> > So if we require that only singleton IOMMU groups can pin pages and we
> > pass the IOMMU group as a parameter to
> > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
> > flag on its local vfio_group struct to indicate dirty page scope is
> > limited to pinned pages.  We might want to keep a flag on the
> > vfio_iommu struct to indicate if all of the vfio_groups for each
> > vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
> > pinned pages as an optimization to avoid walking lists too often.  Then
> > we could test if vfio_iommu.domain_list is not empty and this new flag
> > does not limit the dirty page scope, then everything within each
> > vfio_dma is considered dirty.
> >    
> > > >>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > >>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > >>>> ---
> > > >>>>    include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
> > > >>>>    1 file changed, 23 insertions(+)
> > > >>>>
> > > >>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > >>>> index 35b09427ad9f..6fd3822aa610 100644
> > > >>>> --- a/include/uapi/linux/vfio.h
> > > >>>> +++ b/include/uapi/linux/vfio.h
> > > >>>> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
> > > >>>>    #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
> > > >>>>    #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
> > > >>>>    
> > > >>>> +/**
> > > >>>> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
> > > >>>> + *                                     struct vfio_iommu_type1_dirty_bitmap)
> > > >>>> + *
> > > >>>> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
> > > >>>> + * Get dirty pages bitmap of given IO virtual addresses range using
> > > >>>> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
> > > >>>> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
> > > >>>> + * bitmap and should set size of allocated memory in bitmap_size field.
> > > >>>> + * One bit is used to represent per page consecutively starting from iova
> > > >>>> + * offset. Bit set indicates page at that offset from iova is dirty.
> > > >>>> + */
> > > >>>> +struct vfio_iommu_type1_dirty_bitmap {
> > > >>>> +	__u32        argsz;
> > > >>>> +	__u32        flags;
> > > >>>> +	__u64        iova;                      /* IO virtual address */
> > > >>>> +	__u64        size;                      /* Size of iova range */
> > > >>>> +	__u64        bitmap_size;               /* in bytes */    
> > > >>>
> > > >>> This seems redundant.  We can calculate the size of the bitmap based on
> > > >>> the iova size.
> > > >>>       
> > > >>
> > > >> But in kernel space, we need to validate the size of memory allocated by
> > > >> user instead of assuming user is always correct, right?    
> > > > 
> > > > What does it buy us for the user to tell us the size?  They could be
> > > > wrong, they could be malicious.  The argsz field on the ioctl is mostly
> > > > for the handshake that the user is competent, we should get faults from
> > > > the copy-user operation if it's incorrect.
> > > >    
> > > 
> > > It is to mainly fail safe.
> > >   
> > > >>>> +	void __user *bitmap;                    /* one bit per page */    
> > > >>>
> > > >>> Should we define that as a __u64* to (a) help with the size
> > > >>> calculation, and (b) assure that we can use 8-byte ops on it?
> > > >>>
> > > >>> However, who defines page size?  Is it necessarily the processor page
> > > >>> size?  A physical IOMMU may support page sizes other than the CPU page
> > > >>> size.  It might be more important to indicate the expected page size
> > > >>> than the bitmap size.  Thanks,
> > > >>>       
> > > >>
> > > >> I see in QEMU and in vfio_iommu_type1 module, page sizes considered for
> > > >> mapping are CPU page size, 4K. Do we still need to have such argument?    
> > > > 
> > > > That assumption exists for backwards compatibility prior to supporting
> > > > the iova_pgsizes field in vfio_iommu_type1_info.  AFAIK the current
> > > > interface has no page size assumptions and we should not add any.    
> > > 
> > > So userspace has iova_pgsizes information, which can be input to this 
> > > ioctl. Bitmap should be considering smallest page size. Does that makes 
> > > sense?  
> > 
> > I'm not sure.  I thought I had an argument that the iova_pgsize could
> > indicate support for sizes smaller than the processor page size, which
> > would make the user responsible for using a different base for their
> > page size, but vfio_pgsize_bitmap() already masks out sub-page sizes.
> > Clearly the vendor driver is pinning based on processor sized pages,
> > but that's independent of an IOMMU and not part of a user ABI.
> > 
> > I'm tempted to say your bitmap_size field has a use here, but it seems
> > to fail in validating the user page size at the low extremes.  For
> > example if we have a single page mapping, the user can specify the iova
> > size as 4K (for example), but the minimum bitmap_size they can indicate
> > is 1 byte, would we therefore assume the user's bitmap page size is 512
> > bytes (ie. they provided us with 8 bits to describe a 4K range)?  We'd
> > need to be careful to specify that the minimum iova_pgsize indicated
> > page size is our lower bound as well.  But then what do we do if the
> > user provides us with a smaller buffer than we expect?  For example, a
> > 128MB iova range and only an 8-byte buffer.  Do we go ahead and assume
> > a 2MB page size and fill the bitmap accordingly or do we generate an
> > error?  If the latter, might we support that at some point in time and
> > is it sufficient to let the user perform trial and error to test if that
> > exists?  Thanks,
> > 
> > Alex
> >   
>
Kirti Wankhede Dec. 4, 2019, 6:10 p.m. UTC | #14
On 12/3/2019 11:34 PM, Alex Williamson wrote:
> On Mon, 25 Nov 2019 19:57:39 -0500
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
>> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
>>> On Fri, 15 Nov 2019 00:26:07 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>    
>>>> On 11/14/2019 1:37 AM, Alex Williamson wrote:
>>>>> On Thu, 14 Nov 2019 01:07:21 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>      
>>>>>> On 11/13/2019 4:00 AM, Alex Williamson wrote:
>>>>>>> On Tue, 12 Nov 2019 22:33:37 +0530
>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>         
>>>>>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
>>>>>>>> considered as dirty during migration. IOMMU container maintains a list of
>>>>>>>> all such pinned pages. Added an ioctl defination to get bitmap of such
>>>>>>>
>>>>>>> definition
>>>>>>>         
>>>>>>>> pinned pages for requested IO virtual address range.
>>>>>>>
>>>>>>> Additionally, all mapped pages are considered dirty when physically
>>>>>>> mapped through to an IOMMU, modulo we discussed devices opting in to
>>>>>>> per page pinning to indicate finer granularity with a TBD mechanism to
>>>>>>> figure out if any non-opt-in devices remain.
>>>>>>>         
>>>>>>
>>>>>> You mean, in case of device direct assignment (device pass through)?
>>>>>
>>>>> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
>>>>> pinned and mapped, then the correct dirty page set is all mapped pages.
>>>>> We discussed using the vpfn list as a mechanism for vendor drivers to
>>>>> reduce their migration footprint, but we also discussed that we would
>>>>> need a way to determine that all participants in the container have
>>>>> explicitly pinned their working pages or else we must consider the
>>>>> entire potential working set as dirty.
>>>>>      
>>>>
>>>> How can vendor driver tell this capability to iommu module? Any suggestions?
>>>
>>> I think it does so by pinning pages.  Is it acceptable that if the
>>> vendor driver pins any pages, then from that point forward we consider
>>> the IOMMU group dirty page scope to be limited to pinned pages?  There
>> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
>> which means ever since a page is pinned, it should be regarded as dirty
>> no matter whether it's unpinned later. only after log_sync is called and
>> dirty info retrieved, its dirty state should be cleared.
> 
> Yes, good point.  We can't just remove a vpfn when a page is unpinned
> or else we'd lose information that the page potentially had been
> dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
> list and both the currently pinned vpfns and the dirty vpfns are walked
> on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
> The container would need to know that dirty tracking is enabled and
> only manage the dirty vpfns list when necessary.  Thanks,
> 

If a page is unpinned, then that page is available in the free page pool for
others to use, so how can we say that the unpinned page has valid data?

Suppose driver A unpins a page and driver B of some other device then gets
that page, pins it, uses it, and unpins it; how can we say that the page
still has valid data for driver A?

Can you give one example where unpinned page data is considered reliable
and valid?

Thanks,
Kirti

> Alex
>   
>>> are complications around non-singleton IOMMU groups, but I think we're
>>> already leaning towards that being a non-worthwhile problem to solve.
>>> So if we require that only singleton IOMMU groups can pin pages and we
>>> pass the IOMMU group as a parameter to
>>> vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a
>>> flag on its local vfio_group struct to indicate dirty page scope is
>>> limited to pinned pages.  We might want to keep a flag on the
>>> vfio_iommu struct to indicate if all of the vfio_groups for each
>>> vfio_domain in the vfio_iommu.domain_list dirty page scope limited to
>>> pinned pages as an optimization to avoid walking lists too often.  Then
>>> we could test if vfio_iommu.domain_list is not empty and this new flag
>>> does not limit the dirty page scope, then everything within each
>>> vfio_dma is considered dirty.
>>>     
>>>>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>>>>>> ---
>>>>>>>>     include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++
>>>>>>>>     1 file changed, 23 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>>>>>>> index 35b09427ad9f..6fd3822aa610 100644
>>>>>>>> --- a/include/uapi/linux/vfio.h
>>>>>>>> +++ b/include/uapi/linux/vfio.h
>>>>>>>> @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap {
>>>>>>>>     #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
>>>>>>>>     #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
>>>>>>>>     
>>>>>>>> +/**
>>>>>>>> + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
>>>>>>>> + *                                     struct vfio_iommu_type1_dirty_bitmap)
>>>>>>>> + *
>>>>>>>> + * IOCTL to get dirty pages bitmap for IOMMU container during migration.
>>>>>>>> + * Get dirty pages bitmap of given IO virtual addresses range using
>>>>>>>> + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
>>>>>>>> + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
>>>>>>>> + * bitmap and should set size of allocated memory in bitmap_size field.
>>>>>>>> + * One bit is used to represent per page consecutively starting from iova
>>>>>>>> + * offset. Bit set indicates page at that offset from iova is dirty.
>>>>>>>> + */
>>>>>>>> +struct vfio_iommu_type1_dirty_bitmap {
>>>>>>>> +	__u32        argsz;
>>>>>>>> +	__u32        flags;
>>>>>>>> +	__u64        iova;                      /* IO virtual address */
>>>>>>>> +	__u64        size;                      /* Size of iova range */
>>>>>>>> +	__u64        bitmap_size;               /* in bytes */
>>>>>>>
>>>>>>> This seems redundant.  We can calculate the size of the bitmap based on
>>>>>>> the iova size.
>>>>>>>        
>>>>>>
>>>>>> But in kernel space, we need to validate the size of memory allocated by
>>>>>> user instead of assuming user is always correct, right?
>>>>>
>>>>> What does it buy us for the user to tell us the size?  They could be
>>>>> wrong, they could be malicious.  The argsz field on the ioctl is mostly
>>>>> for the handshake that the user is competent, we should get faults from
>>>>> the copy-user operation if it's incorrect.
>>>>>     
>>>>
>>>> It is to mainly fail safe.
>>>>    
>>>>>>>> +	void __user *bitmap;                    /* one bit per page */
>>>>>>>
>>>>>>> Should we define that as a __u64* to (a) help with the size
>>>>>>> calculation, and (b) assure that we can use 8-byte ops on it?
>>>>>>>
>>>>>>> However, who defines page size?  Is it necessarily the processor page
>>>>>>> size?  A physical IOMMU may support page sizes other than the CPU page
>>>>>>> size.  It might be more important to indicate the expected page size
>>>>>>> than the bitmap size.  Thanks,
>>>>>>>        
>>>>>>
>>>>>> I see in QEMU and in vfio_iommu_type1 module, page sizes considered for
>>>>>> mapping are CPU page size, 4K. Do we still need to have such argument?
>>>>>
>>>>> That assumption exists for backwards compatibility prior to supporting
>>>>> the iova_pgsizes field in vfio_iommu_type1_info.  AFAIK the current
>>>>> interface has no page size assumptions and we should not add any.
>>>>
>>>> So userspace has iova_pgsizes information, which can be input to this
>>>> ioctl. Bitmap should be considering smallest page size. Does that makes
>>>> sense?
>>>
>>> I'm not sure.  I thought I had an argument that the iova_pgsize could
>>> indicate support for sizes smaller than the processor page size, which
>>> would make the user responsible for using a different base for their
>>> page size, but vfio_pgsize_bitmap() already masks out sub-page sizes.
>>> Clearly the vendor driver is pinning based on processor sized pages,
>>> but that's independent of an IOMMU and not part of a user ABI.
>>>
>>> I'm tempted to say your bitmap_size field has a use here, but it seems
>>> to fail in validating the user page size at the low extremes.  For
>>> example if we have a single page mapping, the user can specify the iova
>>> size as 4K (for example), but the minimum bitmap_size they can indicate
>>> is 1 byte, would we therefore assume the user's bitmap page size is 512
>>> bytes (ie. they provided us with 8 bits to describe a 4K range)?  We'd
>>> need to be careful to specify that the minimum iova_pgsize indicated
>>> page size is our lower bound as well.  But then what do we do if the
>>> user provides us with a smaller buffer than we expect?  For example, a
>>> 128MB iova range and only an 8-byte buffer.  Do we go ahead and assume
>>> a 2MB page size and fill the bitmap accordingly or do we generate an
>>> error?  If the latter, might we support that at some point in time and
>>> is it sufficient to let the user perform trial and error to test if that
>>> exists?  Thanks,
>>>
>>> Alex
>>>    
>>
>
Alex Williamson Dec. 4, 2019, 6:34 p.m. UTC | #15
On Wed, 4 Dec 2019 23:40:25 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 12/3/2019 11:34 PM, Alex Williamson wrote:
> > On Mon, 25 Nov 2019 19:57:39 -0500
> > Yan Zhao <yan.y.zhao@intel.com> wrote:
> >   
> >> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:  
> >>> On Fri, 15 Nov 2019 00:26:07 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>      
> >>>> On 11/14/2019 1:37 AM, Alex Williamson wrote:  
> >>>>> On Thu, 14 Nov 2019 01:07:21 +0530
> >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>        
> >>>>>> On 11/13/2019 4:00 AM, Alex Williamson wrote:  
> >>>>>>> On Tue, 12 Nov 2019 22:33:37 +0530
> >>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>           
> >>>>>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> >>>>>>>> considered as dirty during migration. IOMMU container maintains a list of
> >>>>>>>> all such pinned pages. Added an ioctl defination to get bitmap of such  
> >>>>>>>
> >>>>>>> definition
> >>>>>>>           
> >>>>>>>> pinned pages for requested IO virtual address range.  
> >>>>>>>
> >>>>>>> Additionally, all mapped pages are considered dirty when physically
> >>>>>>> mapped through to an IOMMU, modulo we discussed devices opting in to
> >>>>>>> per page pinning to indicate finer granularity with a TBD mechanism to
> >>>>>>> figure out if any non-opt-in devices remain.
> >>>>>>>           
> >>>>>>
> >>>>>> You mean, in case of device direct assignment (device pass through)?  
> >>>>>
> >>>>> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> >>>>> pinned and mapped, then the correct dirty page set is all mapped pages.
> >>>>> We discussed using the vpfn list as a mechanism for vendor drivers to
> >>>>> reduce their migration footprint, but we also discussed that we would
> >>>>> need a way to determine that all participants in the container have
> >>>>> explicitly pinned their working pages or else we must consider the
> >>>>> entire potential working set as dirty.
> >>>>>        
> >>>>
> >>>> How can vendor driver tell this capability to iommu module? Any suggestions?  
> >>>
> >>> I think it does so by pinning pages.  Is it acceptable that if the
> >>> vendor driver pins any pages, then from that point forward we consider
> >>> the IOMMU group dirty page scope to be limited to pinned pages?  There  
> >> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
> >> which means ever since a page is pinned, it should be regarded as dirty
> >> no matter whether it's unpinned later. only after log_sync is called and
> >> dirty info retrieved, its dirty state should be cleared.  
> > 
> > Yes, good point.  We can't just remove a vpfn when a page is unpinned
> > or else we'd lose information that the page potentially had been
> > dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
> > list and both the currently pinned vpfns and the dirty vpfns are walked
> > on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
> > The container would need to know that dirty tracking is enabled and
> > only manage the dirty vpfns list when necessary.  Thanks,
> >   
> 
> If page is unpinned, then that page is available in free page pool for 
> others to use, then how can we say that unpinned page has valid data?
> 
> If suppose, one driver A unpins a page and when driver B of some other 
> device gets that page and he pins it, uses it, and then unpins it, then 
> how can we say that page has valid data for driver A?
> 
> Can you give one example where unpinned page data is considered reliable 
> and valid?

We can only pin pages that the user has already allocated* and mapped
through the vfio DMA API.  The pinning of the page simply locks the
page for the vendor driver to access it and unpinning that page only
indicates that access is complete.  Pages are not freed when a vendor
driver unpins them, they still exist and at this point we're now
assuming the device dirtied the page while it was pinned.  Thanks,

Alex

* An exception here is that the page might be demand allocated and the
  act of pinning the page could actually allocate the backing page for
  the user if they have not faulted the page to trigger that allocation
  previously.  That page remains mapped for the user's virtual address
  space even after the unpinning though.
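
For reference, a sketch of how a vendor driver exercises the pin/unpin API
this discussion revolves around; the helper, its caller context and error
handling are invented, and the exact vfio_pin_pages() prototype should be
checked against the kernel tree in use:

#include <linux/mdev.h>
#include <linux/vfio.h>
#include <linux/iommu.h>

/*
 * Hypothetical vendor-driver helper: pin one guest page, let the device
 * DMA into it, then unpin.  With this series the pinned page is reported
 * dirty via VFIO_IOMMU_GET_DIRTY_BITMAP and, per the discussion above,
 * must stay reported dirty after the unpin until the next sync.
 */
static int example_dma_to_guest_page(struct mdev_device *mdev,
				     unsigned long guest_pfn)
{
	unsigned long host_pfn;
	int ret;

	ret = vfio_pin_pages(mdev_dev(mdev), &guest_pfn, 1,
			     IOMMU_READ | IOMMU_WRITE, &host_pfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... program the device to DMA into host_pfn and wait ... */

	/*
	 * Unpinning only ends the driver's access; the page stays mapped
	 * in the user's address space and may hold data the device wrote
	 * while it was pinned.
	 */
	return vfio_unpin_pages(mdev_dev(mdev), &guest_pfn, 1);
}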
Yan Zhao Dec. 5, 2019, 1:28 a.m. UTC | #16
On Thu, Dec 05, 2019 at 02:34:57AM +0800, Alex Williamson wrote:
> On Wed, 4 Dec 2019 23:40:25 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 12/3/2019 11:34 PM, Alex Williamson wrote:
> > > On Mon, 25 Nov 2019 19:57:39 -0500
> > > Yan Zhao <yan.y.zhao@intel.com> wrote:
> > >   
> > >> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:  
> > >>> On Fri, 15 Nov 2019 00:26:07 +0530
> > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>      
> > >>>> On 11/14/2019 1:37 AM, Alex Williamson wrote:  
> > >>>>> On Thu, 14 Nov 2019 01:07:21 +0530
> > >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>>>        
> > >>>>>> On 11/13/2019 4:00 AM, Alex Williamson wrote:  
> > >>>>>>> On Tue, 12 Nov 2019 22:33:37 +0530
> > >>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>>>>>           
> > >>>>>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> > >>>>>>>> considered as dirty during migration. IOMMU container maintains a list of
> > >>>>>>>> all such pinned pages. Added an ioctl defination to get bitmap of such  
> > >>>>>>>
> > >>>>>>> definition
> > >>>>>>>           
> > >>>>>>>> pinned pages for requested IO virtual address range.  
> > >>>>>>>
> > >>>>>>> Additionally, all mapped pages are considered dirty when physically
> > >>>>>>> mapped through to an IOMMU, modulo we discussed devices opting in to
> > >>>>>>> per page pinning to indicate finer granularity with a TBD mechanism to
> > >>>>>>> figure out if any non-opt-in devices remain.
> > >>>>>>>           
> > >>>>>>
> > >>>>>> You mean, in case of device direct assignment (device pass through)?  
> > >>>>>
> > >>>>> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> > >>>>> pinned and mapped, then the correct dirty page set is all mapped pages.
> > >>>>> We discussed using the vpfn list as a mechanism for vendor drivers to
> > >>>>> reduce their migration footprint, but we also discussed that we would
> > >>>>> need a way to determine that all participants in the container have
> > >>>>> explicitly pinned their working pages or else we must consider the
> > >>>>> entire potential working set as dirty.
> > >>>>>        
> > >>>>
> > >>>> How can vendor driver tell this capability to iommu module? Any suggestions?  
> > >>>
> > >>> I think it does so by pinning pages.  Is it acceptable that if the
> > >>> vendor driver pins any pages, then from that point forward we consider
> > >>> the IOMMU group dirty page scope to be limited to pinned pages?  There  
> > >> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
> > >> which means ever since a page is pinned, it should be regarded as dirty
> > >> no matter whether it's unpinned later. only after log_sync is called and
> > >> dirty info retrieved, its dirty state should be cleared.  
> > > 
> > > Yes, good point.  We can't just remove a vpfn when a page is unpinned
> > > or else we'd lose information that the page potentially had been
> > > dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
> > > list and both the currently pinned vpfns and the dirty vpfns are walked
> > > on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
> > > The container would need to know that dirty tracking is enabled and
> > > only manage the dirty vpfns list when necessary.  Thanks,
> > >   
> > 
> > If page is unpinned, then that page is available in free page pool for 
> > others to use, then how can we say that unpinned page has valid data?
> > 
> > If suppose, one driver A unpins a page and when driver B of some other 
> > device gets that page and he pins it, uses it, and then unpins it, then 
> > how can we say that page has valid data for driver A?
> > 
> > Can you give one example where unpinned page data is considered reliable 
> > and valid?
> 
> We can only pin pages that the user has already allocated* and mapped
> through the vfio DMA API.  The pinning of the page simply locks the
> page for the vendor driver to access it and unpinning that page only
> indicates that access is complete.  Pages are not freed when a vendor
> driver unpins them, they still exist and at this point we're now
> assuming the device dirtied the page while it was pinned.  Thanks,
> 
> Alex
> 
> * An exception here is that the page might be demand allocated and the
>   act of pinning the page could actually allocate the backing page for
>   the user if they have not faulted the page to trigger that allocation
>   previously.  That page remains mapped for the user's virtual address
>   space even after the unpinning though.
>

Yes, I can give an example in GVT.
When a gem_object is allocated in the guest, before submitting it to the
guest vGPU, the gfx cmds in its ring buffer need to be pinned into the GGTT
to get a global graphics address for hardware access. At that time, we
shadow those cmds and pin the pages through vfio_pin_pages(), and submit
the shadow gem_object to the physical hardware.
After the guest driver thinks the submitted gem_object has completed
hardware DMA, it unpins those pinned GGTT graphics memory addresses. Then
in the host, we unpin the shadow pages through vfio_unpin_pages().
But at this point the guest driver is still free to access the gem_object
through vCPUs, and guest user space is probably still mapping an object
into the gem_object in the guest driver.
So missing dirty page tracking for unpinned pages would cause data
inconsistency.

Thanks
Yan
Kirti Wankhede Dec. 5, 2019, 5:42 a.m. UTC | #17
On 12/5/2019 6:58 AM, Yan Zhao wrote:
> On Thu, Dec 05, 2019 at 02:34:57AM +0800, Alex Williamson wrote:
>> On Wed, 4 Dec 2019 23:40:25 +0530
>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>
>>> On 12/3/2019 11:34 PM, Alex Williamson wrote:
>>>> On Mon, 25 Nov 2019 19:57:39 -0500
>>>> Yan Zhao <yan.y.zhao@intel.com> wrote:
>>>>    
>>>>> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
>>>>>> On Fri, 15 Nov 2019 00:26:07 +0530
>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>       
>>>>>>> On 11/14/2019 1:37 AM, Alex Williamson wrote:
>>>>>>>> On Thu, 14 Nov 2019 01:07:21 +0530
>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>>         
>>>>>>>>> On 11/13/2019 4:00 AM, Alex Williamson wrote:
>>>>>>>>>> On Tue, 12 Nov 2019 22:33:37 +0530
>>>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>>>>            
>>>>>>>>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
>>>>>>>>>>> considered as dirty during migration. IOMMU container maintains a list of
>>>>>>>>>>> all such pinned pages. Added an ioctl defination to get bitmap of such
>>>>>>>>>>
>>>>>>>>>> definition
>>>>>>>>>>            
>>>>>>>>>>> pinned pages for requested IO virtual address range.
>>>>>>>>>>
>>>>>>>>>> Additionally, all mapped pages are considered dirty when physically
>>>>>>>>>> mapped through to an IOMMU, modulo we discussed devices opting in to
>>>>>>>>>> per page pinning to indicate finer granularity with a TBD mechanism to
>>>>>>>>>> figure out if any non-opt-in devices remain.
>>>>>>>>>>            
>>>>>>>>>
>>>>>>>>> You mean, in case of device direct assignment (device pass through)?
>>>>>>>>
>>>>>>>> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
>>>>>>>> pinned and mapped, then the correct dirty page set is all mapped pages.
>>>>>>>> We discussed using the vpfn list as a mechanism for vendor drivers to
>>>>>>>> reduce their migration footprint, but we also discussed that we would
>>>>>>>> need a way to determine that all participants in the container have
>>>>>>>> explicitly pinned their working pages or else we must consider the
>>>>>>>> entire potential working set as dirty.
>>>>>>>>         
>>>>>>>
>>>>>>> How can vendor driver tell this capability to iommu module? Any suggestions?
>>>>>>
>>>>>> I think it does so by pinning pages.  Is it acceptable that if the
>>>>>> vendor driver pins any pages, then from that point forward we consider
>>>>>> the IOMMU group dirty page scope to be limited to pinned pages?  There
>>>>> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
>>>>> which means ever since a page is pinned, it should be regarded as dirty
>>>>> no matter whether it's unpinned later. only after log_sync is called and
>>>>> dirty info retrieved, its dirty state should be cleared.
>>>>
>>>> Yes, good point.  We can't just remove a vpfn when a page is unpinned
>>>> or else we'd lose information that the page potentially had been
>>>> dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
>>>> list and both the currently pinned vpfns and the dirty vpfns are walked
>>>> on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
>>>> The container would need to know that dirty tracking is enabled and
>>>> only manage the dirty vpfns list when necessary.  Thanks,
>>>>    
>>>
>>> If page is unpinned, then that page is available in free page pool for
>>> others to use, then how can we say that unpinned page has valid data?
>>>
>>> If suppose, one driver A unpins a page and when driver B of some other
>>> device gets that page and he pins it, uses it, and then unpins it, then
>>> how can we say that page has valid data for driver A?
>>>
>>> Can you give one example where unpinned page data is considered reliable
>>> and valid?
>>
>> We can only pin pages that the user has already allocated* and mapped
>> through the vfio DMA API.  The pinning of the page simply locks the
>> page for the vendor driver to access it and unpinning that page only
>> indicates that access is complete.  Pages are not freed when a vendor
>> driver unpins them, they still exist and at this point we're now
>> assuming the device dirtied the page while it was pinned.  Thanks,
>>
>> Alex
>>
>> * An exception here is that the page might be demand allocated and the
>>    act of pinning the page could actually allocate the backing page for
>>    the user if they have not faulted the page to trigger that allocation
>>    previously.  That page remains mapped for the user's virtual address
>>    space even after the unpinning though.
>>
> 
> Yes, I can give an example in GVT.
> when a gem_object is allocated in guest, before submitting it to guest
> vGPU, gfx cmds in its ring buffer need to be pinned into GGTT to get a
> global graphics address for hardware access. At that time, we shadow
> those cmds and pin pages through vfio pin_pages(), and submit the shadow
> gem_object to physial hardware.
> After guest driver thinks the submitted gem_object has completed hardware
> DMA, it unnpinnd those pinned GGTT graphics memory addresses. Then in
> host, we unpin the shadow pages through vfio unpin_pages.
> But, at this point, guest driver is still free to access the gem_object
> through vCPUs, and guest user space is probably still mapping an object
> into the gem_object in guest driver.
> So, missing the dirty page tracking for unpinned pages would cause
> data inconsitency.
> 

If pages are accessed by the guest through vCPUs, then the RAM module in 
QEMU will take care of tracking those pages as dirty.

Not all unpinned pages are still in use, so tracking every unpinned page 
for the VM's or application's lifetime would also mean tracking lots of 
stale pages, even though they are no longer being used. Reporting that 
many unneeded pages would inflate the migration data and, in turn, the 
migration downtime.

Thanks,
Kirti
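
For reference, the QEMU RAM tracking referred to above is ultimately backed
by KVM's dirty log for vCPU writes. A minimal userspace sketch, assuming
4 KiB pages and a memslot created with KVM_MEM_LOG_DIRTY_PAGES; the helper
name and the bitmap sizing are illustrative, not QEMU's actual code:

/*
 * Illustrative only: CPU-side dirty tracking via KVM's dirty log.
 * This catches vCPU writes; it knows nothing about pages a device
 * dirtied by DMA, which is what the vfio mechanism is meant to cover.
 */
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void *get_cpu_dirty_log(int vm_fd, int slot, size_t slot_bytes)
{
        struct kvm_dirty_log log;
        size_t bitmap_bytes = (slot_bytes / 4096 + 7) / 8; /* assume 4K pages */
        void *bitmap = calloc(1, bitmap_bytes);

        if (!bitmap)
                return NULL;

        memset(&log, 0, sizeof(log));
        log.slot = slot;
        log.dirty_bitmap = bitmap;

        /* One bit per page, set for pages written by vCPUs since last call. */
        if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                free(bitmap);
                return NULL;
        }
        return bitmap;
}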
Yan Zhao Dec. 5, 2019, 5:47 a.m. UTC | #18
On Thu, Dec 05, 2019 at 01:42:23PM +0800, Kirti Wankhede wrote:
> 
> 
> On 12/5/2019 6:58 AM, Yan Zhao wrote:
> > On Thu, Dec 05, 2019 at 02:34:57AM +0800, Alex Williamson wrote:
> >> On Wed, 4 Dec 2019 23:40:25 +0530
> >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>
> >>> On 12/3/2019 11:34 PM, Alex Williamson wrote:
> >>>> On Mon, 25 Nov 2019 19:57:39 -0500
> >>>> Yan Zhao <yan.y.zhao@intel.com> wrote:
> >>>>    
> >>>>> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
> >>>>>> On Fri, 15 Nov 2019 00:26:07 +0530
> >>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>       
> >>>>>>> On 11/14/2019 1:37 AM, Alex Williamson wrote:
> >>>>>>>> On Thu, 14 Nov 2019 01:07:21 +0530
> >>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>>         
> >>>>>>>>> On 11/13/2019 4:00 AM, Alex Williamson wrote:
> >>>>>>>>>> On Tue, 12 Nov 2019 22:33:37 +0530
> >>>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>>>>            
> >>>>>>>>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> >>>>>>>>>>> considered as dirty during migration. IOMMU container maintains a list of
> >>>>>>>>>>> all such pinned pages. Added an ioctl defination to get bitmap of such
> >>>>>>>>>>
> >>>>>>>>>> definition
> >>>>>>>>>>            
> >>>>>>>>>>> pinned pages for requested IO virtual address range.
> >>>>>>>>>>
> >>>>>>>>>> Additionally, all mapped pages are considered dirty when physically
> >>>>>>>>>> mapped through to an IOMMU, modulo we discussed devices opting in to
> >>>>>>>>>> per page pinning to indicate finer granularity with a TBD mechanism to
> >>>>>>>>>> figure out if any non-opt-in devices remain.
> >>>>>>>>>>            
> >>>>>>>>>
> >>>>>>>>> You mean, in case of device direct assignment (device pass through)?
> >>>>>>>>
> >>>>>>>> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> >>>>>>>> pinned and mapped, then the correct dirty page set is all mapped pages.
> >>>>>>>> We discussed using the vpfn list as a mechanism for vendor drivers to
> >>>>>>>> reduce their migration footprint, but we also discussed that we would
> >>>>>>>> need a way to determine that all participants in the container have
> >>>>>>>> explicitly pinned their working pages or else we must consider the
> >>>>>>>> entire potential working set as dirty.
> >>>>>>>>         
> >>>>>>>
> >>>>>>> How can vendor driver tell this capability to iommu module? Any suggestions?
> >>>>>>
> >>>>>> I think it does so by pinning pages.  Is it acceptable that if the
> >>>>>> vendor driver pins any pages, then from that point forward we consider
> >>>>>> the IOMMU group dirty page scope to be limited to pinned pages?  There
> >>>>> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
> >>>>> which means ever since a page is pinned, it should be regarded as dirty
> >>>>> no matter whether it's unpinned later. only after log_sync is called and
> >>>>> dirty info retrieved, its dirty state should be cleared.
> >>>>
> >>>> Yes, good point.  We can't just remove a vpfn when a page is unpinned
> >>>> or else we'd lose information that the page potentially had been
> >>>> dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
> >>>> list and both the currently pinned vpfns and the dirty vpfns are walked
> >>>> on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
> >>>> The container would need to know that dirty tracking is enabled and
> >>>> only manage the dirty vpfns list when necessary.  Thanks,
> >>>>    
> >>>
> >>> If page is unpinned, then that page is available in free page pool for
> >>> others to use, then how can we say that unpinned page has valid data?
> >>>
> >>> If suppose, one driver A unpins a page and when driver B of some other
> >>> device gets that page and he pins it, uses it, and then unpins it, then
> >>> how can we say that page has valid data for driver A?
> >>>
> >>> Can you give one example where unpinned page data is considered reliable
> >>> and valid?
> >>
> >> We can only pin pages that the user has already allocated* and mapped
> >> through the vfio DMA API.  The pinning of the page simply locks the
> >> page for the vendor driver to access it and unpinning that page only
> >> indicates that access is complete.  Pages are not freed when a vendor
> >> driver unpins them, they still exist and at this point we're now
> >> assuming the device dirtied the page while it was pinned.  Thanks,
> >>
> >> Alex
> >>
> >> * An exception here is that the page might be demand allocated and the
> >>    act of pinning the page could actually allocate the backing page for
> >>    the user if they have not faulted the page to trigger that allocation
> >>    previously.  That page remains mapped for the user's virtual address
> >>    space even after the unpinning though.
> >>
> > 
> > Yes, I can give an example in GVT.
> > when a gem_object is allocated in guest, before submitting it to guest
> > vGPU, gfx cmds in its ring buffer need to be pinned into GGTT to get a
> > global graphics address for hardware access. At that time, we shadow
> > those cmds and pin pages through vfio pin_pages(), and submit the shadow
> > gem_object to physial hardware.
> > After guest driver thinks the submitted gem_object has completed hardware
> > DMA, it unnpinnd those pinned GGTT graphics memory addresses. Then in
> > host, we unpin the shadow pages through vfio unpin_pages.
> > But, at this point, guest driver is still free to access the gem_object
> > through vCPUs, and guest user space is probably still mapping an object
> > into the gem_object in guest driver.
> > So, missing the dirty page tracking for unpinned pages would cause
> > data inconsitency.
> > 
> 
> If pages are accessed by guest through vCPUs, then RAM module in QEMU 
> will take care of tracking those pages as dirty.
> 
> All unpinned pages might not be used, so tracking all unpinned pages 
> during VM or application life time would also lead to tracking lots of 
> stale pages, even though they are not being used. Increasing number of 
> not needed pages could also lead to increasing migration data leading 
> increase in migration downtime.
> 
> Thanks,
> Kirti

Those pages are dirtied by the vGPU while they are pinned. They are not
dirtied by vCPUs, so the RAM module in QEMU has no idea about them.
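
A minimal sketch of the flow described in the GVT example, assuming the
vfio_pin_pages()/vfio_unpin_pages() mdev vendor-driver API of this era
(signatures approximated); my_vgpu and the other my_* names are made up
for illustration:

/*
 * Illustrative only: a GVT-like vendor driver pin/unpin flow.
 */
#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/iommu.h>

#define MY_BATCH 64

struct my_vgpu {
        struct mdev_device *mdev;
};

static int my_shadow_and_submit(struct my_vgpu *vgpu,
                                unsigned long gfns[MY_BATCH], int npage)
{
        unsigned long pfns[MY_BATCH];
        int ret;

        /*
         * Pin the guest pages backing the ring buffer so the commands can
         * be shadowed and handed to physical hardware.  From this point
         * the container should regard these pages as potentially dirty.
         */
        ret = vfio_pin_pages(mdev_dev(vgpu->mdev), gfns, npage,
                             IOMMU_READ | IOMMU_WRITE, pfns);
        if (ret != npage)
                return ret < 0 ? ret : -EFAULT;

        /* ... hand pfns to physical hardware here ... */
        return 0;
}

static void my_on_hw_complete(struct my_vgpu *vgpu,
                              unsigned long gfns[MY_BATCH], int npage)
{
        /*
         * Hardware DMA is done, so unpin.  The guest can still touch the
         * object through vCPUs, and the device may have written it while
         * pinned, so the pages must stay in the dirty set until reported.
         */
        vfio_unpin_pages(mdev_dev(vgpu->mdev), gfns, npage);
}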
Alex Williamson Dec. 5, 2019, 5:56 a.m. UTC | #19
On Thu, 5 Dec 2019 11:12:23 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 12/5/2019 6:58 AM, Yan Zhao wrote:
> > On Thu, Dec 05, 2019 at 02:34:57AM +0800, Alex Williamson wrote:  
> >> On Wed, 4 Dec 2019 23:40:25 +0530
> >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>  
> >>> On 12/3/2019 11:34 PM, Alex Williamson wrote:  
> >>>> On Mon, 25 Nov 2019 19:57:39 -0500
> >>>> Yan Zhao <yan.y.zhao@intel.com> wrote:
> >>>>      
> >>>>> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:  
> >>>>>> On Fri, 15 Nov 2019 00:26:07 +0530
> >>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>         
> >>>>>>> On 11/14/2019 1:37 AM, Alex Williamson wrote:  
> >>>>>>>> On Thu, 14 Nov 2019 01:07:21 +0530
> >>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>>           
> >>>>>>>>> On 11/13/2019 4:00 AM, Alex Williamson wrote:  
> >>>>>>>>>> On Tue, 12 Nov 2019 22:33:37 +0530
> >>>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>>>>              
> >>>>>>>>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> >>>>>>>>>>> considered as dirty during migration. IOMMU container maintains a list of
> >>>>>>>>>>> all such pinned pages. Added an ioctl defination to get bitmap of such  
> >>>>>>>>>>
> >>>>>>>>>> definition
> >>>>>>>>>>              
> >>>>>>>>>>> pinned pages for requested IO virtual address range.  
> >>>>>>>>>>
> >>>>>>>>>> Additionally, all mapped pages are considered dirty when physically
> >>>>>>>>>> mapped through to an IOMMU, modulo we discussed devices opting in to
> >>>>>>>>>> per page pinning to indicate finer granularity with a TBD mechanism to
> >>>>>>>>>> figure out if any non-opt-in devices remain.
> >>>>>>>>>>              
> >>>>>>>>>
> >>>>>>>>> You mean, in case of device direct assignment (device pass through)?  
> >>>>>>>>
> >>>>>>>> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> >>>>>>>> pinned and mapped, then the correct dirty page set is all mapped pages.
> >>>>>>>> We discussed using the vpfn list as a mechanism for vendor drivers to
> >>>>>>>> reduce their migration footprint, but we also discussed that we would
> >>>>>>>> need a way to determine that all participants in the container have
> >>>>>>>> explicitly pinned their working pages or else we must consider the
> >>>>>>>> entire potential working set as dirty.
> >>>>>>>>           
> >>>>>>>
> >>>>>>> How can vendor driver tell this capability to iommu module? Any suggestions?  
> >>>>>>
> >>>>>> I think it does so by pinning pages.  Is it acceptable that if the
> >>>>>> vendor driver pins any pages, then from that point forward we consider
> >>>>>> the IOMMU group dirty page scope to be limited to pinned pages?  There  
> >>>>> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
> >>>>> which means ever since a page is pinned, it should be regarded as dirty
> >>>>> no matter whether it's unpinned later. only after log_sync is called and
> >>>>> dirty info retrieved, its dirty state should be cleared.  
> >>>>
> >>>> Yes, good point.  We can't just remove a vpfn when a page is unpinned
> >>>> or else we'd lose information that the page potentially had been
> >>>> dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
> >>>> list and both the currently pinned vpfns and the dirty vpfns are walked
> >>>> on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
> >>>> The container would need to know that dirty tracking is enabled and
> >>>> only manage the dirty vpfns list when necessary.  Thanks,
> >>>>      
> >>>
> >>> If page is unpinned, then that page is available in free page pool for
> >>> others to use, then how can we say that unpinned page has valid data?
> >>>
> >>> If suppose, one driver A unpins a page and when driver B of some other
> >>> device gets that page and he pins it, uses it, and then unpins it, then
> >>> how can we say that page has valid data for driver A?
> >>>
> >>> Can you give one example where unpinned page data is considered reliable
> >>> and valid?  
> >>
> >> We can only pin pages that the user has already allocated* and mapped
> >> through the vfio DMA API.  The pinning of the page simply locks the
> >> page for the vendor driver to access it and unpinning that page only
> >> indicates that access is complete.  Pages are not freed when a vendor
> >> driver unpins them, they still exist and at this point we're now
> >> assuming the device dirtied the page while it was pinned.  Thanks,
> >>
> >> Alex
> >>
> >> * An exception here is that the page might be demand allocated and the
> >>    act of pinning the page could actually allocate the backing page for
> >>    the user if they have not faulted the page to trigger that allocation
> >>    previously.  That page remains mapped for the user's virtual address
> >>    space even after the unpinning though.
> >>  
> > 
> > Yes, I can give an example in GVT.
> > when a gem_object is allocated in guest, before submitting it to guest
> > vGPU, gfx cmds in its ring buffer need to be pinned into GGTT to get a
> > global graphics address for hardware access. At that time, we shadow
> > those cmds and pin pages through vfio pin_pages(), and submit the shadow
> > gem_object to physial hardware.
> > After guest driver thinks the submitted gem_object has completed hardware
> > DMA, it unnpinnd those pinned GGTT graphics memory addresses. Then in
> > host, we unpin the shadow pages through vfio unpin_pages.
> > But, at this point, guest driver is still free to access the gem_object
> > through vCPUs, and guest user space is probably still mapping an object
> > into the gem_object in guest driver.
> > So, missing the dirty page tracking for unpinned pages would cause
> > data inconsitency.
> >   
> 
> If pages are accessed by guest through vCPUs, then RAM module in QEMU 
> will take care of tracking those pages as dirty.
> 
> All unpinned pages might not be used, so tracking all unpinned pages 
> during VM or application life time would also lead to tracking lots of 
> stale pages, even though they are not being used. Increasing number of 
> not needed pages could also lead to increasing migration data leading 
> increase in migration downtime.

We can't rely on the vCPU also dirtying a page; the overhead is
unavoidable.  It doesn't matter if the migration is fast if it's
incorrect.  We only need to track unpinned dirty pages while the
migration is active and the tracking is flushed on each log_sync
callback.  Thanks,

Alex
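
A rough sketch of what that could look like inside the type1 backend; the
my_* structures, the dirty_tracking flag and the dirty list are
hypothetical placeholders, not the actual implementation:

/*
 * Illustrative only: keep dirty information when a page is unpinned
 * during migration, and flush it on each sync.
 */
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/bitops.h>

struct my_vpfn {
        struct list_head node;
        dma_addr_t iova;
};

struct my_dma {
        struct list_head pinned_list;
        struct list_head dirty_list;
};

static void my_unpin_vpfn(struct my_dma *dma, struct my_vpfn *vpfn,
                          bool dirty_tracking)
{
        if (!dirty_tracking) {
                /* No migration in progress: forget the vpfn as before. */
                list_del(&vpfn->node);
                kfree(vpfn);
                return;
        }

        /*
         * Migration active: the device may have written this page while
         * it was pinned, so keep it on a dirty list until the next
         * log_sync instead of discarding the information.
         */
        list_move_tail(&vpfn->node, &dma->dirty_list);
}

static void my_flush_dirty(struct my_dma *dma, unsigned long *bitmap,
                           dma_addr_t base, size_t pgsize)
{
        struct my_vpfn *vpfn, *tmp;

        /* Currently pinned pages are reported as dirty on every sync. */
        list_for_each_entry(vpfn, &dma->pinned_list, node)
                set_bit((vpfn->iova - base) / pgsize, bitmap);

        /* Pages unpinned since the last sync are reported once, then cleared. */
        list_for_each_entry_safe(vpfn, tmp, &dma->dirty_list, node) {
                set_bit((vpfn->iova - base) / pgsize, bitmap);
                list_del(&vpfn->node);
                kfree(vpfn);
        }
}

The key property is that unpinning during an active migration moves the
dirty information rather than discarding it.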
Kirti Wankhede Dec. 5, 2019, 6:19 a.m. UTC | #20
On 12/5/2019 11:26 AM, Alex Williamson wrote:
> On Thu, 5 Dec 2019 11:12:23 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 12/5/2019 6:58 AM, Yan Zhao wrote:
>>> On Thu, Dec 05, 2019 at 02:34:57AM +0800, Alex Williamson wrote:
>>>> On Wed, 4 Dec 2019 23:40:25 +0530
>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>   
>>>>> On 12/3/2019 11:34 PM, Alex Williamson wrote:
>>>>>> On Mon, 25 Nov 2019 19:57:39 -0500
>>>>>> Yan Zhao <yan.y.zhao@intel.com> wrote:
>>>>>>       
>>>>>>> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:
>>>>>>>> On Fri, 15 Nov 2019 00:26:07 +0530
>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>>          
>>>>>>>>> On 11/14/2019 1:37 AM, Alex Williamson wrote:
>>>>>>>>>> On Thu, 14 Nov 2019 01:07:21 +0530
>>>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>>>>            
>>>>>>>>>>> On 11/13/2019 4:00 AM, Alex Williamson wrote:
>>>>>>>>>>>> On Tue, 12 Nov 2019 22:33:37 +0530
>>>>>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>>>>>>               
>>>>>>>>>>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
>>>>>>>>>>>>> considered as dirty during migration. IOMMU container maintains a list of
>>>>>>>>>>>>> all such pinned pages. Added an ioctl defination to get bitmap of such
>>>>>>>>>>>>
>>>>>>>>>>>> definition
>>>>>>>>>>>>               
>>>>>>>>>>>>> pinned pages for requested IO virtual address range.
>>>>>>>>>>>>
>>>>>>>>>>>> Additionally, all mapped pages are considered dirty when physically
>>>>>>>>>>>> mapped through to an IOMMU, modulo we discussed devices opting in to
>>>>>>>>>>>> per page pinning to indicate finer granularity with a TBD mechanism to
>>>>>>>>>>>> figure out if any non-opt-in devices remain.
>>>>>>>>>>>>               
>>>>>>>>>>>
>>>>>>>>>>> You mean, in case of device direct assignment (device pass through)?
>>>>>>>>>>
>>>>>>>>>> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
>>>>>>>>>> pinned and mapped, then the correct dirty page set is all mapped pages.
>>>>>>>>>> We discussed using the vpfn list as a mechanism for vendor drivers to
>>>>>>>>>> reduce their migration footprint, but we also discussed that we would
>>>>>>>>>> need a way to determine that all participants in the container have
>>>>>>>>>> explicitly pinned their working pages or else we must consider the
>>>>>>>>>> entire potential working set as dirty.
>>>>>>>>>>            
>>>>>>>>>
>>>>>>>>> How can vendor driver tell this capability to iommu module? Any suggestions?
>>>>>>>>
>>>>>>>> I think it does so by pinning pages.  Is it acceptable that if the
>>>>>>>> vendor driver pins any pages, then from that point forward we consider
>>>>>>>> the IOMMU group dirty page scope to be limited to pinned pages?  There
>>>>>>> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
>>>>>>> which means ever since a page is pinned, it should be regarded as dirty
>>>>>>> no matter whether it's unpinned later. only after log_sync is called and
>>>>>>> dirty info retrieved, its dirty state should be cleared.
>>>>>>
>>>>>> Yes, good point.  We can't just remove a vpfn when a page is unpinned
>>>>>> or else we'd lose information that the page potentially had been
>>>>>> dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
>>>>>> list and both the currently pinned vpfns and the dirty vpfns are walked
>>>>>> on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
>>>>>> The container would need to know that dirty tracking is enabled and
>>>>>> only manage the dirty vpfns list when necessary.  Thanks,
>>>>>>       
>>>>>
>>>>> If page is unpinned, then that page is available in free page pool for
>>>>> others to use, then how can we say that unpinned page has valid data?
>>>>>
>>>>> If suppose, one driver A unpins a page and when driver B of some other
>>>>> device gets that page and he pins it, uses it, and then unpins it, then
>>>>> how can we say that page has valid data for driver A?
>>>>>
>>>>> Can you give one example where unpinned page data is considered reliable
>>>>> and valid?
>>>>
>>>> We can only pin pages that the user has already allocated* and mapped
>>>> through the vfio DMA API.  The pinning of the page simply locks the
>>>> page for the vendor driver to access it and unpinning that page only
>>>> indicates that access is complete.  Pages are not freed when a vendor
>>>> driver unpins them, they still exist and at this point we're now
>>>> assuming the device dirtied the page while it was pinned.  Thanks,
>>>>
>>>> Alex
>>>>
>>>> * An exception here is that the page might be demand allocated and the
>>>>     act of pinning the page could actually allocate the backing page for
>>>>     the user if they have not faulted the page to trigger that allocation
>>>>     previously.  That page remains mapped for the user's virtual address
>>>>     space even after the unpinning though.
>>>>   
>>>
>>> Yes, I can give an example in GVT.
>>> when a gem_object is allocated in guest, before submitting it to guest
>>> vGPU, gfx cmds in its ring buffer need to be pinned into GGTT to get a
>>> global graphics address for hardware access. At that time, we shadow
>>> those cmds and pin pages through vfio pin_pages(), and submit the shadow
>>> gem_object to physial hardware.
>>> After guest driver thinks the submitted gem_object has completed hardware
>>> DMA, it unnpinnd those pinned GGTT graphics memory addresses. Then in
>>> host, we unpin the shadow pages through vfio unpin_pages.
>>> But, at this point, guest driver is still free to access the gem_object
>>> through vCPUs, and guest user space is probably still mapping an object
>>> into the gem_object in guest driver.
>>> So, missing the dirty page tracking for unpinned pages would cause
>>> data inconsitency.
>>>    
>>
>> If pages are accessed by guest through vCPUs, then RAM module in QEMU
>> will take care of tracking those pages as dirty.
>>
>> All unpinned pages might not be used, so tracking all unpinned pages
>> during VM or application life time would also lead to tracking lots of
>> stale pages, even though they are not being used. Increasing number of
>> not needed pages could also lead to increasing migration data leading
>> increase in migration downtime.
> 
> We can't rely on the vCPU also dirtying a page, the overhead is
> unavoidable.  It doesn't matter if the migration is fast if it's
> incorrect.  We only need to track unpinned dirty pages while the
> migration is active and the tracking is flushed on each log_sync
> callback.  Thanks,
> 

From Yan's comment, pasted below, I thought we needed to track all
unpinned pages during the application's or VM's lifetime.

 > There we should also be aware of that dirty page scope is pinned pages
 > + unpinned pages, which means ever since a page is pinned, it should
 > be regarded as dirty no matter whether it's unpinned later.

But if it's about tracking pages which are unpinned "while the migration 
is active", then that set would be smaller; will do this change.

Thanks,
Kirti
Alex Williamson Dec. 5, 2019, 6:40 a.m. UTC | #21
On Thu, 5 Dec 2019 11:49:00 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 12/5/2019 11:26 AM, Alex Williamson wrote:
> > On Thu, 5 Dec 2019 11:12:23 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 12/5/2019 6:58 AM, Yan Zhao wrote:  
> >>> On Thu, Dec 05, 2019 at 02:34:57AM +0800, Alex Williamson wrote:  
> >>>> On Wed, 4 Dec 2019 23:40:25 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>     
> >>>>> On 12/3/2019 11:34 PM, Alex Williamson wrote:  
> >>>>>> On Mon, 25 Nov 2019 19:57:39 -0500
> >>>>>> Yan Zhao <yan.y.zhao@intel.com> wrote:
> >>>>>>         
> >>>>>>> On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote:  
> >>>>>>>> On Fri, 15 Nov 2019 00:26:07 +0530
> >>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>>            
> >>>>>>>>> On 11/14/2019 1:37 AM, Alex Williamson wrote:  
> >>>>>>>>>> On Thu, 14 Nov 2019 01:07:21 +0530
> >>>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>>>>              
> >>>>>>>>>>> On 11/13/2019 4:00 AM, Alex Williamson wrote:  
> >>>>>>>>>>>> On Tue, 12 Nov 2019 22:33:37 +0530
> >>>>>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>>>>>>                 
> >>>>>>>>>>>>> All pages pinned by vendor driver through vfio_pin_pages API should be
> >>>>>>>>>>>>> considered as dirty during migration. IOMMU container maintains a list of
> >>>>>>>>>>>>> all such pinned pages. Added an ioctl defination to get bitmap of such  
> >>>>>>>>>>>>
> >>>>>>>>>>>> definition
> >>>>>>>>>>>>                 
> >>>>>>>>>>>>> pinned pages for requested IO virtual address range.  
> >>>>>>>>>>>>
> >>>>>>>>>>>> Additionally, all mapped pages are considered dirty when physically
> >>>>>>>>>>>> mapped through to an IOMMU, modulo we discussed devices opting in to
> >>>>>>>>>>>> per page pinning to indicate finer granularity with a TBD mechanism to
> >>>>>>>>>>>> figure out if any non-opt-in devices remain.
> >>>>>>>>>>>>                 
> >>>>>>>>>>>
> >>>>>>>>>>> You mean, in case of device direct assignment (device pass through)?  
> >>>>>>>>>>
> >>>>>>>>>> Yes, or IOMMU backed mdevs.  If vfio_dmas in the container are fully
> >>>>>>>>>> pinned and mapped, then the correct dirty page set is all mapped pages.
> >>>>>>>>>> We discussed using the vpfn list as a mechanism for vendor drivers to
> >>>>>>>>>> reduce their migration footprint, but we also discussed that we would
> >>>>>>>>>> need a way to determine that all participants in the container have
> >>>>>>>>>> explicitly pinned their working pages or else we must consider the
> >>>>>>>>>> entire potential working set as dirty.
> >>>>>>>>>>              
> >>>>>>>>>
> >>>>>>>>> How can vendor driver tell this capability to iommu module? Any suggestions?  
> >>>>>>>>
> >>>>>>>> I think it does so by pinning pages.  Is it acceptable that if the
> >>>>>>>> vendor driver pins any pages, then from that point forward we consider
> >>>>>>>> the IOMMU group dirty page scope to be limited to pinned pages?  There  
> >>>>>>> we should also be aware of that dirty page scope is pinned pages + unpinned pages,
> >>>>>>> which means ever since a page is pinned, it should be regarded as dirty
> >>>>>>> no matter whether it's unpinned later. only after log_sync is called and
> >>>>>>> dirty info retrieved, its dirty state should be cleared.  
> >>>>>>
> >>>>>> Yes, good point.  We can't just remove a vpfn when a page is unpinned
> >>>>>> or else we'd lose information that the page potentially had been
> >>>>>> dirtied while it was pinned.  Maybe that vpfn needs to move to a dirty
> >>>>>> list and both the currently pinned vpfns and the dirty vpfns are walked
> >>>>>> on a log_sync.  The dirty vpfns list would be cleared after a log_sync.
> >>>>>> The container would need to know that dirty tracking is enabled and
> >>>>>> only manage the dirty vpfns list when necessary.  Thanks,
> >>>>>>         
> >>>>>
> >>>>> If page is unpinned, then that page is available in free page pool for
> >>>>> others to use, then how can we say that unpinned page has valid data?
> >>>>>
> >>>>> If suppose, one driver A unpins a page and when driver B of some other
> >>>>> device gets that page and he pins it, uses it, and then unpins it, then
> >>>>> how can we say that page has valid data for driver A?
> >>>>>
> >>>>> Can you give one example where unpinned page data is considered reliable
> >>>>> and valid?  
> >>>>
> >>>> We can only pin pages that the user has already allocated* and mapped
> >>>> through the vfio DMA API.  The pinning of the page simply locks the
> >>>> page for the vendor driver to access it and unpinning that page only
> >>>> indicates that access is complete.  Pages are not freed when a vendor
> >>>> driver unpins them, they still exist and at this point we're now
> >>>> assuming the device dirtied the page while it was pinned.  Thanks,
> >>>>
> >>>> Alex
> >>>>
> >>>> * An exception here is that the page might be demand allocated and the
> >>>>     act of pinning the page could actually allocate the backing page for
> >>>>     the user if they have not faulted the page to trigger that allocation
> >>>>     previously.  That page remains mapped for the user's virtual address
> >>>>     space even after the unpinning though.
> >>>>     
> >>>
> >>> Yes, I can give an example in GVT.
> >>> when a gem_object is allocated in guest, before submitting it to guest
> >>> vGPU, gfx cmds in its ring buffer need to be pinned into GGTT to get a
> >>> global graphics address for hardware access. At that time, we shadow
> >>> those cmds and pin pages through vfio pin_pages(), and submit the shadow
> >>> gem_object to physial hardware.
> >>> After guest driver thinks the submitted gem_object has completed hardware
> >>> DMA, it unnpinnd those pinned GGTT graphics memory addresses. Then in
> >>> host, we unpin the shadow pages through vfio unpin_pages.
> >>> But, at this point, guest driver is still free to access the gem_object
> >>> through vCPUs, and guest user space is probably still mapping an object
> >>> into the gem_object in guest driver.
> >>> So, missing the dirty page tracking for unpinned pages would cause
> >>> data inconsitency.
> >>>      
> >>
> >> If pages are accessed by guest through vCPUs, then RAM module in QEMU
> >> will take care of tracking those pages as dirty.
> >>
> >> All unpinned pages might not be used, so tracking all unpinned pages
> >> during VM or application life time would also lead to tracking lots of
> >> stale pages, even though they are not being used. Increasing number of
> >> not needed pages could also lead to increasing migration data leading
> >> increase in migration downtime.  
> > 
> > We can't rely on the vCPU also dirtying a page, the overhead is
> > unavoidable.  It doesn't matter if the migration is fast if it's
> > incorrect.  We only need to track unpinned dirty pages while the
> > migration is active and the tracking is flushed on each log_sync
> > callback.  Thanks,
> >   
> 
>  From Yan's comment, pasted below, I thought, need to track all unpinned 
> pages during application or VM's lifetime.
> 
>  > There we should also be aware of that dirty page scope is pinned   
> pages  > + unpinned pages, which means ever since a page is pinned, it 
> should
>  > be regarded as dirty no matter whether it's unpinned later.  
> 
> But if its about tracking pages which are unpinned "while the migration 
> is active", then that set would be less, will do this change.

When we first start a pre-copy, all RAM (including anything previously
pinned) is dirty anyway.  I believe the log_sync callback is only
intended to report pages dirtied since the migration was started, or
since the last log_sync callback.  We then assume that currently pinned
pages are constantly dirty and previously pinned pages are dirty until
we've reported them through log_sync.  Thanks,

Alex
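
Seen from userspace, a hypothetical log_sync path built on the ioctl
proposed in this patch could then look like the sketch below; the 4 KiB
granularity, the in-word bit ordering and mark_dirty() are assumptions for
illustration, and the uapi header from this series is assumed to be
available:

/*
 * Illustrative only: query the proposed ioctl for one mapped IOVA range
 * on each log_sync and merge the result into the migration bitmap.
 */
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static void mark_dirty(uint64_t iova) { /* merge into migration bitmap */ }

static int sync_dirty_range(int container_fd, uint64_t iova, uint64_t size)
{
        const uint64_t pgsize = 4096;
        uint64_t npages = size / pgsize;
        uint64_t bitmap_bytes = ((npages + 63) / 64) * 8;  /* 8-byte aligned */
        uint64_t *bitmap = calloc(1, bitmap_bytes);
        struct vfio_iommu_type1_dirty_bitmap db = {
                .argsz = sizeof(db),
                .iova = iova,
                .size = size,
                .bitmap_size = bitmap_bytes,
        };
        uint64_t i;
        int ret;

        if (!bitmap)
                return -1;
        db.bitmap = bitmap;

        /* Pages pinned now, or unpinned since the last sync, come back set. */
        ret = ioctl(container_fd, VFIO_IOMMU_GET_DIRTY_BITMAP, &db);
        if (!ret) {
                for (i = 0; i < npages; i++)
                        if (bitmap[i / 64] & (1ULL << (i % 64)))
                                mark_dirty(iova + i * pgsize);
        }

        free(bitmap);
        return ret;
}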
diff mbox series

Patch

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 35b09427ad9f..6fd3822aa610 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -902,6 +902,29 @@  struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ *                                     struct vfio_iommu_type1_dirty_bitmap)
+ *
+ * IOCTL to get dirty pages bitmap for IOMMU container during migration.
+ * Get dirty pages bitmap of given IO virtual addresses range using
+ * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of
+ * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get
+ * bitmap and should set size of allocated memory in bitmap_size field.
+ * One bit is used to represent per page consecutively starting from iova
+ * offset. Bit set indicates page at that offset from iova is dirty.
+ */
+struct vfio_iommu_type1_dirty_bitmap {
+	__u32        argsz;
+	__u32        flags;
+	__u64        iova;                      /* IO virtual address */
+	__u64        size;                      /* Size of iova range */
+	__u64        bitmap_size;               /* in bytes */
+	void __user *bitmap;                    /* one bit per page */
+};
+
+#define VFIO_IOMMU_GET_DIRTY_BITMAP             _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*