diff mbox series

[3/4] mm: simplify device private page handling in hmm_range_fault

Message ID 20200316193216.920734-4-hch@lst.de
State Not Applicable
Headers show
Series [1/4] memremap: add an owner field to struct dev_pagemap | expand

Commit Message

Christoph Hellwig March 16, 2020, 7:32 p.m. UTC
Remove the code to fault device private pages back into system memory
that has never been used by any driver.  Also replace the usage of the
HMM_PFN_DEVICE_PRIVATE flag in the pfns array with a simple
is_device_private_page check in nouveau.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  1 -
 drivers/gpu/drm/nouveau/nouveau_dmem.c  |  5 +++--
 drivers/gpu/drm/nouveau/nouveau_svm.c   |  1 -
 include/linux/hmm.h                     |  2 --
 mm/hmm.c                                | 25 +++++--------------------
 5 files changed, 8 insertions(+), 26 deletions(-)

Comments

Jason Gunthorpe March 16, 2020, 7:59 p.m. UTC | #1
On Mon, Mar 16, 2020 at 08:32:15PM +0100, Christoph Hellwig wrote:
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 180e398170b0..cfad65f6a67b 100644
> +++ b/mm/hmm.c
> @@ -118,15 +118,6 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
>  	/* We aren't ask to do anything ... */
>  	if (!(pfns & range->flags[HMM_PFN_VALID]))
>  		return;
> -	/* If this is device memory then only fault if explicitly requested */
> -	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
> -		/* Do we fault on device memory ? */
> -		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
> -			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
> -			*fault = true;
> -		}
> -		return;
> -	}

Yes, this is an elegant solution to the input flags.

However, between patch 3 and 4 doesn't this break amd gpu as it will
return device_private pages now if not requested? Squash the two?

Jason
Christoph Hellwig March 16, 2020, 9:33 p.m. UTC | #2
On Mon, Mar 16, 2020 at 04:59:23PM -0300, Jason Gunthorpe wrote:
> However, between patch 3 and 4 doesn't this break amd gpu as it will
> return device_private pages now if not requested? Squash the two?

No change in behavior in this patch as long as HMM_PFN_DEVICE_PRIVATE
isn't set in ->pfns or ->default_flags, which is the case for both
nouveau and amdgpu.  The existing behavior is broken for private
pages not known to the driver, but that is fixed in the next patch.
Ralph Campbell March 16, 2020, 10:49 p.m. UTC | #3
On 3/16/20 12:32 PM, Christoph Hellwig wrote:
> Remove the code to fault device private pages back into system memory
> that has never been used by any driver.  Also replace the usage of the
> HMM_PFN_DEVICE_PRIVATE flag in the pfns array with a simple
> is_device_private_page check in nouveau.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Getting rid of HMM_PFN_DEVICE_PRIVATE seems reasonable to me since a driver can
look at the struct page but what if a driver needs to fault in a page from
another device's private memory? Should it call handle_mm_fault()?


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  1 -
>   drivers/gpu/drm/nouveau/nouveau_dmem.c  |  5 +++--
>   drivers/gpu/drm/nouveau/nouveau_svm.c   |  1 -
>   include/linux/hmm.h                     |  2 --
>   mm/hmm.c                                | 25 +++++--------------------
>   5 files changed, 8 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index dee446278417..90821ce5e6ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -776,7 +776,6 @@ struct amdgpu_ttm_tt {
>   static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
>   	(1 << 0), /* HMM_PFN_VALID */
>   	(1 << 1), /* HMM_PFN_WRITE */
> -	0 /* HMM_PFN_DEVICE_PRIVATE */
>   };
>   
>   static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> index 0e36345d395c..edfd0805fba4 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> @@ -28,6 +28,7 @@
>   
>   #include <nvif/class.h>
>   #include <nvif/object.h>
> +#include <nvif/if000c.h>
>   #include <nvif/if500b.h>
>   #include <nvif/if900b.h>
>   
> @@ -692,9 +693,8 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
>   		if (page == NULL)
>   			continue;
>   
> -		if (!(range->pfns[i] & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
> +		if (!is_device_private_page(page))
>   			continue;
> -		}
>   
>   		if (!nouveau_dmem_page(drm, page)) {
>   			WARN(1, "Some unknown device memory !\n");
> @@ -705,5 +705,6 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
>   		addr = nouveau_dmem_page_addr(page);
>   		range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
>   		range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
> +		range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM;
>   	}
>   }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
> index df9bf1fd1bc0..39c731a99937 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_svm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
> @@ -367,7 +367,6 @@ static const u64
>   nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = {
>   	[HMM_PFN_VALID         ] = NVIF_VMM_PFNMAP_V0_V,
>   	[HMM_PFN_WRITE         ] = NVIF_VMM_PFNMAP_V0_W,
> -	[HMM_PFN_DEVICE_PRIVATE] = NVIF_VMM_PFNMAP_V0_VRAM,
>   };
>   
>   static const u64
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 4bf8d6997b12..5e6034f105c3 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -74,7 +74,6 @@
>    * Flags:
>    * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
>    * HMM_PFN_WRITE: CPU page table has write permission set
> - * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
>    *
>    * The driver provides a flags array for mapping page protections to device
>    * PTE bits. If the driver valid bit for an entry is bit 3,
> @@ -86,7 +85,6 @@
>   enum hmm_pfn_flag_e {
>   	HMM_PFN_VALID = 0,
>   	HMM_PFN_WRITE,
> -	HMM_PFN_DEVICE_PRIVATE,
>   	HMM_PFN_FLAG_MAX
>   };
>   
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 180e398170b0..cfad65f6a67b 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -118,15 +118,6 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
>   	/* We aren't ask to do anything ... */
>   	if (!(pfns & range->flags[HMM_PFN_VALID]))
>   		return;
> -	/* If this is device memory then only fault if explicitly requested */
> -	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
> -		/* Do we fault on device memory ? */
> -		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
> -			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
> -			*fault = true;
> -		}
> -		return;
> -	}
>   
>   	/* If CPU page table is not valid then we need to fault */
>   	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
> @@ -260,21 +251,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>   		swp_entry_t entry = pte_to_swp_entry(pte);
>   
>   		/*
> -		 * This is a special swap entry, ignore migration, use
> -		 * device and report anything else as error.
> +		 * Never fault in device private pages, but just report
> +		 * the PFN even if not present.
>   		 */
>   		if (is_device_private_entry(entry)) {
> -			cpu_flags = range->flags[HMM_PFN_VALID] |
> -				range->flags[HMM_PFN_DEVICE_PRIVATE];
> -			cpu_flags |= is_write_device_private_entry(entry) ?
> -				range->flags[HMM_PFN_WRITE] : 0;
> -			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
> -					   &fault, &write_fault);
> -			if (fault || write_fault)
> -				goto fault;
>   			*pfn = hmm_device_entry_from_pfn(range,
>   					    swp_offset(entry));
> -			*pfn |= cpu_flags;
> +			*pfn |= range->flags[HMM_PFN_VALID];
> +			if (is_write_device_private_entry(entry))
> +				*pfn |= range->flags[HMM_PFN_WRITE];
>   			return 0;
>   		}
>   
>
Christoph Hellwig March 17, 2020, 7:34 a.m. UTC | #4
On Mon, Mar 16, 2020 at 03:49:51PM -0700, Ralph Campbell wrote:
> On 3/16/20 12:32 PM, Christoph Hellwig wrote:
>> Remove the code to fault device private pages back into system memory
>> that has never been used by any driver.  Also replace the usage of the
>> HMM_PFN_DEVICE_PRIVATE flag in the pfns array with a simple
>> is_device_private_page check in nouveau.
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Getting rid of HMM_PFN_DEVICE_PRIVATE seems reasonable to me since a driver can
> look at the struct page but what if a driver needs to fault in a page from
> another device's private memory? Should it call handle_mm_fault()?

Obviously no driver cared for that so far.  Once we have test cases
for that and thus testable code we can add code to fault it in from
hmm_vma_handle_pte.
Jason Gunthorpe March 17, 2020, 12:15 p.m. UTC | #5
On Mon, Mar 16, 2020 at 03:49:51PM -0700, Ralph Campbell wrote:
> 
> On 3/16/20 12:32 PM, Christoph Hellwig wrote:
> > Remove the code to fault device private pages back into system memory
> > that has never been used by any driver.  Also replace the usage of the
> > HMM_PFN_DEVICE_PRIVATE flag in the pfns array with a simple
> > is_device_private_page check in nouveau.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Getting rid of HMM_PFN_DEVICE_PRIVATE seems reasonable to me since a driver can
> look at the struct page but what if a driver needs to fault in a page from
> another device's private memory? Should it call handle_mm_fault()?

Isn't that what this series basically does?

The dev_private_owner is set to the type of pgmap the device knows how
to handle, and everything else is automatically faulted for the
device.

If the device does not know how to handle device_private then it sets
dev_private_owner to NULL and it never gets device_private pfns.

Since the device_private pfn cannot be dma mapped, drivers must have
explicit support for them.

Jason
Christoph Hellwig March 17, 2020, 12:24 p.m. UTC | #6
On Tue, Mar 17, 2020 at 09:15:36AM -0300, Jason Gunthorpe wrote:
> > Getting rid of HMM_PFN_DEVICE_PRIVATE seems reasonable to me since a driver can
> > look at the struct page but what if a driver needs to fault in a page from
> > another device's private memory? Should it call handle_mm_fault()?
> 
> Isn't that what this series basically does?
>
> The dev_private_owner is set to the type of pgmap the device knows how
> to handle, and everything else is automatically faulted for the
> device.
> 
> If the device does not know how to handle device_private then it sets
> dev_private_owner to NULL and it never gets device_private pfns.
> 
> Since the device_private pfn cannot be dma mapped, drivers must have
> explicit support for them.

No, with this series (and all actual callers before this series)
we never fault in device private pages.
Christoph Hellwig March 17, 2020, 12:28 p.m. UTC | #7
On Tue, Mar 17, 2020 at 01:24:45PM +0100, Christoph Hellwig wrote:
> On Tue, Mar 17, 2020 at 09:15:36AM -0300, Jason Gunthorpe wrote:
> > > Getting rid of HMM_PFN_DEVICE_PRIVATE seems reasonable to me since a driver can
> > > look at the struct page but what if a driver needs to fault in a page from
> > > another device's private memory? Should it call handle_mm_fault()?
> > 
> > Isn't that what this series basically does?
> >
> > The dev_private_owner is set to the type of pgmap the device knows how
> > to handle, and everything else is automatically faulted for the
> > device.
> > 
> > If the device does not know how to handle device_private then it sets
> > dev_private_owner to NULL and it never gets device_private pfns.
> > 
> > Since the device_private pfn cannot be dma mapped, drivers must have
> > explicit support for them.
> 
> No, with this series (and all actual callers before this series)
> we never fault in device private pages.

IFF we want to fault it in we'd need something like this.  But I'd
really prefer to see test cases for that first.

diff --git a/mm/hmm.c b/mm/hmm.c
index b75b3750e03d..2884a3d11a1f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -276,7 +276,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		if (!fault && !write_fault)
 			return 0;
 
-		if (!non_swap_entry(entry))
+		if (!non_swap_entry(entry) || is_device_private_entry(entry))
 			goto fault;
 
 		if (is_migration_entry(entry)) {
Jason Gunthorpe March 17, 2020, 12:47 p.m. UTC | #8
On Tue, Mar 17, 2020 at 01:28:13PM +0100, Christoph Hellwig wrote:
> On Tue, Mar 17, 2020 at 01:24:45PM +0100, Christoph Hellwig wrote:
> > On Tue, Mar 17, 2020 at 09:15:36AM -0300, Jason Gunthorpe wrote:
> > > > Getting rid of HMM_PFN_DEVICE_PRIVATE seems reasonable to me since a driver can
> > > > look at the struct page but what if a driver needs to fault in a page from
> > > > another device's private memory? Should it call handle_mm_fault()?
> > > 
> > > Isn't that what this series basically does?
> > >
> > > The dev_private_owner is set to the type of pgmap the device knows how
> > > to handle, and everything else is automatically faulted for the
> > > device.
> > > 
> > > If the device does not know how to handle device_private then it sets
> > > dev_private_owner to NULL and it never gets device_private pfns.
> > > 
> > > Since the device_private pfn cannot be dma mapped, drivers must have
> > > explicit support for them.
> > 
> > No, with this series (and all actual callers before this series)
> > we never fault in device private pages.
> 
> IFF we want to fault it in we'd need something like this.  But I'd
> really prefer to see test cases for that first.

In general I think hmm_range_fault should have a mode that is the same
as get_user_pages in terms of when it returns a hard failure, and
generates faults. AFAIK, GUP will fault in this case?

I need this for making ODP use this API. ODP is the one that is highly
likely to see other driver's device_private pages and must have them
always fault to CPU.

> diff --git a/mm/hmm.c b/mm/hmm.c
> index b75b3750e03d..2884a3d11a1f 100644
> +++ b/mm/hmm.c
> @@ -276,7 +276,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>  		if (!fault && !write_fault)
>  			return 0;
>  
> -		if (!non_swap_entry(entry))
> +		if (!non_swap_entry(entry) || is_device_private_entry(entry))
>  			goto fault;

Yes, OK,  makes sense.

I've been using v7 of Ralph's tester and it is working well - it has
DEVICE_PRIVATE support so I think it can test this flow too. Ralph are
you able?

This hunk seems trivial enough to me, can we include it now?

Thanks,
Jason
Christoph Hellwig March 17, 2020, 12:59 p.m. UTC | #9
On Tue, Mar 17, 2020 at 09:47:55AM -0300, Jason Gunthorpe wrote:
> I've been using v7 of Ralph's tester and it is working well - it has
> DEVICE_PRIVATE support so I think it can test this flow too. Ralph are
> you able?
> 
> This hunk seems trivial enough to me, can we include it now?

I can send a separate patch for it once the tester covers it.  I don't
want to add it to the original patch as it is a significant behavior
change compared to the existing code.
Jason Gunthorpe March 17, 2020, 5:32 p.m. UTC | #10
On Tue, Mar 17, 2020 at 01:59:55PM +0100, Christoph Hellwig wrote:
> On Tue, Mar 17, 2020 at 09:47:55AM -0300, Jason Gunthorpe wrote:
> > I've been using v7 of Ralph's tester and it is working well - it has
> > DEVICE_PRIVATE support so I think it can test this flow too. Ralph are
> > you able?
> > 
> > This hunk seems trivial enough to me, can we include it now?
> 
> I can send a separate patch for it once the tester covers it.  I don't
> want to add it to the original patch as it is a significant behavior
> change compared to the existing code.

Okay. I'm happy enough for now that amdgpu will get ERROR on
device_private pages. That is a bug fix in and of itself.

Jason
Ralph Campbell March 17, 2020, 10:43 p.m. UTC | #11
On 3/17/20 12:34 AM, Christoph Hellwig wrote:
> On Mon, Mar 16, 2020 at 03:49:51PM -0700, Ralph Campbell wrote:
>> On 3/16/20 12:32 PM, Christoph Hellwig wrote:
>>> Remove the code to fault device private pages back into system memory
>>> that has never been used by any driver.  Also replace the usage of the
>>> HMM_PFN_DEVICE_PRIVATE flag in the pfns array with a simple
>>> is_device_private_page check in nouveau.
>>>
>>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>>
>> Getting rid of HMM_PFN_DEVICE_PRIVATE seems reasonable to me since a driver can
>> look at the struct page but what if a driver needs to fault in a page from
>> another device's private memory? Should it call handle_mm_fault()?
> 
> Obviously no driver cared for that so far.  Once we have test cases
> for that and thus testable code we can add code to fault it in from
> hmm_vma_handle_pte.
> 

I'm OK with the series. I think I would have been less confused if I looked at
patch 4 then 3.
Ralph Campbell March 17, 2020, 11:14 p.m. UTC | #12
On 3/17/20 5:59 AM, Christoph Hellwig wrote:
> On Tue, Mar 17, 2020 at 09:47:55AM -0300, Jason Gunthorpe wrote:
>> I've been using v7 of Ralph's tester and it is working well - it has
>> DEVICE_PRIVATE support so I think it can test this flow too. Ralph are
>> you able?
>>
>> This hunk seems trivial enough to me, can we include it now?
> 
> I can send a separate patch for it once the tester covers it.  I don't
> want to add it to the original patch as it is a significant behavior
> change compared to the existing code.
> 

Attached is an updated version of my HMM tests based on linux-5.6.0-rc6.
I ran this OK with Jason's 8+1 HMM patches, Christoph's 1-5 misc HMM clean ups,
and Christoph's 1-4 device private page changes applied.

I'm working on getting my nouveau tests running again on a different test
machine and will report on that when ready.
Christoph Hellwig March 18, 2020, 9:34 a.m. UTC | #13
On Tue, Mar 17, 2020 at 03:43:47PM -0700, Ralph Campbell wrote:
>> Obviously no driver cared for that so far.  Once we have test cases
>> for that and thus testable code we can add code to fault it in from
>> hmm_vma_handle_pte.
>>
>
> I'm OK with the series. I think I would have been less confused if I looked at
> patch 4 then 3.

I guess I could just merge 3 and 4 if it is too confusing otherwise.
Jason Gunthorpe March 19, 2020, 6:17 p.m. UTC | #14
On Tue, Mar 17, 2020 at 04:14:31PM -0700, Ralph Campbell wrote:
> 
> On 3/17/20 5:59 AM, Christoph Hellwig wrote:
> > On Tue, Mar 17, 2020 at 09:47:55AM -0300, Jason Gunthorpe wrote:
> > > I've been using v7 of Ralph's tester and it is working well - it has
> > > DEVICE_PRIVATE support so I think it can test this flow too. Ralph are
> > > you able?
> > > 
> > > This hunk seems trivial enough to me, can we include it now?
> > 
> > I can send a separate patch for it once the tester covers it.  I don't
> > want to add it to the original patch as it is a significant behavior
> > change compared to the existing code.
> > 
> 
> Attached is an updated version of my HMM tests based on linux-5.6.0-rc6.
> I ran this OK with Jason's 8+1 HMM patches, Christoph's 1-5 misc HMM clean ups,
> and Christoph's 1-4 device private page changes applied.

I'd like to get this to mergable, it looks pretty good now, but I have
no idea about selftests - and I'm struggling to even compile the tools
dir

> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 69def4a9df00..4d22ce7879a7 100644
> +++ b/lib/Kconfig.debug
> @@ -2162,6 +2162,18 @@ config TEST_MEMINIT
>  
>  	  If unsure, say N.
>  
> +config TEST_HMM
> +	tristate "Test HMM (Heterogeneous Memory Management)"
> +	depends on DEVICE_PRIVATE
> +	select HMM_MIRROR
> +        select MMU_NOTIFIER

extra spaces

In general I wonder if it even makes sense that DEVICE_PRIVATE is user
selectable?

> +static int dmirror_fops_open(struct inode *inode, struct file *filp)
> +{
> +	struct cdev *cdev = inode->i_cdev;
> +	struct dmirror *dmirror;
> +	int ret;
> +
> +	/* Mirror this process address space */
> +	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
> +	if (dmirror == NULL)
> +		return -ENOMEM;
> +
> +	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
> +	mutex_init(&dmirror->mutex);
> +	xa_init(&dmirror->pt);
> +
> +	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
> +				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
> +	if (ret) {
> +		kfree(dmirror);
> +		return ret;
> +	}
> +
> +	/* Pairs with the mmdrop() in dmirror_fops_release(). */
> +	mmgrab(current->mm);
> +	dmirror->mm = current->mm;

The notifier holds a mmgrab, no need for another one

> +	/* Only the first open registers the address space. */
> +	filp->private_data = dmirror;

Not sure what this comment means

> +static inline struct dmirror_device *dmirror_page_to_device(struct page *page)
> +
> +{
> +	struct dmirror_chunk *devmem;
> +
> +	devmem = container_of(page->pgmap, struct dmirror_chunk, pagemap);
> +	return devmem->mdevice;
> +}

extra devmem var is not really needed

> +
> +static bool dmirror_device_is_mine(struct dmirror_device *mdevice,
> +				   struct page *page)
> +{
> +	if (!is_zone_device_page(page))
> +		return false;
> +	return page->pgmap->ops == &dmirror_devmem_ops &&
> +		dmirror_page_to_device(page) == mdevice;
> +}

Use new owner stuff, right? Actually this is redundant now, the check
should be just WARN_ON pageowner != self owner

> +static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
> +{
> +	uint64_t *pfns = range->pfns;
> +	unsigned long pfn;
> +
> +	for (pfn = (range->start >> PAGE_SHIFT);
> +	     pfn < (range->end >> PAGE_SHIFT);
> +	     pfn++, pfns++) {
> +		struct page *page;
> +		void *entry;
> +
> +		/*
> +		 * HMM_PFN_ERROR is returned if it is accessing invalid memory
> +		 * either because of memory error (hardware detected memory
> +		 * corruption) or more likely because of truncate on mmap
> +		 * file.
> +		 */
> +		if (*pfns == range->values[HMM_PFN_ERROR])
> +			return -EFAULT;

Unless snapshot mode is in use, hmm_range_fault() never returns success
with PFN_ERROR set, so this should be a WARN_ON

> +		if (!(*pfns & range->flags[HMM_PFN_VALID]))
> +			return -EFAULT;

Same with valid.

> +		page = hmm_device_entry_to_page(range, *pfns);
> +		/* We asked for pages to be populated but check anyway. */
> +		if (!page)
> +			return -EFAULT;

WARN_ON

> +		if (is_zone_device_page(page)) {
> +			/*
> +			 * TODO: need a way to ask HMM to fault foreign zone
> +			 * device private pages.
> +			 */
> +			if (!dmirror_device_is_mine(dmirror->mdevice, page))
> +				continue;

Actually re

> +static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
> +				const struct mmu_notifier_range *range,
> +				unsigned long cur_seq)
> +{
> +	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);
> +	struct mm_struct *mm = dmirror->mm;
> +
> +	/*
> +	 * If the process doesn't exist, we don't need to invalidate the
> +	 * device page table since the address space will be torn down.
> +	 */
> +	if (!mmget_not_zero(mm))
> +		return true;

Why? Don't the notifiers provide for this already?

mmget_not_zero() is required before calling hmm_range_fault() though

> +static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
> +			 unsigned long end, bool write)
> +{
> +	struct mm_struct *mm = dmirror->mm;
> +	unsigned long addr;
> +	uint64_t pfns[64];
> +	struct hmm_range range = {
> +		.notifier = &dmirror->notifier,
> +		.pfns = pfns,
> +		.flags = dmirror_hmm_flags,
> +		.values = dmirror_hmm_values,
> +		.pfn_shift = DPT_SHIFT,
> +		.pfn_flags_mask = ~(dmirror_hmm_flags[HMM_PFN_VALID] |
> +				    dmirror_hmm_flags[HMM_PFN_WRITE]),
> +		.default_flags = dmirror_hmm_flags[HMM_PFN_VALID] |
> +				(write ? dmirror_hmm_flags[HMM_PFN_WRITE] : 0),
> +		.dev_private_owner = dmirror->mdevice,
> +	};
> +	int ret = 0;
> +
> +	/* Since the mm is for the mirrored process, get a reference first. */
> +	if (!mmget_not_zero(mm))
> +		return 0;

Right

> +	for (addr = start; addr < end; addr = range.end) {
> +		range.start = addr;
> +		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
> +
> +		ret = dmirror_range_fault(dmirror, &range);
> +		if (ret)
> +			break;
> +	}
> +
> +	mmput(mm);
> +	return ret;
> +}
> +
> +static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
> +			   unsigned long end, struct dmirror_bounce *bounce)
> +{
> +	unsigned long pfn;
> +	void *ptr;
> +
> +	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);
> +
> +	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
> +		void *entry;
> +		struct page *page;
> +		void *tmp;
> +
> +		entry = xa_load(&dmirror->pt, pfn);
> +		page = xa_untag_pointer(entry);
> +		if (!page)
> +			return -ENOENT;
> +
> +		tmp = kmap(page);
> +		memcpy(ptr, tmp, PAGE_SIZE);
> +		kunmap(page);
> +
> +		ptr += PAGE_SIZE;
> +		bounce->cpages++;
> +	}
> +
> +	return 0;
> +}
> +
> +static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
> +{
> +	struct dmirror_bounce bounce;
> +	unsigned long start, end;
> +	unsigned long size = cmd->npages << PAGE_SHIFT;
> +	int ret;
> +
> +	start = cmd->addr;
> +	end = start + size;
> +	if (end < start)
> +		return -EINVAL;
> +
> +	ret = dmirror_bounce_init(&bounce, start, size);
> +	if (ret)
> +		return ret;
> +
> +again:
> +	mutex_lock(&dmirror->mutex);
> +	ret = dmirror_do_read(dmirror, start, end, &bounce);
> +	mutex_unlock(&dmirror->mutex);
> +	if (ret == 0)
> +		ret = copy_to_user((void __user *)cmd->ptr, bounce.ptr,
> +					bounce.size);

Use u64_to_user_ptr() instead of the cast

> +static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
> +{
> +	struct dmirror_bounce bounce;
> +	unsigned long start, end;
> +	unsigned long size = cmd->npages << PAGE_SHIFT;
> +	int ret;
> +
> +	start = cmd->addr;
> +	end = start + size;
> +	if (end < start)
> +		return -EINVAL;
> +
> +	ret = dmirror_bounce_init(&bounce, start, size);
> +	if (ret)
> +		return ret;
> +	ret = copy_from_user(bounce.ptr, (void __user *)cmd->ptr,
> +				bounce.size);

ditto

> +	if (ret)
> +		return ret;
> +
> +again:
> +	mutex_lock(&dmirror->mutex);
> +	ret = dmirror_do_write(dmirror, start, end, &bounce);
> +	mutex_unlock(&dmirror->mutex);
> +	if (ret == -ENOENT) {
> +		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
> +		ret = dmirror_fault(dmirror, start, end, true);
> +		if (ret == 0) {
> +			cmd->faults++;
> +			goto again;

Use a loop instead of goto?

Also I get this:

lib/test_hmm.c: In function ‘dmirror_devmem_fault_alloc_and_copy’:
lib/test_hmm.c:1041:25: warning: unused variable ‘vma’ [-Wunused-variable]
 1041 |  struct vm_area_struct *vma = args->vma;

But this is a kernel bug, due to alloc_page_vma being a #define not a
static inline and me having CONFIG_NUMA off in this .config

Jason
Ralph Campbell March 19, 2020, 10:56 p.m. UTC | #15
Adding linux-kselftest@vger.kernel.org for the test config question.

On 3/19/20 11:17 AM, Jason Gunthorpe wrote:
> On Tue, Mar 17, 2020 at 04:14:31PM -0700, Ralph Campbell wrote:
>>
>> On 3/17/20 5:59 AM, Christoph Hellwig wrote:
>>> On Tue, Mar 17, 2020 at 09:47:55AM -0300, Jason Gunthorpe wrote:
>>>> I've been using v7 of Ralph's tester and it is working well - it has
>>>> DEVICE_PRIVATE support so I think it can test this flow too. Ralph are
>>>> you able?
>>>>
>>>> This hunk seems trivial enough to me, can we include it now?
>>>
>>> I can send a separate patch for it once the tester covers it.  I don't
>>> want to add it to the original patch as it is a significant behavior
>>> change compared to the existing code.
>>>
>>
>> Attached is an updated version of my HMM tests based on linux-5.6.0-rc6.
>> I ran this OK with Jason's 8+1 HMM patches, Christoph's 1-5 misc HMM clean ups,
>> and Christoph's 1-4 device private page changes applied.
> 
> I'd like to get this to mergeable, it looks pretty good now, but I have
> no idea about selftests - and I'm struggling to even compile the tools
> dir
> 
>> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
>> index 69def4a9df00..4d22ce7879a7 100644
>> +++ b/lib/Kconfig.debug
>> @@ -2162,6 +2162,18 @@ config TEST_MEMINIT
>>   
>>   	  If unsure, say N.
>>   
>> +config TEST_HMM
>> +	tristate "Test HMM (Heterogeneous Memory Management)"
>> +	depends on DEVICE_PRIVATE
>> +	select HMM_MIRROR
>> +        select MMU_NOTIFIER
> 
> extra spaces

Will fix in v8.

> In general I wonder if it even makes sense that DEVICE_PRIVATE is user
> selectable?

Should tests enable the feature or the feature enable the test?
IMHO, if the feature is being compiled into the kernel, that should
enable the menu item for the test. If the feature isn't selected,
no need to test it :-)

>> +static int dmirror_fops_open(struct inode *inode, struct file *filp)
>> +{
>> +	struct cdev *cdev = inode->i_cdev;
>> +	struct dmirror *dmirror;
>> +	int ret;
>> +
>> +	/* Mirror this process address space */
>> +	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
>> +	if (dmirror == NULL)
>> +		return -ENOMEM;
>> +
>> +	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
>> +	mutex_init(&dmirror->mutex);
>> +	xa_init(&dmirror->pt);
>> +
>> +	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
>> +				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
>> +	if (ret) {
>> +		kfree(dmirror);
>> +		return ret;
>> +	}
>> +
>> +	/* Pairs with the mmdrop() in dmirror_fops_release(). */
>> +	mmgrab(current->mm);
>> +	dmirror->mm = current->mm;
> 
> The notifier holds a mmgrab, no need for another one

OK. I'll replace dmirror->mm with dmirror->notifier.mm.

>> +	/* Only the first open registers the address space. */
>> +	filp->private_data = dmirror;
> 
> Not sure what this comment means

I'll change the comment to:
	/*
          * The first open of the device character file registers the address
          * space of the process doing the open() system call with the device.
          * Subsequent file opens by other processes will have access to the
          * first process' address space.
          */

>> +static inline struct dmirror_device *dmirror_page_to_device(struct page *page)
>> +
>> +{
>> +	struct dmirror_chunk *devmem;
>> +
>> +	devmem = container_of(page->pgmap, struct dmirror_chunk, pagemap);
>> +	return devmem->mdevice;
>> +}
> 
> extra devmem var is not really needed

I'll change this to:
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;

>> +
>> +static bool dmirror_device_is_mine(struct dmirror_device *mdevice,
>> +				   struct page *page)
>> +{
>> +	if (!is_zone_device_page(page))
>> +		return false;
>> +	return page->pgmap->ops == &dmirror_devmem_ops &&
>> +		dmirror_page_to_device(page) == mdevice;
>> +}
> 
> Use new owner stuff, right? Actually this is redundant now, the check
> should be just WARN_ON pageowner != self owner

I'll clean this up. dmirror_device_is_mine() isn't needed now.

>> +static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
>> +{
>> +	uint64_t *pfns = range->pfns;
>> +	unsigned long pfn;
>> +
>> +	for (pfn = (range->start >> PAGE_SHIFT);
>> +	     pfn < (range->end >> PAGE_SHIFT);
>> +	     pfn++, pfns++) {
>> +		struct page *page;
>> +		void *entry;
>> +
>> +		/*
>> +		 * HMM_PFN_ERROR is returned if it is accessing invalid memory
>> +		 * either because of memory error (hardware detected memory
>> +		 * corruption) or more likely because of truncate on mmap
>> +		 * file.
>> +		 */
>> +		if (*pfns == range->values[HMM_PFN_ERROR])
>> +			return -EFAULT;
> 
> Unless snapshot mode is in use, hmm_range_fault() never returns success
> and sets PFN_ERROR, so this should be a WARN_ON
> 
>> +		if (!(*pfns & range->flags[HMM_PFN_VALID]))
>> +			return -EFAULT;
> 
> Same with valid.
> 
>> +		page = hmm_device_entry_to_page(range, *pfns);
>> +		/* We asked for pages to be populated but check anyway. */
>> +		if (!page)
>> +			return -EFAULT;
> 
> WARN_ON
> 
>> +		if (is_zone_device_page(page)) {
>> +			/*
>> +			 * TODO: need a way to ask HMM to fault foreign zone
>> +			 * device private pages.
>> +			 */
>> +			if (!dmirror_device_is_mine(dmirror->mdevice, page))
>> +				continue;
> 
> Actually re
> 
>> +static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
>> +				const struct mmu_notifier_range *range,
>> +				unsigned long cur_seq)
>> +{
>> +	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);
>> +	struct mm_struct *mm = dmirror->mm;
>> +
>> +	/*
>> +	 * If the process doesn't exist, we don't need to invalidate the
>> +	 * device page table since the address space will be torn down.
>> +	 */
>> +	if (!mmget_not_zero(mm))
>> +		return true;
> 
> Why? Don't the notifiers provide for this already.
> 
> mmget_not_zero() is required before calling hmm_range_fault() though

This is a workaround for a problem I don't quite understand.
If you change tools/testing/selftests/vm/hmm-tests.c line 868 to
	ASSERT_EQ(ret, -1);
Then the test will abort, core dump, and cause two problems,
1) the migrated page will be faulted back to system memory in order to write
    it to the core dump. This triggers lockdep_assert_held(&walk.mm->mmap_sem)
    in walk_page_range().
2) Then after a delay, I get:
[  137.852986] rcu: INFO: rcu_sched self-detected stall on CPU
[  137.858594] rcu: 	0-....: (26000 ticks this GP) idle=69e/1/0x4000000000000002 softirq=34555/34555 fqs=6497
[  137.868439] 	(t=26007 jiffies g=14653 q=271)
[  137.872711] NMI backtrace for cpu 0
[  137.876205] CPU: 0 PID: 6228 Comm: hmm-tests Not tainted 5.6.0-rc6+ #2
[  137.882730] Hardware name: System manufacturer System Product Name/SABERTOOTH X79, BIOS 4302 08/29/2013
[  137.892115] Call Trace:
[  137.894570]  <IRQ>
[  137.896593]  dump_stack+0x97/0xe0
[  137.899920]  nmi_cpu_backtrace.cold+0x14/0x68
[  137.904287]  ? lapic_can_unplug_cpu.cold+0x39/0x39
[  137.909091]  nmi_trigger_cpumask_backtrace+0xf1/0x10e
[  137.914152]  rcu_dump_cpu_stacks+0xe2/0x125
[  137.918348]  rcu_sched_clock_irq.cold+0x393/0x610
[  137.923069]  update_process_times+0x24/0x50
[  137.927263]  tick_sched_handle+0x68/0x90
[  137.931196]  tick_sched_timer+0x38/0xa0
[  137.935037]  __hrtimer_run_queues+0x1f9/0x6c0
[  137.939403]  ? tick_sched_do_timer+0x90/0x90
[  137.943683]  ? enqueue_hrtimer+0x1a0/0x1a0
[  137.947790]  ? recalibrate_cpu_khz+0x10/0x10
[  137.952064]  ? ktime_get_update_offsets_now+0xed/0x1c0
[  137.957209]  hrtimer_interrupt+0x1a5/0x340
[  137.961315]  ? rcu_read_lock_sched_held+0xa1/0xd0
[  137.966038]  smp_apic_timer_interrupt+0xbb/0x320
[  137.970665]  apic_timer_interrupt+0xf/0x20
[  137.974771]  </IRQ>
[  137.976876] RIP: 0010:xas_load+0x7/0x80
[  137.980718] Code: 80 2f 1a 83 c6 05 e9 8d 7b 01 01 e8 3e b1 b1 fe e9 05 ff ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 41 56 41 55 41 54 55 <48> 89 fd 53 4c 8d 6d 10 e8 3c fc ff ff 49 89 c4 4c 89 e0 83 e0 03
[  137.999461] RSP: 0018:ffffc900015e77c8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
[  138.007028] RAX: ffff8886e508c408 RBX: 0000000000000000 RCX: ffffffff82626c89
[  138.014159] RDX: dffffc0000000000 RSI: 0000000000000000 RDI: ffffc900015e78a0
[  138.021293] RBP: ffffc900015e78a0 R08: ffffffff811461c4 R09: fffff520002bcf17
[  138.028426] R10: fffff520002bcf16 R11: 0000000000000003 R12: 0000000002606d10
[  138.035557] R13: ffff8886e508c448 R14: 0000000000000031 R15: ffffffffa06546a0
[  138.042701]  ? do_raw_spin_lock+0x104/0x1d0
[  138.046888]  ? xas_store+0x19/0xa60
[  138.050390]  xas_store+0x5b3/0xa60
[  138.053806]  ? register_lock_class+0x860/0x860
[  138.058267]  __xa_erase+0x96/0x110
[  138.061673]  ? xas_store+0xa60/0xa60
[  138.065267]  xa_erase+0x19/0x30
[  138.068418]  dmirror_interval_invalidate+0x7d/0xc0 [test_hmm]
[  138.074174]  __mmu_notifier_release+0x1a6/0x370
[  138.078714]  ? mmu_notifier_unregister+0x1e0/0x1e0
[  138.083520]  ? lock_downgrade+0x380/0x380
[  138.087535]  ? uprobe_clear_state+0x2e/0x150
[  138.091823]  exit_mmap+0x24d/0x2a0
[  138.095229]  ? do_munmap+0x10/0x10
[  138.098635]  ? __x64_sys_io_setup+0x200/0x200
[  138.102995]  ? __mutex_unlock_slowpath+0xb4/0x3f0
[  138.107704]  ? wait_for_completion+0x250/0x250
[  138.112158]  ? lock_downgrade+0x380/0x380
[  138.116176]  ? check_flags.part.0+0x82/0x210
[  138.120463]  mmput+0xb5/0x210
[  138.123444]  do_exit+0x602/0x14c0
[  138.126776]  ? mm_update_next_owner+0x400/0x400
[  138.131329]  do_group_exit+0x8a/0x140
[  138.135006]  get_signal+0x25b/0x1080
[  138.138606]  do_signal+0x8c/0xa90
[  138.141928]  ? _raw_spin_unlock_irq+0x24/0x30
[  138.146292]  ? mark_held_locks+0x24/0x90
[  138.150219]  ? _raw_spin_unlock_irq+0x24/0x30
[  138.154580]  ? lockdep_hardirqs_on+0x190/0x280
[  138.159026]  ? setup_sigcontext+0x260/0x260
[  138.163210]  ? sigprocmask+0x10b/0x150
[  138.166965]  ? __x64_sys_rt_sigsuspend+0xe0/0xe0
[  138.171594]  ? __x64_sys_rt_sigprocmask+0xfb/0x180
[  138.176394]  ? __ia32_compat_sys_rt_sigprocmask+0x190/0x190
[  138.181965]  ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
[  138.187192]  ? exit_to_usermode_loop+0x60/0x100
[  138.191723]  ? mark_held_locks+0x24/0x90
[  138.195656]  exit_to_usermode_loop+0x85/0x100
[  138.200023]  do_syscall_64+0x20b/0x290
[  138.203782]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  138.208839] RIP: 0033:0x7f2d349f1625
[  138.212420] Code: c2 b8 ea 00 00 00 0f 05 48 3d 00 f0 ff ff 77 3d 41 89 c0 41 ba 08 00 00 00 31 d2 4c 89 ce bf 02 00 00 00 b8 0e 00 00 00 0f 05 <48> 8b 84 24 08 01 00 00 64 48 33 04 25 28 00 00 00 75 24 44 89 c0
[  138.231163] RSP: 002b:00007ffe84228c60 EFLAGS: 00000246 ORIG_RAX: 000000000000000e
[  138.238731] RAX: 0000000000000000 RBX: 00007f2d349ad040 RCX: 00007f2d349f1625
[  138.245862] RDX: 0000000000000000 RSI: 00007ffe84228c60 RDI: 0000000000000002
[  138.252997] RBP: 00007ffe84228ec0 R08: 0000000000000000 R09: 00007ffe84228c60
[  138.260127] R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000401240
[  138.267261] R13: 00007ffe84229190 R14: 0000000000000000 R15: 0000000000000000
[  138.274554] rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 0-... } 26428 jiffies s: 277 root: 0x1/.
[  138.285167] rcu: blocking rcu_node structures:
[  138.289641] Task dump for CPU 0:
[  138.292890] hmm-tests       R  running task    27560  6228   6211 0x8000400a
[  138.300002] Call Trace:
[  138.302486]  ? check_chain_key+0x1d1/0x2c0
[  138.306627]  ? __lock_acquire+0x61c/0x2820
[  138.310760]  ? match_held_lock+0x1b/0x230
[  138.314800]  ? check_chain_key+0x1d1/0x2c0
[  138.318937]  ? lock_downgrade+0x380/0x380
[  138.323008]  ? lock_acquire+0xff/0x220
[  138.326795]  ? stack_depot_save+0x137/0x450
[  138.331026]  ? _raw_spin_unlock_irqrestore+0x3e/0x50
[  138.336026]  ? mark_held_locks+0x24/0x90
[  138.340012]  ? _raw_spin_unlock_irqrestore+0x3e/0x50
[  138.345022]  ? lockdep_hardirqs_on+0x190/0x280
[  138.349504]  ? stack_depot_save+0x253/0x450
[  138.353731]  ? check_chain_key+0x1d1/0x2c0
[  138.357871]  ? __lock_acquire+0x61c/0x2820
[  138.362033]  ? match_held_lock+0x1b/0x230
[  138.366070]  ? check_chain_key+0x1d1/0x2c0
[  138.370209]  ? mark_lock+0xac/0x9e0
[  138.373734]  ? mark_lock+0xac/0x9e0
[  138.377261]  ? mark_lock+0xac/0x9e0
[  138.380772]  ? mark_held_locks+0x65/0x90
[  138.384730]  ? mark_lock+0xac/0x9e0
[  138.388249]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[  138.392895]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[  138.397542]  ? lockdep_hardirqs_on+0x190/0x280
[  138.402029]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[  138.406676]  ? mark_lock+0xac/0x9e0
[  138.410209]  ? lock_is_held_type+0x110/0x140
[  138.414509]  ? mark_held_locks+0x65/0x90
[  138.418461]  ? match_held_lock+0x1b/0x230
[  138.422500]  ? match_held_lock+0x1b/0x230
[  138.426551]  ? __lock_acquire+0x11c3/0x2820
[  138.430773]  ? xas_load+0x64/0x80
[  138.434126]  ? xas_store+0xac/0xa60
[  138.437652]  ? register_lock_class+0x860/0x860
[  138.442140]  ? lock_downgrade+0x380/0x380
[  138.446193]  ? _raw_spin_lock+0x2c/0x40
[  138.450059]  ? _raw_spin_unlock+0x17/0x30
[  138.454100]  ? xa_erase+0xe/0x30
[  138.457370]  ? dmirror_interval_invalidate+0x7d/0xc0 [test_hmm]
[  138.463325]  ? __mmu_notifier_release+0x1a6/0x370
[  138.468065]  ? mmu_notifier_unregister+0x1e0/0x1e0
[  138.472896]  ? lock_downgrade+0x380/0x380
[  138.476937]  ? uprobe_clear_state+0x2e/0x150
[  138.481252]  ? exit_mmap+0x24d/0x2a0
[  138.484859]  ? do_munmap+0x10/0x10
[  138.488298]  ? __x64_sys_io_setup+0x200/0x200
[  138.492683]  ? __mutex_unlock_slowpath+0xb4/0x3f0
[  138.497418]  ? wait_for_completion+0x250/0x250
[  138.501897]  ? lock_downgrade+0x380/0x380
[  138.505944]  ? check_flags.part.0+0x82/0x210
[  138.510254]  ? mmput+0xb5/0x210
[  138.513435]  ? do_exit+0x602/0x14c0
[  138.516967]  ? mm_update_next_owner+0x400/0x400
[  138.521545]  ? do_group_exit+0x8a/0x140
[  138.525423]  ? get_signal+0x25b/0x1080
[  138.529225]  ? do_signal+0x8c/0xa90
[  138.532755]  ? _raw_spin_unlock_irq+0x24/0x30
[  138.537143]  ? mark_held_locks+0x24/0x90
[  138.541096]  ? _raw_spin_unlock_irq+0x24/0x30
[  138.545492]  ? lockdep_hardirqs_on+0x190/0x280
[  138.549971]  ? setup_sigcontext+0x260/0x260
[  138.554190]  ? sigprocmask+0x10b/0x150
[  138.557969]  ? __x64_sys_rt_sigsuspend+0xe0/0xe0
[  138.562627]  ? __x64_sys_rt_sigprocmask+0xfb/0x180
[  138.567452]  ? __ia32_compat_sys_rt_sigprocmask+0x190/0x190
[  138.573057]  ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
[  138.578311]  ? exit_to_usermode_loop+0x60/0x100
[  138.582875]  ? mark_held_locks+0x24/0x90
[  138.586837]  ? exit_to_usermode_loop+0x85/0x100
[  138.591400]  ? do_syscall_64+0x20b/0x290
[  138.595360]  ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
./test_hmm.sh: line 58:  6211 Alarm clock             ./hmm-tests


>> +static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
>> +			 unsigned long end, bool write)
>> +{
>> +	struct mm_struct *mm = dmirror->mm;
>> +	unsigned long addr;
>> +	uint64_t pfns[64];
>> +	struct hmm_range range = {
>> +		.notifier = &dmirror->notifier,
>> +		.pfns = pfns,
>> +		.flags = dmirror_hmm_flags,
>> +		.values = dmirror_hmm_values,
>> +		.pfn_shift = DPT_SHIFT,
>> +		.pfn_flags_mask = ~(dmirror_hmm_flags[HMM_PFN_VALID] |
>> +				    dmirror_hmm_flags[HMM_PFN_WRITE]),
>> +		.default_flags = dmirror_hmm_flags[HMM_PFN_VALID] |
>> +				(write ? dmirror_hmm_flags[HMM_PFN_WRITE] : 0),
>> +		.dev_private_owner = dmirror->mdevice,
>> +	};
>> +	int ret = 0;
>> +
>> +	/* Since the mm is for the mirrored process, get a reference first. */
>> +	if (!mmget_not_zero(mm))
>> +		return 0;
> 
> Right
> 
>> +	for (addr = start; addr < end; addr = range.end) {
>> +		range.start = addr;
>> +		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
>> +
>> +		ret = dmirror_range_fault(dmirror, &range);
>> +		if (ret)
>> +			break;
>> +	}
>> +
>> +	mmput(mm);
>> +	return ret;
>> +}
>> +
>> +static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
>> +			   unsigned long end, struct dmirror_bounce *bounce)
>> +{
>> +	unsigned long pfn;
>> +	void *ptr;
>> +
>> +	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);
>> +
>> +	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
>> +		void *entry;
>> +		struct page *page;
>> +		void *tmp;
>> +
>> +		entry = xa_load(&dmirror->pt, pfn);
>> +		page = xa_untag_pointer(entry);
>> +		if (!page)
>> +			return -ENOENT;
>> +
>> +		tmp = kmap(page);
>> +		memcpy(ptr, tmp, PAGE_SIZE);
>> +		kunmap(page);
>> +
>> +		ptr += PAGE_SIZE;
>> +		bounce->cpages++;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
>> +{
>> +	struct dmirror_bounce bounce;
>> +	unsigned long start, end;
>> +	unsigned long size = cmd->npages << PAGE_SHIFT;
>> +	int ret;
>> +
>> +	start = cmd->addr;
>> +	end = start + size;
>> +	if (end < start)
>> +		return -EINVAL;
>> +
>> +	ret = dmirror_bounce_init(&bounce, start, size);
>> +	if (ret)
>> +		return ret;
>> +
>> +again:
>> +	mutex_lock(&dmirror->mutex);
>> +	ret = dmirror_do_read(dmirror, start, end, &bounce);
>> +	mutex_unlock(&dmirror->mutex);
>> +	if (ret == 0)
>> +		ret = copy_to_user((void __user *)cmd->ptr, bounce.ptr,
>> +					bounce.size);
> 
> Use u64_to_user_ptr() instead of the cast

Will do.

>> +static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
>> +{
>> +	struct dmirror_bounce bounce;
>> +	unsigned long start, end;
>> +	unsigned long size = cmd->npages << PAGE_SHIFT;
>> +	int ret;
>> +
>> +	start = cmd->addr;
>> +	end = start + size;
>> +	if (end < start)
>> +		return -EINVAL;
>> +
>> +	ret = dmirror_bounce_init(&bounce, start, size);
>> +	if (ret)
>> +		return ret;
>> +	ret = copy_from_user(bounce.ptr, (void __user *)cmd->ptr,
>> +				bounce.size);
> 
> ditto
> 
>> +	if (ret)
>> +		return ret;
>> +
>> +again:
>> +	mutex_lock(&dmirror->mutex);
>> +	ret = dmirror_do_write(dmirror, start, end, &bounce);
>> +	mutex_unlock(&dmirror->mutex);
>> +	if (ret == -ENOENT) {
>> +		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
>> +		ret = dmirror_fault(dmirror, start, end, true);
>> +		if (ret == 0) {
>> +			cmd->faults++;
>> +			goto again;
> 
> Use a loop instead of goto?

OK.

> Also I get this:
> 
> lib/test_hmm.c: In function ‘dmirror_devmem_fault_alloc_and_copy’:
> lib/test_hmm.c:1041:25: warning: unused variable ‘vma’ [-Wunused-variable]
>   1041 |  struct vm_area_struct *vma = args->vma;
> 
> But this is a kernel bug, due to alloc_page_vma being a #define not a
> static inline and me having CONFIG_NUMA off in this .config

Fixed.
I'll repost as a proper series shortly.

> Jason
>
Jason Gunthorpe March 20, 2020, 12:03 a.m. UTC | #16
On Thu, Mar 19, 2020 at 03:56:50PM -0700, Ralph Campbell wrote:
> Adding linux-kselftest@vger.kernel.org for the test config question.
> 
> On 3/19/20 11:17 AM, Jason Gunthorpe wrote:
> > On Tue, Mar 17, 2020 at 04:14:31PM -0700, Ralph Campbell wrote:
> > > 
> > > On 3/17/20 5:59 AM, Christoph Hellwig wrote:
> > > > On Tue, Mar 17, 2020 at 09:47:55AM -0300, Jason Gunthorpe wrote:
> > > > > I've been using v7 of Ralph's tester and it is working well - it has
> > > > > DEVICE_PRIVATE support so I think it can test this flow too. Ralph are
> > > > > you able?
> > > > > 
> > > > > This hunk seems trivial enough to me, can we include it now?
> > > > 
> > > > I can send a separate patch for it once the tester covers it.  I don't
> > > > want to add it to the original patch as it is a significant behavior
> > > > change compared to the existing code.
> > > > 
> > > 
> > > Attached is an updated version of my HMM tests based on linux-5.6.0-rc6.
> > > I ran this OK with Jason's 8+1 HMM patches, Christoph's 1-5 misc HMM clean ups,
> > > and Christoph's 1-4 device private page changes applied.
> > 
> > > I'd like to get this to mergeable, it looks pretty good now, but I have
> > no idea about selftests - and I'm struggling to even compile the tools
> > dir
> > 
> > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > > index 69def4a9df00..4d22ce7879a7 100644
> > > +++ b/lib/Kconfig.debug
> > > @@ -2162,6 +2162,18 @@ config TEST_MEMINIT
> > >   	  If unsure, say N.
> > > +config TEST_HMM
> > > +	tristate "Test HMM (Heterogeneous Memory Management)"
> > > +	depends on DEVICE_PRIVATE
> > > +	select HMM_MIRROR
> > > +        select MMU_NOTIFIER
> > 
> > extra spaces
> 
> Will fix in v8.
> 
> > In general I wonder if it even makes sense that DEVICE_PRIVATE is user
> > selectable?
> 
> Should tests enable the feature or the feature enable the test?
> IMHO, if the feature is being compiled into the kernel, that should
> enable the menu item for the test. If the feature isn't selected,
> no need to test it :-)

I meant if DEVICE_PRIVATE should be a user selectable option at all, or
should it be turned on when a driver like nouveau is selected.

Is there some downside to enabling DEVICE_PRIVATE?

> > The notifier holds a mmgrab, no need for another one
> 
> OK. I'll replace dmirror->mm with dmirror->notifier.mm.

Right that is good too

> > > +	filp->private_data = dmirror;
> > 
> > Not sure what this comment means
> 
> I'll change the comment to:
> 	  /*
>          * The first open of the device character file registers the address
>          * space of the process doing the open() system call with the device.
>          * Subsequent file opens by other processes will have access to the
>          * first process' address space.
>          */

How does this happen? The function looks like it always does the same thing

> > > +static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
> > > +				const struct mmu_notifier_range *range,
> > > +				unsigned long cur_seq)
> > > +{
> > > +	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);
> > > +	struct mm_struct *mm = dmirror->mm;
> > > +
> > > +	/*
> > > +	 * If the process doesn't exist, we don't need to invalidate the
> > > +	 * device page table since the address space will be torn down.
> > > +	 */
> > > +	if (!mmget_not_zero(mm))
> > > +		return true;
> > 
> > Why? Don't the notifiers provide for this already.
> > 
> > mmget_not_zero() is required before calling hmm_range_fault() though

Oh... This is the invalidate_all path during invalidation

IMHO you should test the invalidation reason in the range to exclude
this.

But xa_erase looks totally safe so there should be no reason to do
that.

> This is a workaround for a problem I don't quite understand.
> If you change tools/testing/selftests/vm/hmm-tests.c line 868 to
> 	ASSERT_EQ(ret, -1);
> Then the test will abort, core dump, and cause two problems,
> 1) the migrated page will be faulted back to system memory in order to write
>    it to the core dump. This triggers lockdep_assert_held(&walk.mm->mmap_sem)
>    in walk_page_range().

Has the migration stuff become entangled with the xarray?

> [  137.980718] Code: 80 2f 1a 83 c6 05 e9 8d 7b 01 01 e8 3e b1 b1 fe e9 05 ff ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 41 56 41 55 41 54 55 <48> 89 fd 53 4c 8d 6d 10 e8 3c fc ff ff 49 89 c4 4c 89 e0 83 e0 03
> [  137.999461] RSP: 0018:ffffc900015e77c8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
> [  138.007028] RAX: ffff8886e508c408 RBX: 0000000000000000 RCX: ffffffff82626c89
> [  138.014159] RDX: dffffc0000000000 RSI: 0000000000000000 RDI: ffffc900015e78a0
> [  138.021293] RBP: ffffc900015e78a0 R08: ffffffff811461c4 R09: fffff520002bcf17
> [  138.028426] R10: fffff520002bcf16 R11: 0000000000000003 R12: 0000000002606d10
> [  138.035557] R13: ffff8886e508c448 R14: 0000000000000031 R15: ffffffffa06546a0
> [  138.042701]  ? do_raw_spin_lock+0x104/0x1d0
> [  138.046888]  ? xas_store+0x19/0xa60
> [  138.050390]  xas_store+0x5b3/0xa60
> [  138.053806]  ? register_lock_class+0x860/0x860
> [  138.058267]  __xa_erase+0x96/0x110
> [  138.061673]  ? xas_store+0xa60/0xa60
> [  138.065267]  xa_erase+0x19/0x30

oh, it is doing this:

static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
                             struct mm_struct *mm)
{
        struct mmu_notifier_range range = {
                .flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
                .event = MMU_NOTIFY_RELEASE,
                .mm = mm,
                .start = 0,
                .end = ULONG_MAX,
        };

ie it is sitting doing a huge number of xa_erases, I suppose. Probably
in normal exit the notifier is removed before the mm is destroyed.

The xa_erase needs to be a bit smarter to jump over gaps in the tree
perhaps some

xa_for_each()
   xa_erase()

pattern?

> > Also I get this:
> > 
> > lib/test_hmm.c: In function ‘dmirror_devmem_fault_alloc_and_copy’:
> > lib/test_hmm.c:1041:25: warning: unused variable ‘vma’ [-Wunused-variable]
> >   1041 |  struct vm_area_struct *vma = args->vma;
> > 
> > But this is a kernel bug, due to alloc_page_vma being a #define not a
> > static inline and me having CONFIG_NUMA off in this .config
> 
> Fixed.

in gfp.h?

Jason
Jason Gunthorpe March 20, 2020, 12:14 a.m. UTC | #17
On Tue, Mar 17, 2020 at 04:14:31PM -0700, Ralph Campbell wrote:

> +static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
> +			 unsigned long end, bool write)
> +{
> +	struct mm_struct *mm = dmirror->mm;
> +	unsigned long addr;
> +	uint64_t pfns[64];
> +	struct hmm_range range = {
> +		.notifier = &dmirror->notifier,
> +		.pfns = pfns,
> +		.flags = dmirror_hmm_flags,
> +		.values = dmirror_hmm_values,
> +		.pfn_shift = DPT_SHIFT,
> +		.pfn_flags_mask = ~(dmirror_hmm_flags[HMM_PFN_VALID] |
> +				    dmirror_hmm_flags[HMM_PFN_WRITE]),

Since pfns is not initialized pfn_flags_mask should be 0.

> +		.default_flags = dmirror_hmm_flags[HMM_PFN_VALID] |
> +				(write ? dmirror_hmm_flags[HMM_PFN_WRITE] : 0),
> +		.dev_private_owner = dmirror->mdevice,
> +	};
> +	int ret = 0;

> +static int dmirror_snapshot(struct dmirror *dmirror,
> +			    struct hmm_dmirror_cmd *cmd)
> +{
> +	struct mm_struct *mm = dmirror->mm;
> +	unsigned long start, end;
> +	unsigned long size = cmd->npages << PAGE_SHIFT;
> +	unsigned long addr;
> +	unsigned long next;
> +	uint64_t pfns[64];
> +	unsigned char perm[64];
> +	char __user *uptr;
> +	struct hmm_range range = {
> +		.pfns = pfns,
> +		.flags = dmirror_hmm_flags,
> +		.values = dmirror_hmm_values,
> +		.pfn_shift = DPT_SHIFT,
> +		.pfn_flags_mask = ~0ULL,

Same here, especially since this is snapshot

Jason
Ralph Campbell March 20, 2020, 1:33 a.m. UTC | #18
On 3/19/20 5:14 PM, Jason Gunthorpe wrote:
> On Tue, Mar 17, 2020 at 04:14:31PM -0700, Ralph Campbell wrote:
> 
>> +static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
>> +			 unsigned long end, bool write)
>> +{
>> +	struct mm_struct *mm = dmirror->mm;
>> +	unsigned long addr;
>> +	uint64_t pfns[64];
>> +	struct hmm_range range = {
>> +		.notifier = &dmirror->notifier,
>> +		.pfns = pfns,
>> +		.flags = dmirror_hmm_flags,
>> +		.values = dmirror_hmm_values,
>> +		.pfn_shift = DPT_SHIFT,
>> +		.pfn_flags_mask = ~(dmirror_hmm_flags[HMM_PFN_VALID] |
>> +				    dmirror_hmm_flags[HMM_PFN_WRITE]),
> 
> Since pfns is not initialized pfn_flags_mask should be 0.

Good point.

>> +		.default_flags = dmirror_hmm_flags[HMM_PFN_VALID] |
>> +				(write ? dmirror_hmm_flags[HMM_PFN_WRITE] : 0),
>> +		.dev_private_owner = dmirror->mdevice,
>> +	};
>> +	int ret = 0;
> 
>> +static int dmirror_snapshot(struct dmirror *dmirror,
>> +			    struct hmm_dmirror_cmd *cmd)
>> +{
>> +	struct mm_struct *mm = dmirror->mm;
>> +	unsigned long start, end;
>> +	unsigned long size = cmd->npages << PAGE_SHIFT;
>> +	unsigned long addr;
>> +	unsigned long next;
>> +	uint64_t pfns[64];
>> +	unsigned char perm[64];
>> +	char __user *uptr;
>> +	struct hmm_range range = {
>> +		.pfns = pfns,
>> +		.flags = dmirror_hmm_flags,
>> +		.values = dmirror_hmm_values,
>> +		.pfn_shift = DPT_SHIFT,
>> +		.pfn_flags_mask = ~0ULL,
> 
> Same here, especially since this is snapshot
> 
> Jason

Actually, snapshot ignores pfn_flags_mask and default_flags.
In hmm_pte_need_fault(), HMM_FAULT_SNAPSHOT is checked and returns early before
checking pfn_flags_mask and default_flags since no faults are being requested.
Jason Gunthorpe March 20, 2020, 12:58 p.m. UTC | #19
On Thu, Mar 19, 2020 at 06:33:04PM -0700, Ralph Campbell wrote:

> > > +		.default_flags = dmirror_hmm_flags[HMM_PFN_VALID] |
> > > +				(write ? dmirror_hmm_flags[HMM_PFN_WRITE] : 0),
> > > +		.dev_private_owner = dmirror->mdevice,
> > > +	};
> > > +	int ret = 0;
> > 
> > > +static int dmirror_snapshot(struct dmirror *dmirror,
> > > +			    struct hmm_dmirror_cmd *cmd)
> > > +{
> > > +	struct mm_struct *mm = dmirror->mm;
> > > +	unsigned long start, end;
> > > +	unsigned long size = cmd->npages << PAGE_SHIFT;
> > > +	unsigned long addr;
> > > +	unsigned long next;
> > > +	uint64_t pfns[64];
> > > +	unsigned char perm[64];
> > > +	char __user *uptr;
> > > +	struct hmm_range range = {
> > > +		.pfns = pfns,
> > > +		.flags = dmirror_hmm_flags,
> > > +		.values = dmirror_hmm_values,
> > > +		.pfn_shift = DPT_SHIFT,
> > > +		.pfn_flags_mask = ~0ULL,
> > 
> > Same here, especially since this is snapshot
> > 
> > Jason
> 
> Actually, snapshot ignores pfn_flags_mask and default_flags.

Yes, so there is no reason to set them to anything other than 0.

Jason
Christoph Hellwig March 21, 2020, 8:20 a.m. UTC | #20
On Thu, Mar 19, 2020 at 09:03:45PM -0300, Jason Gunthorpe wrote:
> > Should tests enable the feature or the feature enable the test?
> > IMHO, if the feature is being compiled into the kernel, that should
> > enable the menu item for the test. If the feature isn't selected,
> > no need to test it :-)
> 
> I meant if DEVICE_PRIVATE should be a user selectable option at all, or
> should it be turned on when a driver like nouveau is selected.

I don't think it should be user selectable.  This is an implementation
detail users can't know about.

> Is there some downside to enabling DEVICE_PRIVATE?

The option itself adds a little more code to the core kernel, and
introduces a few additional branches in core mm code.

But more importantly it pulls in the whole pgmap infrastructure.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dee446278417..90821ce5e6ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -776,7 +776,6 @@  struct amdgpu_ttm_tt {
 static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
 	(1 << 0), /* HMM_PFN_VALID */
 	(1 << 1), /* HMM_PFN_WRITE */
-	0 /* HMM_PFN_DEVICE_PRIVATE */
 };
 
 static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 0e36345d395c..edfd0805fba4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -28,6 +28,7 @@ 
 
 #include <nvif/class.h>
 #include <nvif/object.h>
+#include <nvif/if000c.h>
 #include <nvif/if500b.h>
 #include <nvif/if900b.h>
 
@@ -692,9 +693,8 @@  nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 		if (page == NULL)
 			continue;
 
-		if (!(range->pfns[i] & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
+		if (!is_device_private_page(page))
 			continue;
-		}
 
 		if (!nouveau_dmem_page(drm, page)) {
 			WARN(1, "Some unknown device memory !\n");
@@ -705,5 +705,6 @@  nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 		addr = nouveau_dmem_page_addr(page);
 		range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
 		range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
+		range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM;
 	}
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index df9bf1fd1bc0..39c731a99937 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -367,7 +367,6 @@  static const u64
 nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = {
 	[HMM_PFN_VALID         ] = NVIF_VMM_PFNMAP_V0_V,
 	[HMM_PFN_WRITE         ] = NVIF_VMM_PFNMAP_V0_W,
-	[HMM_PFN_DEVICE_PRIVATE] = NVIF_VMM_PFNMAP_V0_VRAM,
 };
 
 static const u64
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4bf8d6997b12..5e6034f105c3 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -74,7 +74,6 @@ 
  * Flags:
  * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
  * HMM_PFN_WRITE: CPU page table has write permission set
- * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
  *
  * The driver provides a flags array for mapping page protections to device
  * PTE bits. If the driver valid bit for an entry is bit 3,
@@ -86,7 +85,6 @@ 
 enum hmm_pfn_flag_e {
 	HMM_PFN_VALID = 0,
 	HMM_PFN_WRITE,
-	HMM_PFN_DEVICE_PRIVATE,
 	HMM_PFN_FLAG_MAX
 };
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 180e398170b0..cfad65f6a67b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -118,15 +118,6 @@  static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 	/* We aren't ask to do anything ... */
 	if (!(pfns & range->flags[HMM_PFN_VALID]))
 		return;
-	/* If this is device memory then only fault if explicitly requested */
-	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
-		/* Do we fault on device memory ? */
-		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
-			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
-			*fault = true;
-		}
-		return;
-	}
 
 	/* If CPU page table is not valid then we need to fault */
 	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
@@ -260,21 +251,15 @@  static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		swp_entry_t entry = pte_to_swp_entry(pte);
 
 		/*
-		 * This is a special swap entry, ignore migration, use
-		 * device and report anything else as error.
+		 * Never fault in device private pages, but just report
+		 * the PFN even if not present.
 		 */
 		if (is_device_private_entry(entry)) {
-			cpu_flags = range->flags[HMM_PFN_VALID] |
-				range->flags[HMM_PFN_DEVICE_PRIVATE];
-			cpu_flags |= is_write_device_private_entry(entry) ?
-				range->flags[HMM_PFN_WRITE] : 0;
-			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-					   &fault, &write_fault);
-			if (fault || write_fault)
-				goto fault;
 			*pfn = hmm_device_entry_from_pfn(range,
 					    swp_offset(entry));
-			*pfn |= cpu_flags;
+			*pfn |= range->flags[HMM_PFN_VALID];
+			if (is_write_device_private_entry(entry))
+				*pfn |= range->flags[HMM_PFN_WRITE];
 			return 0;
 		}