[1/5] vfio/type1: use pinned_vm instead of locked_vm to account pinned pages

Message ID 20190211224437.25267-2-daniel.m.jordan@oracle.com
State Not Applicable
Headers show
Series
  • use pinned_vm instead of locked_vm to account pinned pages
Related show

Checks

Context Check Description
snowpatch_ozlabs/checkpatch fail total: 1 errors, 0 warnings, 0 checks, 63 lines checked
snowpatch_ozlabs/apply_patch success next/apply_patch Successfully applied

Commit Message

Daniel Jordan Feb. 11, 2019, 10:44 p.m.
Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
pages"), locked and pinned pages are accounted separately.  Type1
accounts pinned pages to locked_vm; use pinned_vm instead.

pinned_vm recently became atomic and so no longer relies on mmap_sem
held as writer: delete.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 drivers/vfio/vfio_iommu_type1.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

Comments

Jason Gunthorpe Feb. 11, 2019, 10:56 p.m. | #1
On Mon, Feb 11, 2019 at 05:44:33PM -0500, Daniel Jordan wrote:
> Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
> pages"), locked and pinned pages are accounted separately.  Type1
> accounts pinned pages to locked_vm; use pinned_vm instead.
> 
> pinned_vm recently became atomic and so no longer relies on mmap_sem
> held as writer: delete.
> 
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
>  drivers/vfio/vfio_iommu_type1.c | 31 ++++++++++++-------------------
>  1 file changed, 12 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 73652e21efec..a56cc341813f 100644
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -257,7 +257,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
>  {
>  	struct mm_struct *mm;
> -	int ret;
> +	s64 pinned_vm;
> +	int ret = 0;
>  
>  	if (!npage)
>  		return 0;
> @@ -266,24 +267,15 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
>  	if (!mm)
>  		return -ESRCH; /* process exited */
>  
> -	ret = down_write_killable(&mm->mmap_sem);
> -	if (!ret) {
> -		if (npage > 0) {
> -			if (!dma->lock_cap) {
> -				unsigned long limit;
> -
> -				limit = task_rlimit(dma->task,
> -						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	pinned_vm = atomic64_add_return(npage, &mm->pinned_vm);
>  
> -				if (mm->locked_vm + npage > limit)
> -					ret = -ENOMEM;
> -			}
> +	if (npage > 0 && !dma->lock_cap) {
> +		unsigned long limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >>
> +
> -					PAGE_SHIFT;

I haven't looked at this super closely, but how does this stuff work?

do_mlock doesn't touch pinned_vm, and this doesn't touch locked_vm...

Shouldn't all this be 'if (locked_vm + pinned_vm < RLIMIT_MEMLOCK)' ?

Otherwise MEMLOCK is really doubled..

Jason
Daniel Jordan Feb. 11, 2019, 11:11 p.m. | #2
On Mon, Feb 11, 2019 at 03:56:20PM -0700, Jason Gunthorpe wrote:
> On Mon, Feb 11, 2019 at 05:44:33PM -0500, Daniel Jordan wrote:
> > @@ -266,24 +267,15 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
> >  	if (!mm)
> >  		return -ESRCH; /* process exited */
> >  
> > -	ret = down_write_killable(&mm->mmap_sem);
> > -	if (!ret) {
> > -		if (npage > 0) {
> > -			if (!dma->lock_cap) {
> > -				unsigned long limit;
> > -
> > -				limit = task_rlimit(dma->task,
> > -						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +	pinned_vm = atomic64_add_return(npage, &mm->pinned_vm);
> >  
> > -				if (mm->locked_vm + npage > limit)
> > -					ret = -ENOMEM;
> > -			}
> > +	if (npage > 0 && !dma->lock_cap) {
> > +		unsigned long limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >>
> > +
> > -					PAGE_SHIFT;
> 
> I haven't looked at this super closely, but how does this stuff work?
> 
> do_mlock doesn't touch pinned_vm, and this doesn't touch locked_vm...
> 
> Shouldn't all this be 'if (locked_vm + pinned_vm < RLIMIT_MEMLOCK)' ?
>
> Otherwise MEMLOCK is really doubled..

So this has been a problem for some time, but it's not as easy as adding them
together, see [1][2] for a start.

The locked_vm/pinned_vm issue definitely needs fixing, but all this series is
trying to do is account to the right counter.

Daniel

[1] http://lkml.kernel.org/r/20130523104154.GA23650@twins.programming.kicks-ass.net
[2] http://lkml.kernel.org/r/20130524140114.GK23650@twins.programming.kicks-ass.net
Alex Williamson Feb. 12, 2019, 6:41 p.m. | #3
On Mon, 11 Feb 2019 18:11:53 -0500
Daniel Jordan <daniel.m.jordan@oracle.com> wrote:

> On Mon, Feb 11, 2019 at 03:56:20PM -0700, Jason Gunthorpe wrote:
> > On Mon, Feb 11, 2019 at 05:44:33PM -0500, Daniel Jordan wrote:  
> > > @@ -266,24 +267,15 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
> > >  	if (!mm)
> > >  		return -ESRCH; /* process exited */
> > >  
> > > -	ret = down_write_killable(&mm->mmap_sem);
> > > -	if (!ret) {
> > > -		if (npage > 0) {
> > > -			if (!dma->lock_cap) {
> > > -				unsigned long limit;
> > > -
> > > -				limit = task_rlimit(dma->task,
> > > -						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > +	pinned_vm = atomic64_add_return(npage, &mm->pinned_vm);
> > >  
> > > -				if (mm->locked_vm + npage > limit)
> > > -					ret = -ENOMEM;
> > > -			}
> > > +	if (npage > 0 && !dma->lock_cap) {
> > > +		unsigned long limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >>
> > > +
> > > -					PAGE_SHIFT;  
> > 
> > I haven't looked at this super closely, but how does this stuff work?
> > 
> > do_mlock doesn't touch pinned_vm, and this doesn't touch locked_vm...
> > 
> > Shouldn't all this be 'if (locked_vm + pinned_vm < RLIMIT_MEMLOCK)' ?
> >
> > Otherwise MEMLOCK is really doubled..  
> 
> So this has been a problem for some time, but it's not as easy as adding them
> together, see [1][2] for a start.
> 
> The locked_vm/pinned_vm issue definitely needs fixing, but all this series is
> trying to do is account to the right counter.

This still makes me nervous because we have userspace dependencies on
setting process locked memory.  There's a user visible difference if we
account for them in the same bucket vs separate.  Perhaps we're
counting in the wrong bucket now, but if we "fix" that and userspace
adapts, how do we ever go back to accounting both mlocked and pinned
memory combined against rlimit?  Thanks,

Alex
Daniel Jordan Feb. 13, 2019, 12:26 a.m. | #4
On Tue, Feb 12, 2019 at 11:41:10AM -0700, Alex Williamson wrote:
> Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
> > On Mon, Feb 11, 2019 at 03:56:20PM -0700, Jason Gunthorpe wrote:
> > > I haven't looked at this super closely, but how does this stuff work?
> > > 
> > > do_mlock doesn't touch pinned_vm, and this doesn't touch locked_vm...
> > > 
> > > Shouldn't all this be 'if (locked_vm + pinned_vm < RLIMIT_MEMLOCK)' ?
> > >
> > > Otherwise MEMLOCK is really doubled..  
> > 
> > So this has been a problem for some time, but it's not as easy as adding them
> > together, see [1][2] for a start.
> > 
> > The locked_vm/pinned_vm issue definitely needs fixing, but all this series is
> > trying to do is account to the right counter.

Thanks for taking a look, Alex.

> This still makes me nervous because we have userspace dependencies on
> setting process locked memory.

Could you please expand on this?  Trying to get more context.

> There's a user visible difference if we
> account for them in the same bucket vs separate.  Perhaps we're
> counting in the wrong bucket now, but if we "fix" that and userspace
> adapts, how do we ever go back to accounting both mlocked and pinned
> memory combined against rlimit?  Thanks,

PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
locked_vm accounting separate, but allowed the two to be added safely to be
compared against RLIMIT_MEMLOCK.

Anyway, until some solution is agreed on, are there objections to converting
locked_vm to an atomic, to avoid user-visible changes, instead of switching
locked_vm users to pinned_vm?

Daniel

[1] http://lkml.kernel.org/r/20130524140114.GK23650@twins.programming.kicks-ass.net
Alex Williamson Feb. 13, 2019, 8:03 p.m. | #5
On Tue, 12 Feb 2019 19:26:50 -0500
Daniel Jordan <daniel.m.jordan@oracle.com> wrote:

> On Tue, Feb 12, 2019 at 11:41:10AM -0700, Alex Williamson wrote:
> > Daniel Jordan <daniel.m.jordan@oracle.com> wrote:  
> > > On Mon, Feb 11, 2019 at 03:56:20PM -0700, Jason Gunthorpe wrote:  
> > > > I haven't looked at this super closely, but how does this stuff work?
> > > > 
> > > > do_mlock doesn't touch pinned_vm, and this doesn't touch locked_vm...
> > > > 
> > > > Shouldn't all this be 'if (locked_vm + pinned_vm < RLIMIT_MEMLOCK)' ?
> > > >
> > > > Otherwise MEMLOCK is really doubled..    
> > > 
> > > So this has been a problem for some time, but it's not as easy as adding them
> > > together, see [1][2] for a start.
> > > 
> > > The locked_vm/pinned_vm issue definitely needs fixing, but all this series is
> > > trying to do is account to the right counter.  
> 
> Thanks for taking a look, Alex.
> 
> > This still makes me nervous because we have userspace dependencies on
> > setting process locked memory.  
> 
> Could you please expand on this?  Trying to get more context.

VFIO is a userspace driver interface and the pinned/locked page
accounting we're doing here is trying to prevent a user from exceeding
their locked memory limits.  Thus a VM management tool or unprivileged
userspace driver needs to have appropriate locked memory limits
configured for their use case.  Currently we do not have a unified
accounting scheme, so if a page is mlock'd by the user and also mapped
through VFIO for DMA, it's accounted twice, these both increment
locked_vm and userspace needs to manage that.  If pinned memory
and locked memory are now two separate buckets and we're only comparing
one of them against the locked memory limit, then it seems we have
effectively doubled the user's locked memory for this use case, as
Jason questioned.  The user could mlock one page and DMA map another,
they're both "locked", but now they only take one slot in each bucket.

If we continue forward with using a separate bucket here, userspace
could infer that accounting is unified and lower the user's locked
memory limit, or exploit the gap that their effective limit might
actually exceed system memory.  In the former case, if we do eventually
correct to compare the total of the combined buckets against the user's
locked memory limits, we'll break users that have adapted their locked
memory limits to meet the apparent needs.  In the latter case, the
inconsistent accounting is potentially an attack vector.

> > There's a user visible difference if we
> > account for them in the same bucket vs separate.  Perhaps we're
> > counting in the wrong bucket now, but if we "fix" that and userspace
> > adapts, how do we ever go back to accounting both mlocked and pinned
> > memory combined against rlimit?  Thanks,  
> 
> PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
> locked_vm accounting separate, but allowed the two to be added safely to be
> compared against RLIMIT_MEMLOCK.

Unless I'm incorrect in the concerns above, I don't see how we can
convert vfio before this occurs.
 
> Anyway, until some solution is agreed on, are there objections to converting
> locked_vm to an atomic, to avoid user-visible changes, instead of switching
> locked_vm users to pinned_vm?

Seems that as long as we have separate buckets that are compared
individually to rlimit that we've got problems, it's just a matter of
where they're exposed based on which bucket is used for which
interface.  Thanks,

Alex
Jason Gunthorpe Feb. 13, 2019, 11:07 p.m. | #6
On Wed, Feb 13, 2019 at 01:03:30PM -0700, Alex Williamson wrote:
> > PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
> > locked_vm accounting separate, but allowed the two to be added safely to be
> > compared against RLIMIT_MEMLOCK.
> 
> Unless I'm incorrect in the concerns above, I don't see how we can
> convert vfio before this occurs.

RDMA was converted to this pinned_vm scheme a long time ago, arguably
it is a mistake that VFIO did something different... This was to fix
some other bug where reporting of pages was wrong.

You are not wrong that this approach doesn't entirely make sense
though. :)

Jason
Daniel Jordan Feb. 14, 2019, 1:46 a.m. | #7
On Wed, Feb 13, 2019 at 01:03:30PM -0700, Alex Williamson wrote:
> Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
> > On Tue, Feb 12, 2019 at 11:41:10AM -0700, Alex Williamson wrote:
> > > This still makes me nervous because we have userspace dependencies on
> > > setting process locked memory.  
> > 
> > Could you please expand on this?  Trying to get more context.
> 
> VFIO is a userspace driver interface and the pinned/locked page
> accounting we're doing here is trying to prevent a user from exceeding
> their locked memory limits.  Thus a VM management tool or unprivileged
> userspace driver needs to have appropriate locked memory limits
> configured for their use case.  Currently we do not have a unified
> accounting scheme, so if a page is mlock'd by the user and also mapped
> through VFIO for DMA, it's accounted twice, these both increment
> locked_vm and userspace needs to manage that.  If pinned memory
> and locked memory are now two separate buckets and we're only comparing
> one of them against the locked memory limit, then it seems we have
> effectively doubled the user's locked memory for this use case, as
> Jason questioned.  The user could mlock one page and DMA map another,
> they're both "locked", but now they only take one slot in each bucket.

Right, yes.  Should have been more specific.  I was after a concrete use case
where this would happen (sounded like you may have had a specific tool in
mind).

But it doesn't matter.  I understand your concern and agree that, given the
possibility that accounting in _some_ tool can be affected, we should fix
accounting before changing user visible behavior.  I can start a separate
discussion, having opened the can of worms again :)

> If we continue forward with using a separate bucket here, userspace
> could infer that accounting is unified and lower the user's locked
> memory limit, or exploit the gap that their effective limit might
> actually exceed system memory.  In the former case, if we do eventually
> correct to compare the total of the combined buckets against the user's
> locked memory limits, we'll break users that have adapted their locked
> memory limits to meet the apparent needs.  In the latter case, the
> inconsistent accounting is potentially an attack vector.

Makes sense.

> > > There's a user visible difference if we
> > > account for them in the same bucket vs separate.  Perhaps we're
> > > counting in the wrong bucket now, but if we "fix" that and userspace
> > > adapts, how do we ever go back to accounting both mlocked and pinned
> > > memory combined against rlimit?  Thanks,  
> > 
> > PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
> > locked_vm accounting separate, but allowed the two to be added safely to be
> > compared against RLIMIT_MEMLOCK.
> 
> Unless I'm incorrect in the concerns above, I don't see how we can
> convert vfio before this occurs.
>  
> > Anyway, until some solution is agreed on, are there objections to converting
> > locked_vm to an atomic, to avoid user-visible changes, instead of switching
> > locked_vm users to pinned_vm?
> 
> Seems that as long as we have separate buckets that are compared
> individually to rlimit that we've got problems, it's just a matter of
> where they're exposed based on which bucket is used for which
> interface.  Thanks,

Indeed.  But for now, any concern with simply changing the type of the
currently used counter to an atomic, to reduce mmap_sem usage?  This is just an
implementation detail, invisible to userspace.

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 73652e21efec..a56cc341813f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -257,7 +257,8 @@  static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 {
 	struct mm_struct *mm;
-	int ret;
+	s64 pinned_vm;
+	int ret = 0;
 
 	if (!npage)
 		return 0;
@@ -266,24 +267,15 @@  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 	if (!mm)
 		return -ESRCH; /* process exited */
 
-	ret = down_write_killable(&mm->mmap_sem);
-	if (!ret) {
-		if (npage > 0) {
-			if (!dma->lock_cap) {
-				unsigned long limit;
-
-				limit = task_rlimit(dma->task,
-						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	pinned_vm = atomic64_add_return(npage, &mm->pinned_vm);
 
-				if (mm->locked_vm + npage > limit)
-					ret = -ENOMEM;
-			}
+	if (npage > 0 && !dma->lock_cap) {
+		unsigned long limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >>
+								   PAGE_SHIFT;
+		if (pinned_vm > limit) {
+			atomic64_sub(npage, &mm->pinned_vm);
+			ret = -ENOMEM;
 		}
-
-		if (!ret)
-			mm->locked_vm += npage;
-
-		up_write(&mm->mmap_sem);
 	}
 
 	if (async)
@@ -401,6 +393,7 @@  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	long ret, pinned = 0, lock_acct = 0;
 	bool rsvd;
 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
+	atomic64_t *pinned_vm = &current->mm->pinned_vm;
 
 	/* This code path is only user initiated */
 	if (!current->mm)
@@ -418,7 +411,7 @@  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 	 * pages are already counted against the user.
 	 */
 	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
+		if (!dma->lock_cap && atomic64_read(pinned_vm) + 1 > limit) {
 			put_pfn(*pfn_base, dma->prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 					limit << PAGE_SHIFT);
@@ -445,7 +438,7 @@  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 
 		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
 			if (!dma->lock_cap &&
-			    current->mm->locked_vm + lock_acct + 1 > limit) {
+			    atomic64_read(pinned_vm) + lock_acct + 1 > limit) {
 				put_pfn(pfn, dma->prot);
 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 					__func__, limit << PAGE_SHIFT);