diff mbox series

ocxl: Fix concurrent AFU open and device removal

Message ID 20190624144148.32022-1-fbarrat@linux.ibm.com (mailing list archive)
State Accepted
Commit a58d37bce0d21cf7fbd589384c619e465ef2f927
Headers show
Series ocxl: Fix concurrent AFU open and device removal | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch next (e610a466d16a086e321f0bd421e2fc75cff28605)
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch success total: 0 errors, 0 warnings, 0 checks, 56 lines checked

Commit Message

Frederic Barrat June 24, 2019, 2:41 p.m. UTC
If an ocxl device is unbound through sysfs at the same time its AFU is
being opened by a user process, the open code may dereference freed
stuctures, which can lead to kernel oops messages. You'd have to hit a
tiny time window, but it's possible. It's fairly easy to test by
making the time window bigger artificially.

Fix it with a combination of 2 changes:
- when an AFU device is found in the IDR by looking for the device
minor number, we should hold a reference on the device until after the
context is allocated. A reference on the AFU structure is kept when
the context is allocated, so we can release the reference on the
device after the context allocation.
- with the fix above, there's still another even tinier window,
between the time the AFU device is found in the IDR and the reference
on the device is taken. We can fix this one by removing the IDR entry
earlier, when the device setup is removed, instead of waiting for the
'release' device callback. With proper locking around the IDR.

Fixes: 75ca758adbaf ("ocxl: Create a clear delineation between ocxl backend & frontend")
Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
---
mpe: this fixes a commit merged in v5.2-rc1. It's late, and I don't think it's that important. If it's for the next merge window, I would add:
Cc: stable@vger.kernel.org      # v5.2


drivers/misc/ocxl/file.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

Comments

Greg Kurz June 24, 2019, 3:24 p.m. UTC | #1
On Mon, 24 Jun 2019 16:41:48 +0200
Frederic Barrat <fbarrat@linux.ibm.com> wrote:

> If an ocxl device is unbound through sysfs at the same time its AFU is
> being opened by a user process, the open code may dereference freed
> stuctures, which can lead to kernel oops messages. You'd have to hit a
> tiny time window, but it's possible. It's fairly easy to test by
> making the time window bigger artificially.
> 
> Fix it with a combination of 2 changes:
> - when an AFU device is found in the IDR by looking for the device
> minor number, we should hold a reference on the device until after the
> context is allocated. A reference on the AFU structure is kept when
> the context is allocated, so we can release the reference on the
> device after the context allocation.
> - with the fix above, there's still another even tinier window,
> between the time the AFU device is found in the IDR and the reference
> on the device is taken. We can fix this one by removing the IDR entry
> earlier, when the device setup is removed, instead of waiting for the
> 'release' device callback. With proper locking around the IDR.
> 
> Fixes: 75ca758adbaf ("ocxl: Create a clear delineation between ocxl backend & frontend")
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
> ---
> mpe: this fixes a commit merged in v5.2-rc1. It's late, and I don't think it's that important. If it's for the next merge window, I would add:
> Cc: stable@vger.kernel.org      # v5.2
> 
> 
> drivers/misc/ocxl/file.c | 23 +++++++++++------------
>  1 file changed, 11 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
> index 2870c25da166..4d1b44de1492 100644
> --- a/drivers/misc/ocxl/file.c
> +++ b/drivers/misc/ocxl/file.c
> @@ -18,18 +18,15 @@ static struct class *ocxl_class;
>  static struct mutex minors_idr_lock;
>  static struct idr minors_idr;
>  
> -static struct ocxl_file_info *find_file_info(dev_t devno)
> +static struct ocxl_file_info *find_and_get_file_info(dev_t devno)
>  {
>  	struct ocxl_file_info *info;
>  
> -	/*
> -	 * We don't declare an RCU critical section here, as our AFU
> -	 * is protected by a reference counter on the device. By the time the
> -	 * info reference is removed from the idr, the ref count of
> -	 * the device is already at 0, so no user API will access that AFU and
> -	 * this function can't return it.
> -	 */
> +	mutex_lock(&minors_idr_lock);
>  	info = idr_find(&minors_idr, MINOR(devno));
> +	if (info)
> +		get_device(&info->dev);
> +	mutex_unlock(&minors_idr_lock);
>  	return info;
>  }
>  
> @@ -58,14 +55,16 @@ static int afu_open(struct inode *inode, struct file *file)
>  
>  	pr_debug("%s for device %x\n", __func__, inode->i_rdev);
>  
> -	info = find_file_info(inode->i_rdev);
> +	info = find_and_get_file_info(inode->i_rdev);
>  	if (!info)
>  		return -ENODEV;
>  
>  	rc = ocxl_context_alloc(&ctx, info->afu, inode->i_mapping);
> -	if (rc)
> +	if (rc) {
> +		put_device(&info->dev);

You could have a single call site for put_device() since it's
needed for both branches. No big deal.

>  		return rc;
> -
> +	}
> +	put_device(&info->dev);
>  	file->private_data = ctx;
>  	return 0;
>  }
> @@ -487,7 +486,6 @@ static void info_release(struct device *dev)
>  {
>  	struct ocxl_file_info *info = container_of(dev, struct ocxl_file_info, dev);
>  
> -	free_minor(info);
>  	ocxl_afu_put(info->afu);
>  	kfree(info);
>  }
> @@ -577,6 +575,7 @@ void ocxl_file_unregister_afu(struct ocxl_afu *afu)
>  
>  	ocxl_file_make_invisible(info);
>  	ocxl_sysfs_unregister_afu(info);
> +	free_minor(info);

Since the IDR entry is added by ocxl_file_register_afu(), it seems to make
sense to undo that in ocxl_file_unregister_afu(). Out of curiosity, was there
any historical reason to do this in info_release() in the first place ?

Reviewed-by: Greg Kurz <groug@kaod.org>

>  	device_unregister(&info->dev);
>  }
>
Frederic Barrat June 24, 2019, 3:39 p.m. UTC | #2
Le 24/06/2019 à 17:24, Greg Kurz a écrit :
> On Mon, 24 Jun 2019 16:41:48 +0200
> Frederic Barrat <fbarrat@linux.ibm.com> wrote:
> 
>> If an ocxl device is unbound through sysfs at the same time its AFU is
>> being opened by a user process, the open code may dereference freed
>> stuctures, which can lead to kernel oops messages. You'd have to hit a
>> tiny time window, but it's possible. It's fairly easy to test by
>> making the time window bigger artificially.
>>
>> Fix it with a combination of 2 changes:
>> - when an AFU device is found in the IDR by looking for the device
>> minor number, we should hold a reference on the device until after the
>> context is allocated. A reference on the AFU structure is kept when
>> the context is allocated, so we can release the reference on the
>> device after the context allocation.
>> - with the fix above, there's still another even tinier window,
>> between the time the AFU device is found in the IDR and the reference
>> on the device is taken. We can fix this one by removing the IDR entry
>> earlier, when the device setup is removed, instead of waiting for the
>> 'release' device callback. With proper locking around the IDR.
>>
>> Fixes: 75ca758adbaf ("ocxl: Create a clear delineation between ocxl backend & frontend")
>> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
>> ---
>> mpe: this fixes a commit merged in v5.2-rc1. It's late, and I don't think it's that important. If it's for the next merge window, I would add:
>> Cc: stable@vger.kernel.org      # v5.2
>>
>>
>> drivers/misc/ocxl/file.c | 23 +++++++++++------------
>>   1 file changed, 11 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
>> index 2870c25da166..4d1b44de1492 100644
>> --- a/drivers/misc/ocxl/file.c
>> +++ b/drivers/misc/ocxl/file.c
>> @@ -18,18 +18,15 @@ static struct class *ocxl_class;
>>   static struct mutex minors_idr_lock;
>>   static struct idr minors_idr;
>>   
>> -static struct ocxl_file_info *find_file_info(dev_t devno)
>> +static struct ocxl_file_info *find_and_get_file_info(dev_t devno)
>>   {
>>   	struct ocxl_file_info *info;
>>   
>> -	/*
>> -	 * We don't declare an RCU critical section here, as our AFU
>> -	 * is protected by a reference counter on the device. By the time the
>> -	 * info reference is removed from the idr, the ref count of
>> -	 * the device is already at 0, so no user API will access that AFU and
>> -	 * this function can't return it.
>> -	 */
>> +	mutex_lock(&minors_idr_lock);
>>   	info = idr_find(&minors_idr, MINOR(devno));
>> +	if (info)
>> +		get_device(&info->dev);
>> +	mutex_unlock(&minors_idr_lock);
>>   	return info;
>>   }
>>   
>> @@ -58,14 +55,16 @@ static int afu_open(struct inode *inode, struct file *file)
>>   
>>   	pr_debug("%s for device %x\n", __func__, inode->i_rdev);
>>   
>> -	info = find_file_info(inode->i_rdev);
>> +	info = find_and_get_file_info(inode->i_rdev);
>>   	if (!info)
>>   		return -ENODEV;
>>   
>>   	rc = ocxl_context_alloc(&ctx, info->afu, inode->i_mapping);
>> -	if (rc)
>> +	if (rc) {
>> +		put_device(&info->dev);
> 
> You could have a single call site for put_device() since it's
> needed for both branches. No big deal.


Agreed. Will fix if I end up respinning, but won't if it's the only 
complaint :-)



>>   		return rc;
>> -
>> +	}
>> +	put_device(&info->dev);
>>   	file->private_data = ctx;
>>   	return 0;
>>   }
>> @@ -487,7 +486,6 @@ static void info_release(struct device *dev)
>>   {
>>   	struct ocxl_file_info *info = container_of(dev, struct ocxl_file_info, dev);
>>   
>> -	free_minor(info);
>>   	ocxl_afu_put(info->afu);
>>   	kfree(info);
>>   }
>> @@ -577,6 +575,7 @@ void ocxl_file_unregister_afu(struct ocxl_afu *afu)
>>   
>>   	ocxl_file_make_invisible(info);
>>   	ocxl_sysfs_unregister_afu(info);
>> +	free_minor(info);
> 
> Since the IDR entry is added by ocxl_file_register_afu(), it seems to make
> sense to undo that in ocxl_file_unregister_afu(). Out of curiosity, was there
> any historical reason to do this in info_release() in the first place ?


Yeah, it makes a lot of sense to remove the IDR entry in 
ocxl_file_unregister_afu(), that's where we undo the device. I wish I 
had noticed during the code reviews.
I don't think there was any good reason to have it in info_release() in 
the first place. I remember the code went through many iterations to get 
the reference counting on the AFU structure and device done correctly, 
but we let that one slip.

I now think the pre-5.2 ocxl code was also exposed to the 2nd window 
mentioned in the commit log (but the first window is new with the 
refactoring introduced in 5.2-rc1).

   Fred



> 
> Reviewed-by: Greg Kurz <groug@kaod.org>
> 
>>   	device_unregister(&info->dev);
>>   }
>>   
>
Greg Kurz June 24, 2019, 3:50 p.m. UTC | #3
On Mon, 24 Jun 2019 17:39:26 +0200
Frederic Barrat <fbarrat@linux.ibm.com> wrote:

> Le 24/06/2019 à 17:24, Greg Kurz a écrit :
> > On Mon, 24 Jun 2019 16:41:48 +0200
> > Frederic Barrat <fbarrat@linux.ibm.com> wrote:
> >   
> >> If an ocxl device is unbound through sysfs at the same time its AFU is
> >> being opened by a user process, the open code may dereference freed
> >> stuctures, which can lead to kernel oops messages. You'd have to hit a
> >> tiny time window, but it's possible. It's fairly easy to test by
> >> making the time window bigger artificially.
> >>
> >> Fix it with a combination of 2 changes:
> >> - when an AFU device is found in the IDR by looking for the device
> >> minor number, we should hold a reference on the device until after the
> >> context is allocated. A reference on the AFU structure is kept when
> >> the context is allocated, so we can release the reference on the
> >> device after the context allocation.
> >> - with the fix above, there's still another even tinier window,
> >> between the time the AFU device is found in the IDR and the reference
> >> on the device is taken. We can fix this one by removing the IDR entry
> >> earlier, when the device setup is removed, instead of waiting for the
> >> 'release' device callback. With proper locking around the IDR.
> >>
> >> Fixes: 75ca758adbaf ("ocxl: Create a clear delineation between ocxl backend & frontend")
> >> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
> >> ---
> >> mpe: this fixes a commit merged in v5.2-rc1. It's late, and I don't think it's that important. If it's for the next merge window, I would add:
> >> Cc: stable@vger.kernel.org      # v5.2
> >>
> >>
> >> drivers/misc/ocxl/file.c | 23 +++++++++++------------
> >>   1 file changed, 11 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
> >> index 2870c25da166..4d1b44de1492 100644
> >> --- a/drivers/misc/ocxl/file.c
> >> +++ b/drivers/misc/ocxl/file.c
> >> @@ -18,18 +18,15 @@ static struct class *ocxl_class;
> >>   static struct mutex minors_idr_lock;
> >>   static struct idr minors_idr;
> >>   
> >> -static struct ocxl_file_info *find_file_info(dev_t devno)
> >> +static struct ocxl_file_info *find_and_get_file_info(dev_t devno)
> >>   {
> >>   	struct ocxl_file_info *info;
> >>   
> >> -	/*
> >> -	 * We don't declare an RCU critical section here, as our AFU
> >> -	 * is protected by a reference counter on the device. By the time the
> >> -	 * info reference is removed from the idr, the ref count of
> >> -	 * the device is already at 0, so no user API will access that AFU and
> >> -	 * this function can't return it.
> >> -	 */
> >> +	mutex_lock(&minors_idr_lock);
> >>   	info = idr_find(&minors_idr, MINOR(devno));
> >> +	if (info)
> >> +		get_device(&info->dev);
> >> +	mutex_unlock(&minors_idr_lock);
> >>   	return info;
> >>   }
> >>   
> >> @@ -58,14 +55,16 @@ static int afu_open(struct inode *inode, struct file *file)
> >>   
> >>   	pr_debug("%s for device %x\n", __func__, inode->i_rdev);
> >>   
> >> -	info = find_file_info(inode->i_rdev);
> >> +	info = find_and_get_file_info(inode->i_rdev);
> >>   	if (!info)
> >>   		return -ENODEV;
> >>   
> >>   	rc = ocxl_context_alloc(&ctx, info->afu, inode->i_mapping);
> >> -	if (rc)
> >> +	if (rc) {
> >> +		put_device(&info->dev);  
> > 
> > You could have a single call site for put_device() since it's
> > needed for both branches. No big deal.  
> 
> 
> Agreed. Will fix if I end up respinning, but won't if it's the only 
> complaint :-)
> 
> 
> 
> >>   		return rc;
> >> -
> >> +	}
> >> +	put_device(&info->dev);
> >>   	file->private_data = ctx;
> >>   	return 0;
> >>   }
> >> @@ -487,7 +486,6 @@ static void info_release(struct device *dev)
> >>   {
> >>   	struct ocxl_file_info *info = container_of(dev, struct ocxl_file_info, dev);
> >>   
> >> -	free_minor(info);
> >>   	ocxl_afu_put(info->afu);
> >>   	kfree(info);
> >>   }
> >> @@ -577,6 +575,7 @@ void ocxl_file_unregister_afu(struct ocxl_afu *afu)
> >>   
> >>   	ocxl_file_make_invisible(info);
> >>   	ocxl_sysfs_unregister_afu(info);
> >> +	free_minor(info);  
> > 
> > Since the IDR entry is added by ocxl_file_register_afu(), it seems to make
> > sense to undo that in ocxl_file_unregister_afu(). Out of curiosity, was there
> > any historical reason to do this in info_release() in the first place ?  
> 
> 
> Yeah, it makes a lot of sense to remove the IDR entry in 
> ocxl_file_unregister_afu(), that's where we undo the device. I wish I 
> had noticed during the code reviews.
> I don't think there was any good reason to have it in info_release() in 
> the first place. I remember the code went through many iterations to get 
> the reference counting on the AFU structure and device done correctly, 
> but we let that one slip.
> 
> I now think the pre-5.2 ocxl code was also exposed to the 2nd window 
> mentioned in the commit log (but the first window is new with the 
> refactoring introduced in 5.2-rc1).
> 

This calls for two separate patches then IMHO.

>    Fred
> 
> 
> 
> > 
> > Reviewed-by: Greg Kurz <groug@kaod.org>
> >   
> >>   	device_unregister(&info->dev);
> >>   }
> >>     
> >   
>
Frederic Barrat June 25, 2019, 8:22 a.m. UTC | #4
Le 24/06/2019 à 17:50, Greg Kurz a écrit :
> On Mon, 24 Jun 2019 17:39:26 +0200
> Frederic Barrat <fbarrat@linux.ibm.com> wrote:
> 
>> Le 24/06/2019 à 17:24, Greg Kurz a écrit :
>>> On Mon, 24 Jun 2019 16:41:48 +0200
>>> Frederic Barrat <fbarrat@linux.ibm.com> wrote:
>>>    
>>>> If an ocxl device is unbound through sysfs at the same time its AFU is
>>>> being opened by a user process, the open code may dereference freed
>>>> stuctures, which can lead to kernel oops messages. You'd have to hit a
>>>> tiny time window, but it's possible. It's fairly easy to test by
>>>> making the time window bigger artificially.
>>>>
>>>> Fix it with a combination of 2 changes:
>>>> - when an AFU device is found in the IDR by looking for the device
>>>> minor number, we should hold a reference on the device until after the
>>>> context is allocated. A reference on the AFU structure is kept when
>>>> the context is allocated, so we can release the reference on the
>>>> device after the context allocation.
>>>> - with the fix above, there's still another even tinier window,
>>>> between the time the AFU device is found in the IDR and the reference
>>>> on the device is taken. We can fix this one by removing the IDR entry
>>>> earlier, when the device setup is removed, instead of waiting for the
>>>> 'release' device callback. With proper locking around the IDR.
>>>>
>>>> Fixes: 75ca758adbaf ("ocxl: Create a clear delineation between ocxl backend & frontend")
>>>> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
>>>> ---
>>>> mpe: this fixes a commit merged in v5.2-rc1. It's late, and I don't think it's that important. If it's for the next merge window, I would add:
>>>> Cc: stable@vger.kernel.org      # v5.2
>>>>
>>>>
>>>> drivers/misc/ocxl/file.c | 23 +++++++++++------------
>>>>    1 file changed, 11 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
>>>> index 2870c25da166..4d1b44de1492 100644
>>>> --- a/drivers/misc/ocxl/file.c
>>>> +++ b/drivers/misc/ocxl/file.c
>>>> @@ -18,18 +18,15 @@ static struct class *ocxl_class;
>>>>    static struct mutex minors_idr_lock;
>>>>    static struct idr minors_idr;
>>>>    
>>>> -static struct ocxl_file_info *find_file_info(dev_t devno)
>>>> +static struct ocxl_file_info *find_and_get_file_info(dev_t devno)
>>>>    {
>>>>    	struct ocxl_file_info *info;
>>>>    
>>>> -	/*
>>>> -	 * We don't declare an RCU critical section here, as our AFU
>>>> -	 * is protected by a reference counter on the device. By the time the
>>>> -	 * info reference is removed from the idr, the ref count of
>>>> -	 * the device is already at 0, so no user API will access that AFU and
>>>> -	 * this function can't return it.
>>>> -	 */
>>>> +	mutex_lock(&minors_idr_lock);
>>>>    	info = idr_find(&minors_idr, MINOR(devno));
>>>> +	if (info)
>>>> +		get_device(&info->dev);
>>>> +	mutex_unlock(&minors_idr_lock);
>>>>    	return info;
>>>>    }
>>>>    
>>>> @@ -58,14 +55,16 @@ static int afu_open(struct inode *inode, struct file *file)
>>>>    
>>>>    	pr_debug("%s for device %x\n", __func__, inode->i_rdev);
>>>>    
>>>> -	info = find_file_info(inode->i_rdev);
>>>> +	info = find_and_get_file_info(inode->i_rdev);
>>>>    	if (!info)
>>>>    		return -ENODEV;
>>>>    
>>>>    	rc = ocxl_context_alloc(&ctx, info->afu, inode->i_mapping);
>>>> -	if (rc)
>>>> +	if (rc) {
>>>> +		put_device(&info->dev);
>>>
>>> You could have a single call site for put_device() since it's
>>> needed for both branches. No big deal.
>>
>>
>> Agreed. Will fix if I end up respinning, but won't if it's the only
>> complaint :-)
>>
>>
>>
>>>>    		return rc;
>>>> -
>>>> +	}
>>>> +	put_device(&info->dev);
>>>>    	file->private_data = ctx;
>>>>    	return 0;
>>>>    }
>>>> @@ -487,7 +486,6 @@ static void info_release(struct device *dev)
>>>>    {
>>>>    	struct ocxl_file_info *info = container_of(dev, struct ocxl_file_info, dev);
>>>>    
>>>> -	free_minor(info);
>>>>    	ocxl_afu_put(info->afu);
>>>>    	kfree(info);
>>>>    }
>>>> @@ -577,6 +575,7 @@ void ocxl_file_unregister_afu(struct ocxl_afu *afu)
>>>>    
>>>>    	ocxl_file_make_invisible(info);
>>>>    	ocxl_sysfs_unregister_afu(info);
>>>> +	free_minor(info);
>>>
>>> Since the IDR entry is added by ocxl_file_register_afu(), it seems to make
>>> sense to undo that in ocxl_file_unregister_afu(). Out of curiosity, was there
>>> any historical reason to do this in info_release() in the first place ?
>>
>>
>> Yeah, it makes a lot of sense to remove the IDR entry in
>> ocxl_file_unregister_afu(), that's where we undo the device. I wish I
>> had noticed during the code reviews.
>> I don't think there was any good reason to have it in info_release() in
>> the first place. I remember the code went through many iterations to get
>> the reference counting on the AFU structure and device done correctly,
>> but we let that one slip.
>>
>> I now think the pre-5.2 ocxl code was also exposed to the 2nd window
>> mentioned in the commit log (but the first window is new with the
>> refactoring introduced in 5.2-rc1).
>>
> 
> This calls for two separate patches then IMHO.

Well, splitting this patch in 2 wouldn't help, as the pre-5.2 code was 
different enough that it wouldn't apply.
I could send a different patch covering just the 2nd window to stable 
and backport to distros. But considering the likelyhood of hitting the 
problem, it's going to be low on my list.

   Fred



> 
>>     Fred
>>
>>
>>
>>>
>>> Reviewed-by: Greg Kurz <groug@kaod.org>
>>>    
>>>>    	device_unregister(&info->dev);
>>>>    }
>>>>      
>>>    
>>
>
Michael Ellerman Dec. 13, 2019, 9:19 p.m. UTC | #5
On Mon, 2019-06-24 at 14:41:48 UTC, Frederic Barrat wrote:
> If an ocxl device is unbound through sysfs at the same time its AFU is
> being opened by a user process, the open code may dereference freed
> stuctures, which can lead to kernel oops messages. You'd have to hit a
> tiny time window, but it's possible. It's fairly easy to test by
> making the time window bigger artificially.
> 
> Fix it with a combination of 2 changes:
> - when an AFU device is found in the IDR by looking for the device
> minor number, we should hold a reference on the device until after the
> context is allocated. A reference on the AFU structure is kept when
> the context is allocated, so we can release the reference on the
> device after the context allocation.
> - with the fix above, there's still another even tinier window,
> between the time the AFU device is found in the IDR and the reference
> on the device is taken. We can fix this one by removing the IDR entry
> earlier, when the device setup is removed, instead of waiting for the
> 'release' device callback. With proper locking around the IDR.
> 
> Fixes: 75ca758adbaf ("ocxl: Create a clear delineation between ocxl backend & frontend")
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/a58d37bce0d21cf7fbd589384c619e465ef2f927

cheers
diff mbox series

Patch

diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
index 2870c25da166..4d1b44de1492 100644
--- a/drivers/misc/ocxl/file.c
+++ b/drivers/misc/ocxl/file.c
@@ -18,18 +18,15 @@  static struct class *ocxl_class;
 static struct mutex minors_idr_lock;
 static struct idr minors_idr;
 
-static struct ocxl_file_info *find_file_info(dev_t devno)
+static struct ocxl_file_info *find_and_get_file_info(dev_t devno)
 {
 	struct ocxl_file_info *info;
 
-	/*
-	 * We don't declare an RCU critical section here, as our AFU
-	 * is protected by a reference counter on the device. By the time the
-	 * info reference is removed from the idr, the ref count of
-	 * the device is already at 0, so no user API will access that AFU and
-	 * this function can't return it.
-	 */
+	mutex_lock(&minors_idr_lock);
 	info = idr_find(&minors_idr, MINOR(devno));
+	if (info)
+		get_device(&info->dev);
+	mutex_unlock(&minors_idr_lock);
 	return info;
 }
 
@@ -58,14 +55,16 @@  static int afu_open(struct inode *inode, struct file *file)
 
 	pr_debug("%s for device %x\n", __func__, inode->i_rdev);
 
-	info = find_file_info(inode->i_rdev);
+	info = find_and_get_file_info(inode->i_rdev);
 	if (!info)
 		return -ENODEV;
 
 	rc = ocxl_context_alloc(&ctx, info->afu, inode->i_mapping);
-	if (rc)
+	if (rc) {
+		put_device(&info->dev);
 		return rc;
-
+	}
+	put_device(&info->dev);
 	file->private_data = ctx;
 	return 0;
 }
@@ -487,7 +486,6 @@  static void info_release(struct device *dev)
 {
 	struct ocxl_file_info *info = container_of(dev, struct ocxl_file_info, dev);
 
-	free_minor(info);
 	ocxl_afu_put(info->afu);
 	kfree(info);
 }
@@ -577,6 +575,7 @@  void ocxl_file_unregister_afu(struct ocxl_afu *afu)
 
 	ocxl_file_make_invisible(info);
 	ocxl_sysfs_unregister_afu(info);
+	free_minor(info);
 	device_unregister(&info->dev);
 }