diff mbox series

[v4,1/6] vfio/pci: Mask INTx during runtime suspend

Message ID 20220701110814.7310-2-abhsahu@nvidia.com
State New
Headers show
Series vfio/pci: power management changes | expand

Commit Message

Abhishek Sahu July 1, 2022, 11:08 a.m. UTC
This patch adds INTx handling during runtime suspend/resume.
All the suspend/resume related code for the user to put the device
into the low power state will be added in subsequent patches.

The INTx are shared among devices. Whenever any INTx interrupt comes
for the VFIO devices, then vfio_intx_handler() will be called for each
device. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx()
and checks if the interrupt has been generated for the current device.
Now, if the device is already in the D3cold state, then the config space
can not be read. Attempt to read config space in D3cold state can
cause system unresponsiveness in a few systems. To prevent this, mask
INTx in runtime suspend callback and unmask the same in runtime resume
callback. If INTx has been already masked, then no handling is needed
in runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and
vfio_pci_intx_mask() has been updated to return true if INTx has been
masked inside this function.

For the runtime suspend which is triggered for the no user of VFIO
device, the is_intx() will return false and these callbacks won't do
anything.

The MSI/MSI-X are not shared so similar handling should not be
needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal()
without doing any device-specific config access. When the user performs
any config access or IOCTL after receiving the eventfd notification,
then the device will be moved to the D0 state first before
servicing any request.

Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
---
 drivers/vfio/pci/vfio_pci_core.c  | 37 +++++++++++++++++++++++++++----
 drivers/vfio/pci/vfio_pci_intrs.c |  6 ++++-
 include/linux/vfio_pci_core.h     |  3 ++-
 3 files changed, 40 insertions(+), 6 deletions(-)

Comments

Alex Williamson July 6, 2022, 3:39 p.m. UTC | #1
On Fri, 1 Jul 2022 16:38:09 +0530
Abhishek Sahu <abhsahu@nvidia.com> wrote:

> This patch adds INTx handling during runtime suspend/resume.
> All the suspend/resume related code for the user to put the device
> into the low power state will be added in subsequent patches.
> 
> The INTx are shared among devices. Whenever any INTx interrupt comes

"The INTx lines may be shared..."

> for the VFIO devices, then vfio_intx_handler() will be called for each
> device. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx()

"...device sharing the interrupt."

> and checks if the interrupt has been generated for the current device.
> Now, if the device is already in the D3cold state, then the config space
> can not be read. Attempt to read config space in D3cold state can
> cause system unresponsiveness in a few systems. To prevent this, mask
> INTx in runtime suspend callback and unmask the same in runtime resume
> callback. If INTx has been already masked, then no handling is needed
> in runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and
> vfio_pci_intx_mask() has been updated to return true if INTx has been
> masked inside this function.
> 
> For the runtime suspend which is triggered for the no user of VFIO
> device, the is_intx() will return false and these callbacks won't do
> anything.
> 
> The MSI/MSI-X are not shared so similar handling should not be
> needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal()
> without doing any device-specific config access. When the user performs
> any config access or IOCTL after receiving the eventfd notification,
> then the device will be moved to the D0 state first before
> servicing any request.
> 
> Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c  | 37 +++++++++++++++++++++++++++----
>  drivers/vfio/pci/vfio_pci_intrs.c |  6 ++++-
>  include/linux/vfio_pci_core.h     |  3 ++-
>  3 files changed, 40 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index a0d69ddaf90d..5948d930449b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -259,16 +259,45 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
>  	return ret;
>  }
>  
> +#ifdef CONFIG_PM
> +static int vfio_pci_core_runtime_suspend(struct device *dev)
> +{
> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
> +
> +	/*
> +	 * If INTx is enabled, then mask INTx before going into the runtime
> +	 * suspended state and unmask the same in the runtime resume.
> +	 * If INTx has already been masked by the user, then
> +	 * vfio_pci_intx_mask() will return false and in that case, INTx
> +	 * should not be unmasked in the runtime resume.
> +	 */
> +	vdev->pm_intx_masked = (is_intx(vdev) && vfio_pci_intx_mask(vdev));
> +
> +	return 0;
> +}
> +
> +static int vfio_pci_core_runtime_resume(struct device *dev)
> +{
> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
> +
> +	if (vdev->pm_intx_masked)
> +		vfio_pci_intx_unmask(vdev);
> +
> +	return 0;
> +}
> +#endif /* CONFIG_PM */
> +
>  /*
> - * The dev_pm_ops needs to be provided to make pci-driver runtime PM working,
> - * so use structure without any callbacks.
> - *
>   * The pci-driver core runtime PM routines always save the device state
>   * before going into suspended state. If the device is going into low power
>   * state with only with runtime PM ops, then no explicit handling is needed
>   * for the devices which have NoSoftRst-.
>   */
> -static const struct dev_pm_ops vfio_pci_core_pm_ops = { };
> +static const struct dev_pm_ops vfio_pci_core_pm_ops = {
> +	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
> +			   vfio_pci_core_runtime_resume,
> +			   NULL)
> +};
>  
>  int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
>  {
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 6069a11fb51a..1a37db99df48 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -33,10 +33,12 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
>  		eventfd_signal(vdev->ctx[0].trigger, 1);
>  }
>  
> -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
> +/* Returns true if INTx has been masked by this function. */
> +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>  {
>  	struct pci_dev *pdev = vdev->pdev;
>  	unsigned long flags;
> +	bool intx_masked = false;
>  
>  	spin_lock_irqsave(&vdev->irqlock, flags);
>  
> @@ -60,9 +62,11 @@ void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>  			disable_irq_nosync(pdev->irq);
>  
>  		vdev->ctx[0].masked = true;
> +		intx_masked = true;
>  	}
>  
>  	spin_unlock_irqrestore(&vdev->irqlock, flags);
> +	return intx_masked;
>  }


There's certainly another path through this function that masks the
interrupt, which makes the definition of this return value a bit
confusing.  Wouldn't it be simpler not to overload the masked flag on
the interrupt context like this and instead set a new flag on the vdev
under irqlock to indicate the device is unable to generate interrupts.
The irq handler would add a test of this flag before any tests that
would access the device.  Thanks,

Alex
 
>  /*
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index 23c176d4b073..cdfd328ba6b1 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -124,6 +124,7 @@ struct vfio_pci_core_device {
>  	bool			needs_reset;
>  	bool			nointx;
>  	bool			needs_pm_restore;
> +	bool			pm_intx_masked;
>  	struct pci_saved_state	*pci_saved_state;
>  	struct pci_saved_state	*pm_save;
>  	int			ioeventfds_nr;
> @@ -147,7 +148,7 @@ struct vfio_pci_core_device {
>  #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
>  #define irq_is(vdev, type) (vdev->irq_type == type)
>  
> -extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
> +extern bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
>  extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
>  
>  extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev,
Abhishek Sahu July 8, 2022, 9:21 a.m. UTC | #2
On 7/6/2022 9:09 PM, Alex Williamson wrote:
> On Fri, 1 Jul 2022 16:38:09 +0530
> Abhishek Sahu <abhsahu@nvidia.com> wrote:
> 
>> This patch adds INTx handling during runtime suspend/resume.
>> All the suspend/resume related code for the user to put the device
>> into the low power state will be added in subsequent patches.
>>
>> The INTx are shared among devices. Whenever any INTx interrupt comes
> 
> "The INTx lines may be shared..."
> 
>> for the VFIO devices, then vfio_intx_handler() will be called for each
>> device. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx()
> 
> "...device sharing the interrupt."
> 
>> and checks if the interrupt has been generated for the current device.
>> Now, if the device is already in the D3cold state, then the config space
>> can not be read. Attempt to read config space in D3cold state can
>> cause system unresponsiveness in a few systems. To prevent this, mask
>> INTx in runtime suspend callback and unmask the same in runtime resume
>> callback. If INTx has been already masked, then no handling is needed
>> in runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and
>> vfio_pci_intx_mask() has been updated to return true if INTx has been
>> masked inside this function.
>>
>> For the runtime suspend which is triggered for the no user of VFIO
>> device, the is_intx() will return false and these callbacks won't do
>> anything.
>>
>> The MSI/MSI-X are not shared so similar handling should not be
>> needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal()
>> without doing any device-specific config access. When the user performs
>> any config access or IOCTL after receiving the eventfd notification,
>> then the device will be moved to the D0 state first before
>> servicing any request.
>>
>> Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
>> ---
>>  drivers/vfio/pci/vfio_pci_core.c  | 37 +++++++++++++++++++++++++++----
>>  drivers/vfio/pci/vfio_pci_intrs.c |  6 ++++-
>>  include/linux/vfio_pci_core.h     |  3 ++-
>>  3 files changed, 40 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>> index a0d69ddaf90d..5948d930449b 100644
>> --- a/drivers/vfio/pci/vfio_pci_core.c
>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>> @@ -259,16 +259,45 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
>>  	return ret;
>>  }
>>  
>> +#ifdef CONFIG_PM
>> +static int vfio_pci_core_runtime_suspend(struct device *dev)
>> +{
>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
>> +
>> +	/*
>> +	 * If INTx is enabled, then mask INTx before going into the runtime
>> +	 * suspended state and unmask the same in the runtime resume.
>> +	 * If INTx has already been masked by the user, then
>> +	 * vfio_pci_intx_mask() will return false and in that case, INTx
>> +	 * should not be unmasked in the runtime resume.
>> +	 */
>> +	vdev->pm_intx_masked = (is_intx(vdev) && vfio_pci_intx_mask(vdev));
>> +
>> +	return 0;
>> +}
>> +
>> +static int vfio_pci_core_runtime_resume(struct device *dev)
>> +{
>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
>> +
>> +	if (vdev->pm_intx_masked)
>> +		vfio_pci_intx_unmask(vdev);
>> +
>> +	return 0;
>> +}
>> +#endif /* CONFIG_PM */
>> +
>>  /*
>> - * The dev_pm_ops needs to be provided to make pci-driver runtime PM working,
>> - * so use structure without any callbacks.
>> - *
>>   * The pci-driver core runtime PM routines always save the device state
>>   * before going into suspended state. If the device is going into low power
>>   * state with only with runtime PM ops, then no explicit handling is needed
>>   * for the devices which have NoSoftRst-.
>>   */
>> -static const struct dev_pm_ops vfio_pci_core_pm_ops = { };
>> +static const struct dev_pm_ops vfio_pci_core_pm_ops = {
>> +	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
>> +			   vfio_pci_core_runtime_resume,
>> +			   NULL)
>> +};
>>  
>>  int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
>>  {
>> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
>> index 6069a11fb51a..1a37db99df48 100644
>> --- a/drivers/vfio/pci/vfio_pci_intrs.c
>> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
>> @@ -33,10 +33,12 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
>>  		eventfd_signal(vdev->ctx[0].trigger, 1);
>>  }
>>  
>> -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>> +/* Returns true if INTx has been masked by this function. */
>> +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>>  {
>>  	struct pci_dev *pdev = vdev->pdev;
>>  	unsigned long flags;
>> +	bool intx_masked = false;
>>  
>>  	spin_lock_irqsave(&vdev->irqlock, flags);
>>  
>> @@ -60,9 +62,11 @@ void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>>  			disable_irq_nosync(pdev->irq);
>>  
>>  		vdev->ctx[0].masked = true;
>> +		intx_masked = true;
>>  	}
>>  
>>  	spin_unlock_irqrestore(&vdev->irqlock, flags);
>> +	return intx_masked;
>>  }
> 
> 
> There's certainly another path through this function that masks the
> interrupt, which makes the definition of this return value a bit
> confusing.

 For our case we should not hit that path. But we can return the
 intx_masked true from that path as well to align return value.

> Wouldn't it be simpler not to overload the masked flag on
> the interrupt context like this and instead set a new flag on the vdev
> under irqlock to indicate the device is unable to generate interrupts.
> The irq handler would add a test of this flag before any tests that
> would access the device.  Thanks,
> 
> Alex
>  

 We will set this flag inside runtime_suspend callback but the
 device can be in non-D3cold state (For example, if user has
 disabled d3cold explicitly by sysfs, the D3cold is not supported in
 the platform, etc.). Also, in D3cold supported case, the device will
 be in D0 till the PCI core moves the device into D3cold. In this case,
 there is possibility that the device can generate an interrupt.
 If we add check in the IRQ handler, then we won’t check and clear
 the IRQ status, but the interrupt line will still be asserted
 which can cause interrupt flooding.

 This was the reason for disabling interrupt itself instead of
 checking flag in the IRQ handler.

 Thanks,
 Abhishek

>>  /*
>> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
>> index 23c176d4b073..cdfd328ba6b1 100644
>> --- a/include/linux/vfio_pci_core.h
>> +++ b/include/linux/vfio_pci_core.h
>> @@ -124,6 +124,7 @@ struct vfio_pci_core_device {
>>  	bool			needs_reset;
>>  	bool			nointx;
>>  	bool			needs_pm_restore;
>> +	bool			pm_intx_masked;
>>  	struct pci_saved_state	*pci_saved_state;
>>  	struct pci_saved_state	*pm_save;
>>  	int			ioeventfds_nr;
>> @@ -147,7 +148,7 @@ struct vfio_pci_core_device {
>>  #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
>>  #define irq_is(vdev, type) (vdev->irq_type == type)
>>  
>> -extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
>> +extern bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
>>  extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
>>  
>>  extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev,
>
Alex Williamson July 8, 2022, 3:45 p.m. UTC | #3
On Fri, 8 Jul 2022 14:51:30 +0530
Abhishek Sahu <abhsahu@nvidia.com> wrote:

> On 7/6/2022 9:09 PM, Alex Williamson wrote:
> > On Fri, 1 Jul 2022 16:38:09 +0530
> > Abhishek Sahu <abhsahu@nvidia.com> wrote:
> >   
> >> This patch adds INTx handling during runtime suspend/resume.
> >> All the suspend/resume related code for the user to put the device
> >> into the low power state will be added in subsequent patches.
> >>
> >> The INTx are shared among devices. Whenever any INTx interrupt comes  
> > 
> > "The INTx lines may be shared..."
> >   
> >> for the VFIO devices, then vfio_intx_handler() will be called for each
> >> device. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx()  
> > 
> > "...device sharing the interrupt."
> >   
> >> and checks if the interrupt has been generated for the current device.
> >> Now, if the device is already in the D3cold state, then the config space
> >> can not be read. Attempt to read config space in D3cold state can
> >> cause system unresponsiveness in a few systems. To prevent this, mask
> >> INTx in runtime suspend callback and unmask the same in runtime resume
> >> callback. If INTx has been already masked, then no handling is needed
> >> in runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and
> >> vfio_pci_intx_mask() has been updated to return true if INTx has been
> >> masked inside this function.
> >>
> >> For the runtime suspend which is triggered for the no user of VFIO
> >> device, the is_intx() will return false and these callbacks won't do
> >> anything.
> >>
> >> The MSI/MSI-X are not shared so similar handling should not be
> >> needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal()
> >> without doing any device-specific config access. When the user performs
> >> any config access or IOCTL after receiving the eventfd notification,
> >> then the device will be moved to the D0 state first before
> >> servicing any request.
> >>
> >> Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
> >> ---
> >>  drivers/vfio/pci/vfio_pci_core.c  | 37 +++++++++++++++++++++++++++----
> >>  drivers/vfio/pci/vfio_pci_intrs.c |  6 ++++-
> >>  include/linux/vfio_pci_core.h     |  3 ++-
> >>  3 files changed, 40 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> >> index a0d69ddaf90d..5948d930449b 100644
> >> --- a/drivers/vfio/pci/vfio_pci_core.c
> >> +++ b/drivers/vfio/pci/vfio_pci_core.c
> >> @@ -259,16 +259,45 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
> >>  	return ret;
> >>  }
> >>  
> >> +#ifdef CONFIG_PM
> >> +static int vfio_pci_core_runtime_suspend(struct device *dev)
> >> +{
> >> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
> >> +
> >> +	/*
> >> +	 * If INTx is enabled, then mask INTx before going into the runtime
> >> +	 * suspended state and unmask the same in the runtime resume.
> >> +	 * If INTx has already been masked by the user, then
> >> +	 * vfio_pci_intx_mask() will return false and in that case, INTx
> >> +	 * should not be unmasked in the runtime resume.
> >> +	 */
> >> +	vdev->pm_intx_masked = (is_intx(vdev) && vfio_pci_intx_mask(vdev));
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int vfio_pci_core_runtime_resume(struct device *dev)
> >> +{
> >> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
> >> +
> >> +	if (vdev->pm_intx_masked)
> >> +		vfio_pci_intx_unmask(vdev);
> >> +
> >> +	return 0;
> >> +}
> >> +#endif /* CONFIG_PM */
> >> +
> >>  /*
> >> - * The dev_pm_ops needs to be provided to make pci-driver runtime PM working,
> >> - * so use structure without any callbacks.
> >> - *
> >>   * The pci-driver core runtime PM routines always save the device state
> >>   * before going into suspended state. If the device is going into low power
> >>   * state with only with runtime PM ops, then no explicit handling is needed
> >>   * for the devices which have NoSoftRst-.
> >>   */
> >> -static const struct dev_pm_ops vfio_pci_core_pm_ops = { };
> >> +static const struct dev_pm_ops vfio_pci_core_pm_ops = {
> >> +	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
> >> +			   vfio_pci_core_runtime_resume,
> >> +			   NULL)
> >> +};
> >>  
> >>  int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
> >>  {
> >> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> >> index 6069a11fb51a..1a37db99df48 100644
> >> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> >> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> >> @@ -33,10 +33,12 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
> >>  		eventfd_signal(vdev->ctx[0].trigger, 1);
> >>  }
> >>  
> >> -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
> >> +/* Returns true if INTx has been masked by this function. */
> >> +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
> >>  {
> >>  	struct pci_dev *pdev = vdev->pdev;
> >>  	unsigned long flags;
> >> +	bool intx_masked = false;
> >>  
> >>  	spin_lock_irqsave(&vdev->irqlock, flags);
> >>  
> >> @@ -60,9 +62,11 @@ void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
> >>  			disable_irq_nosync(pdev->irq);
> >>  
> >>  		vdev->ctx[0].masked = true;
> >> +		intx_masked = true;
> >>  	}
> >>  
> >>  	spin_unlock_irqrestore(&vdev->irqlock, flags);
> >> +	return intx_masked;
> >>  }  
> > 
> > 
> > There's certainly another path through this function that masks the
> > interrupt, which makes the definition of this return value a bit
> > confusing.  
> 
>  For our case we should not hit that path. But we can return the
>  intx_masked true from that path as well to align return value.
> 
> > Wouldn't it be simpler not to overload the masked flag on
> > the interrupt context like this and instead set a new flag on the vdev
> > under irqlock to indicate the device is unable to generate interrupts.
> > The irq handler would add a test of this flag before any tests that
> > would access the device.  Thanks,
> > 
> > Alex
> >    
> 
>  We will set this flag inside runtime_suspend callback but the
>  device can be in non-D3cold state (For example, if user has
>  disabled d3cold explicitly by sysfs, the D3cold is not supported in
>  the platform, etc.). Also, in D3cold supported case, the device will
>  be in D0 till the PCI core moves the device into D3cold. In this case,
>  there is possibility that the device can generate an interrupt.
>  If we add check in the IRQ handler, then we won’t check and clear
>  the IRQ status, but the interrupt line will still be asserted
>  which can cause interrupt flooding.
> 
>  This was the reason for disabling interrupt itself instead of
>  checking flag in the IRQ handler.

Ok, maybe this is largely a clarification of the return value of
vfio_pci_intx_mask().  I think what you're looking for is whether the
context mask was changed, rather than whether the interrupt is masked,
which I think avoids the confusion regarding whether the first branch
should return true or false.  So the variable should be something like
"masked_changed" and the comment changed to "Returns true if the INTx
vfio_pci_irq_ctx.masked value is changed".

Testing is_intx() outside of the irqlock is potentially racy, so do we
need to add the pm-get/put wrappers on ioctls first to avoid the
possibility that pm-suspend can race a SET_IRQS ioctl?  Thanks,

Alex
Abhishek Sahu July 11, 2022, 9:18 a.m. UTC | #4
On 7/8/2022 9:15 PM, Alex Williamson wrote:
> On Fri, 8 Jul 2022 14:51:30 +0530
> Abhishek Sahu <abhsahu@nvidia.com> wrote:
> 
>> On 7/6/2022 9:09 PM, Alex Williamson wrote:
>>> On Fri, 1 Jul 2022 16:38:09 +0530
>>> Abhishek Sahu <abhsahu@nvidia.com> wrote:
>>>   
>>>> This patch adds INTx handling during runtime suspend/resume.
>>>> All the suspend/resume related code for the user to put the device
>>>> into the low power state will be added in subsequent patches.
>>>>
>>>> The INTx are shared among devices. Whenever any INTx interrupt comes  
>>>
>>> "The INTx lines may be shared..."
>>>   
>>>> for the VFIO devices, then vfio_intx_handler() will be called for each
>>>> device. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx()  
>>>
>>> "...device sharing the interrupt."
>>>   
>>>> and checks if the interrupt has been generated for the current device.
>>>> Now, if the device is already in the D3cold state, then the config space
>>>> can not be read. Attempt to read config space in D3cold state can
>>>> cause system unresponsiveness in a few systems. To prevent this, mask
>>>> INTx in runtime suspend callback and unmask the same in runtime resume
>>>> callback. If INTx has been already masked, then no handling is needed
>>>> in runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and
>>>> vfio_pci_intx_mask() has been updated to return true if INTx has been
>>>> masked inside this function.
>>>>
>>>> For the runtime suspend which is triggered for the no user of VFIO
>>>> device, the is_intx() will return false and these callbacks won't do
>>>> anything.
>>>>
>>>> The MSI/MSI-X are not shared so similar handling should not be
>>>> needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal()
>>>> without doing any device-specific config access. When the user performs
>>>> any config access or IOCTL after receiving the eventfd notification,
>>>> then the device will be moved to the D0 state first before
>>>> servicing any request.
>>>>
>>>> Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
>>>> ---
>>>>  drivers/vfio/pci/vfio_pci_core.c  | 37 +++++++++++++++++++++++++++----
>>>>  drivers/vfio/pci/vfio_pci_intrs.c |  6 ++++-
>>>>  include/linux/vfio_pci_core.h     |  3 ++-
>>>>  3 files changed, 40 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>>>> index a0d69ddaf90d..5948d930449b 100644
>>>> --- a/drivers/vfio/pci/vfio_pci_core.c
>>>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>>>> @@ -259,16 +259,45 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
>>>>  	return ret;
>>>>  }
>>>>  
>>>> +#ifdef CONFIG_PM
>>>> +static int vfio_pci_core_runtime_suspend(struct device *dev)
>>>> +{
>>>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
>>>> +
>>>> +	/*
>>>> +	 * If INTx is enabled, then mask INTx before going into the runtime
>>>> +	 * suspended state and unmask the same in the runtime resume.
>>>> +	 * If INTx has already been masked by the user, then
>>>> +	 * vfio_pci_intx_mask() will return false and in that case, INTx
>>>> +	 * should not be unmasked in the runtime resume.
>>>> +	 */
>>>> +	vdev->pm_intx_masked = (is_intx(vdev) && vfio_pci_intx_mask(vdev));
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static int vfio_pci_core_runtime_resume(struct device *dev)
>>>> +{
>>>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
>>>> +
>>>> +	if (vdev->pm_intx_masked)
>>>> +		vfio_pci_intx_unmask(vdev);
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +#endif /* CONFIG_PM */
>>>> +
>>>>  /*
>>>> - * The dev_pm_ops needs to be provided to make pci-driver runtime PM working,
>>>> - * so use structure without any callbacks.
>>>> - *
>>>>   * The pci-driver core runtime PM routines always save the device state
>>>>   * before going into suspended state. If the device is going into low power
>>>>   * state with only with runtime PM ops, then no explicit handling is needed
>>>>   * for the devices which have NoSoftRst-.
>>>>   */
>>>> -static const struct dev_pm_ops vfio_pci_core_pm_ops = { };
>>>> +static const struct dev_pm_ops vfio_pci_core_pm_ops = {
>>>> +	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
>>>> +			   vfio_pci_core_runtime_resume,
>>>> +			   NULL)
>>>> +};
>>>>  
>>>>  int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
>>>>  {
>>>> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
>>>> index 6069a11fb51a..1a37db99df48 100644
>>>> --- a/drivers/vfio/pci/vfio_pci_intrs.c
>>>> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
>>>> @@ -33,10 +33,12 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
>>>>  		eventfd_signal(vdev->ctx[0].trigger, 1);
>>>>  }
>>>>  
>>>> -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>>>> +/* Returns true if INTx has been masked by this function. */
>>>> +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>>>>  {
>>>>  	struct pci_dev *pdev = vdev->pdev;
>>>>  	unsigned long flags;
>>>> +	bool intx_masked = false;
>>>>  
>>>>  	spin_lock_irqsave(&vdev->irqlock, flags);
>>>>  
>>>> @@ -60,9 +62,11 @@ void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
>>>>  			disable_irq_nosync(pdev->irq);
>>>>  
>>>>  		vdev->ctx[0].masked = true;
>>>> +		intx_masked = true;
>>>>  	}
>>>>  
>>>>  	spin_unlock_irqrestore(&vdev->irqlock, flags);
>>>> +	return intx_masked;
>>>>  }  
>>>
>>>
>>> There's certainly another path through this function that masks the
>>> interrupt, which makes the definition of this return value a bit
>>> confusing.  
>>
>>  For our case we should not hit that path. But we can return the
>>  intx_masked true from that path as well to align return value.
>>
>>> Wouldn't it be simpler not to overload the masked flag on
>>> the interrupt context like this and instead set a new flag on the vdev
>>> under irqlock to indicate the device is unable to generate interrupts.
>>> The irq handler would add a test of this flag before any tests that
>>> would access the device.  Thanks,
>>>
>>> Alex
>>>    
>>
>>  We will set this flag inside runtime_suspend callback but the
>>  device can be in non-D3cold state (For example, if user has
>>  disabled d3cold explicitly by sysfs, the D3cold is not supported in
>>  the platform, etc.). Also, in D3cold supported case, the device will
>>  be in D0 till the PCI core moves the device into D3cold. In this case,
>>  there is possibility that the device can generate an interrupt.
>>  If we add check in the IRQ handler, then we won’t check and clear
>>  the IRQ status, but the interrupt line will still be asserted
>>  which can cause interrupt flooding.
>>
>>  This was the reason for disabling interrupt itself instead of
>>  checking flag in the IRQ handler.
> 
> Ok, maybe this is largely a clarification of the return value of
> vfio_pci_intx_mask().  I think what you're looking for is whether the
> context mask was changed, rather than whether the interrupt is masked,
> which I think avoids the confusion regarding whether the first branch
> should return true or false.  So the variable should be something like
> "masked_changed" and the comment changed to "Returns true if the INTx
> vfio_pci_irq_ctx.masked value is changed".
> 

 Thanks Alex.
 I will rename the variable and update the comment.

> Testing is_intx() outside of the irqlock is potentially racy, so do we
> need to add the pm-get/put wrappers on ioctls first to avoid the
> possibility that pm-suspend can race a SET_IRQS ioctl?  Thanks,
> 
> Alex
> 
 
 Even after adding this patch, the runtime suspend will not be supported
 for the device with users. It will be supported only after patch 4 in
 this patch series. So with this patch, the pm-suspend can be called only
 for the cases where vfio device has no user and there we should not see
 the race condition.

 But I can move the patch related with pm-get/put wrappers on
 ioctls before this patch since both are independent. 

 Regards,
 Abhishek
Alex Williamson July 11, 2022, 12:57 p.m. UTC | #5
On Mon, 11 Jul 2022 14:48:34 +0530
Abhishek Sahu <abhsahu@nvidia.com> wrote:

> On 7/8/2022 9:15 PM, Alex Williamson wrote:
> > On Fri, 8 Jul 2022 14:51:30 +0530
> > Abhishek Sahu <abhsahu@nvidia.com> wrote:
> >   
> >> On 7/6/2022 9:09 PM, Alex Williamson wrote:  
> >>> On Fri, 1 Jul 2022 16:38:09 +0530
> >>> Abhishek Sahu <abhsahu@nvidia.com> wrote:
> >>>     
> >>>> This patch adds INTx handling during runtime suspend/resume.
> >>>> All the suspend/resume related code for the user to put the device
> >>>> into the low power state will be added in subsequent patches.
> >>>>
> >>>> The INTx are shared among devices. Whenever any INTx interrupt comes    
> >>>
> >>> "The INTx lines may be shared..."
> >>>     
> >>>> for the VFIO devices, then vfio_intx_handler() will be called for each
> >>>> device. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx()    
> >>>
> >>> "...device sharing the interrupt."
> >>>     
> >>>> and checks if the interrupt has been generated for the current device.
> >>>> Now, if the device is already in the D3cold state, then the config space
> >>>> can not be read. Attempt to read config space in D3cold state can
> >>>> cause system unresponsiveness in a few systems. To prevent this, mask
> >>>> INTx in runtime suspend callback and unmask the same in runtime resume
> >>>> callback. If INTx has been already masked, then no handling is needed
> >>>> in runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and
> >>>> vfio_pci_intx_mask() has been updated to return true if INTx has been
> >>>> masked inside this function.
> >>>>
> >>>> For the runtime suspend which is triggered for the no user of VFIO
> >>>> device, the is_intx() will return false and these callbacks won't do
> >>>> anything.
> >>>>
> >>>> The MSI/MSI-X are not shared so similar handling should not be
> >>>> needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal()
> >>>> without doing any device-specific config access. When the user performs
> >>>> any config access or IOCTL after receiving the eventfd notification,
> >>>> then the device will be moved to the D0 state first before
> >>>> servicing any request.
> >>>>
> >>>> Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
> >>>> ---
> >>>>  drivers/vfio/pci/vfio_pci_core.c  | 37 +++++++++++++++++++++++++++----
> >>>>  drivers/vfio/pci/vfio_pci_intrs.c |  6 ++++-
> >>>>  include/linux/vfio_pci_core.h     |  3 ++-
> >>>>  3 files changed, 40 insertions(+), 6 deletions(-)
> >>>>
> >>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> >>>> index a0d69ddaf90d..5948d930449b 100644
> >>>> --- a/drivers/vfio/pci/vfio_pci_core.c
> >>>> +++ b/drivers/vfio/pci/vfio_pci_core.c
> >>>> @@ -259,16 +259,45 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
> >>>>  	return ret;
> >>>>  }
> >>>>  
> >>>> +#ifdef CONFIG_PM
> >>>> +static int vfio_pci_core_runtime_suspend(struct device *dev)
> >>>> +{
> >>>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
> >>>> +
> >>>> +	/*
> >>>> +	 * If INTx is enabled, then mask INTx before going into the runtime
> >>>> +	 * suspended state and unmask the same in the runtime resume.
> >>>> +	 * If INTx has already been masked by the user, then
> >>>> +	 * vfio_pci_intx_mask() will return false and in that case, INTx
> >>>> +	 * should not be unmasked in the runtime resume.
> >>>> +	 */
> >>>> +	vdev->pm_intx_masked = (is_intx(vdev) && vfio_pci_intx_mask(vdev));
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>> +static int vfio_pci_core_runtime_resume(struct device *dev)
> >>>> +{
> >>>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
> >>>> +
> >>>> +	if (vdev->pm_intx_masked)
> >>>> +		vfio_pci_intx_unmask(vdev);
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>> +#endif /* CONFIG_PM */
> >>>> +
> >>>>  /*
> >>>> - * The dev_pm_ops needs to be provided to make pci-driver runtime PM working,
> >>>> - * so use structure without any callbacks.
> >>>> - *
> >>>>   * The pci-driver core runtime PM routines always save the device state
> >>>>   * before going into suspended state. If the device is going into low power
> >>>>   * state with only with runtime PM ops, then no explicit handling is needed
> >>>>   * for the devices which have NoSoftRst-.
> >>>>   */
> >>>> -static const struct dev_pm_ops vfio_pci_core_pm_ops = { };
> >>>> +static const struct dev_pm_ops vfio_pci_core_pm_ops = {
> >>>> +	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
> >>>> +			   vfio_pci_core_runtime_resume,
> >>>> +			   NULL)
> >>>> +};
> >>>>  
> >>>>  int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
> >>>>  {
> >>>> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> >>>> index 6069a11fb51a..1a37db99df48 100644
> >>>> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> >>>> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> >>>> @@ -33,10 +33,12 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
> >>>>  		eventfd_signal(vdev->ctx[0].trigger, 1);
> >>>>  }
> >>>>  
> >>>> -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
> >>>> +/* Returns true if INTx has been masked by this function. */
> >>>> +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
> >>>>  {
> >>>>  	struct pci_dev *pdev = vdev->pdev;
> >>>>  	unsigned long flags;
> >>>> +	bool intx_masked = false;
> >>>>  
> >>>>  	spin_lock_irqsave(&vdev->irqlock, flags);
> >>>>  
> >>>> @@ -60,9 +62,11 @@ void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
> >>>>  			disable_irq_nosync(pdev->irq);
> >>>>  
> >>>>  		vdev->ctx[0].masked = true;
> >>>> +		intx_masked = true;
> >>>>  	}
> >>>>  
> >>>>  	spin_unlock_irqrestore(&vdev->irqlock, flags);
> >>>> +	return intx_masked;
> >>>>  }    
> >>>
> >>>
> >>> There's certainly another path through this function that masks the
> >>> interrupt, which makes the definition of this return value a bit
> >>> confusing.    
> >>
> >>  For our case we should not hit that path. But we can return the
> >>  intx_masked true from that path as well to align return value.
> >>  
> >>> Wouldn't it be simpler not to overload the masked flag on
> >>> the interrupt context like this and instead set a new flag on the vdev
> >>> under irqlock to indicate the device is unable to generate interrupts.
> >>> The irq handler would add a test of this flag before any tests that
> >>> would access the device.  Thanks,
> >>>
> >>> Alex
> >>>      
> >>
> >>  We will set this flag inside runtime_suspend callback but the
> >>  device can be in non-D3cold state (For example, if user has
> >>  disabled d3cold explicitly by sysfs, the D3cold is not supported in
> >>  the platform, etc.). Also, in D3cold supported case, the device will
> >>  be in D0 till the PCI core moves the device into D3cold. In this case,
> >>  there is possibility that the device can generate an interrupt.
> >>  If we add check in the IRQ handler, then we won’t check and clear
> >>  the IRQ status, but the interrupt line will still be asserted
> >>  which can cause interrupt flooding.
> >>
> >>  This was the reason for disabling interrupt itself instead of
> >>  checking flag in the IRQ handler.  
> > 
> > Ok, maybe this is largely a clarification of the return value of
> > vfio_pci_intx_mask().  I think what you're looking for is whether the
> > context mask was changed, rather than whether the interrupt is masked,
> > which I think avoids the confusion regarding whether the first branch
> > should return true or false.  So the variable should be something like
> > "masked_changed" and the comment changed to "Returns true if the INTx
> > vfio_pci_irq_ctx.masked value is changed".
> >   
> 
>  Thanks Alex.
>  I will rename the variable and update the comment.
> 
> > Testing is_intx() outside of the irqlock is potentially racy, so do we
> > need to add the pm-get/put wrappers on ioctls first to avoid the
> > possibility that pm-suspend can race a SET_IRQS ioctl?  Thanks,
> > 
> > Alex
> >   
>  
>  Even after adding this patch, the runtime suspend will not be supported
>  for the device with users. It will be supported only after patch 4 in
>  this patch series. So with this patch, the pm-suspend can be called only
>  for the cases where vfio device has no user and there we should not see
>  the race condition.

We should also not see is_intx() true for unused devices.  Thanks,

Alex
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index a0d69ddaf90d..5948d930449b 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -259,16 +259,45 @@  int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
 	return ret;
 }
 
+#ifdef CONFIG_PM
+static int vfio_pci_core_runtime_suspend(struct device *dev)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
+
+	/*
+	 * If INTx is enabled, then mask INTx before going into the runtime
+	 * suspended state and unmask the same in the runtime resume.
+	 * If INTx has already been masked by the user, then
+	 * vfio_pci_intx_mask() will return false and in that case, INTx
+	 * should not be unmasked in the runtime resume.
+	 */
+	vdev->pm_intx_masked = (is_intx(vdev) && vfio_pci_intx_mask(vdev));
+
+	return 0;
+}
+
+static int vfio_pci_core_runtime_resume(struct device *dev)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
+
+	if (vdev->pm_intx_masked)
+		vfio_pci_intx_unmask(vdev);
+
+	return 0;
+}
+#endif /* CONFIG_PM */
+
 /*
- * The dev_pm_ops needs to be provided to make pci-driver runtime PM working,
- * so use structure without any callbacks.
- *
  * The pci-driver core runtime PM routines always save the device state
  * before going into suspended state. If the device is going into low power
  * state with only with runtime PM ops, then no explicit handling is needed
  * for the devices which have NoSoftRst-.
  */
-static const struct dev_pm_ops vfio_pci_core_pm_ops = { };
+static const struct dev_pm_ops vfio_pci_core_pm_ops = {
+	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
+			   vfio_pci_core_runtime_resume,
+			   NULL)
+};
 
 int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
 {
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 6069a11fb51a..1a37db99df48 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -33,10 +33,12 @@  static void vfio_send_intx_eventfd(void *opaque, void *unused)
 		eventfd_signal(vdev->ctx[0].trigger, 1);
 }
 
-void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
+/* Returns true if INTx has been masked by this function. */
+bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
 {
 	struct pci_dev *pdev = vdev->pdev;
 	unsigned long flags;
+	bool intx_masked = false;
 
 	spin_lock_irqsave(&vdev->irqlock, flags);
 
@@ -60,9 +62,11 @@  void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
 			disable_irq_nosync(pdev->irq);
 
 		vdev->ctx[0].masked = true;
+		intx_masked = true;
 	}
 
 	spin_unlock_irqrestore(&vdev->irqlock, flags);
+	return intx_masked;
 }
 
 /*
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 23c176d4b073..cdfd328ba6b1 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -124,6 +124,7 @@  struct vfio_pci_core_device {
 	bool			needs_reset;
 	bool			nointx;
 	bool			needs_pm_restore;
+	bool			pm_intx_masked;
 	struct pci_saved_state	*pci_saved_state;
 	struct pci_saved_state	*pm_save;
 	int			ioeventfds_nr;
@@ -147,7 +148,7 @@  struct vfio_pci_core_device {
 #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
 #define irq_is(vdev, type) (vdev->irq_type == type)
 
-extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
+extern bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
 extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
 
 extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev,