diff mbox

[v6,2/3] drivers/vfio: EEH support for VFIO PCI device

Message ID 1400747034-15045-3-git-send-email-gwshan@linux.vnet.ibm.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Gavin Shan May 22, 2014, 8:23 a.m. UTC
The patch adds new IOCTL commands for VFIO PCI device to support
EEH functionality for PCI devices, which have been passed through
from host to somebody else via VFIO.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 Documentation/vfio.txt         |  88 ++++++++++-
 arch/powerpc/include/asm/eeh.h |  17 +++
 arch/powerpc/kernel/eeh.c      | 321 +++++++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci.c    | 131 ++++++++++++++++-
 include/uapi/linux/vfio.h      |  53 +++++++
 5 files changed, 603 insertions(+), 7 deletions(-)

Comments

Alexander Graf May 22, 2014, 9:55 a.m. UTC | #1
On 22.05.14 10:23, Gavin Shan wrote:
> The patch adds new IOCTL commands for VFIO PCI device to support
> EEH functionality for PCI devices, which have been passed through
> from host to somebody else via VFIO.
>
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>

This already looks a *lot* more sane than the previous versions. We're 
slowly getting there I think ;).

Ben, could you please check through the exported EEH interface itself 
and verify whether it does all the lockings correctly, uses the API 
properly and doesn't allow for a user space program to blow up the system?


> ---
>   Documentation/vfio.txt         |  88 ++++++++++-
>   arch/powerpc/include/asm/eeh.h |  17 +++
>   arch/powerpc/kernel/eeh.c      | 321 +++++++++++++++++++++++++++++++++++++++++
>   drivers/vfio/pci/vfio_pci.c    | 131 ++++++++++++++++-
>   include/uapi/linux/vfio.h      |  53 +++++++
>   5 files changed, 603 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..dd13db6 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
>   an excellent performance which has limitations such as inability to do
>   locked pages accounting in real time.
>   
> -So 3 additional ioctls have been added:
> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
> +which works on the basis of additional ioctl commands.
> +
> +So 8 additional ioctls have been added:
>   
>   	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>   		of the DMA window on the PCI bus.
> @@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
>   
>   	VFIO_IOMMU_DISABLE - disables the container.
>   
> +	VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the

functionality

> +		specified device. Also, it can be used to remove IO or DMA
> +		stopped state on the frozen PE.
> +
> +	VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
> +		PE or query PE sharing mode.

What is PE?

> +
> +	VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
> +
> +	VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
> +		error recovering.
> +
> +	VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
> +		one of the major steps for error recoverying.
>   
>   The code flow from the example above should be slightly changed:
>   
> @@ -346,6 +363,75 @@ The code flow from the example above should be slightly changed:
>   	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>   	.....
>   
> +Based on the initial example we have, the following piece of code could be
> +reference for EEH setup and error handling:
> +
> +	struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
> +	struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
> +	struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
> +	struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
> +	struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
> +
> +	....
> +
> +	/* Get a file descriptor for the device */
> +	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
> +
> +	/* Enable the EEH functionality on the device */
> +	option.option = EEH_OPT_ENABLE;
> +	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
> +
> +	/* Retrieve PE address and create and maintain PE by yourself */
> +	addr.option = EEH_OPT_GET_PE_ADDR;
> +	ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
> +
> +	/* Assure EEH is supported on the PE and make PE functional */
> +	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
> +
> +	/* Save device's state. pci_save_state() would be good enough
> +	 * as an example.
> +	 */
> +
> +	/* Test and setup the device */
> +	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
> +
> +	....
> +
> +	/* When 0xFF's returned from reading PCI config space or IO BARs
> +	 * of the PCI device. Check the PE state to see if that has been
> +	 * frozen.
> +	 */
> +	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
> +
> +	/* Waiting for pending PCI transactions to be completed and don't
> +	 * produce any more PCI traffic from/to the affected PE until
> +	 * recovery is finished.
> +	 */
> +
> +	/* Enable IO for the affected PE and collect logs. Usually, the
> +	 * standard part of PCI config space, AER registers are dumped
> +	 * as logs for further analysis.
> +	 */
> +	option.option = EEH_OPT_THAW_MMIO;
> +	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
> +
> +	/* Issue PE reset */
> +	reset.option = EEH_RESET_HOT;
> +	ioctl(device, VFIO_EEH_PE_RESET, &reset);
> +
> +	/* Configure the PCI bridges for the affected PE */
> +	ioctl(device, VFIO_EEH_PE_CONFIGURE, NULL);
> +
> +	/* Restored state we saved at initialization time. pci_restore_state()
> +	 * is good enough as an example.
> +	 */
> +
> +	/* Hopefully, error is recovered successfully. Now, you can resume to
> +	 * start PCI traffic to/from the affected PE.
> +	 */
> +
> +	....
> +
>   -------------------------------------------------------------------------------
>   
>   [1] VFIO was originally an acronym for "Virtual Function I/O" in its
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 34a2d83..dd5f1cf 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -191,6 +191,11 @@ enum {
>   #define EEH_OPT_ENABLE		1	/* EEH enable	*/
>   #define EEH_OPT_THAW_MMIO	2	/* MMIO enable	*/
>   #define EEH_OPT_THAW_DMA	3	/* DMA enable	*/
> +#define EEH_OPT_GET_PE_ADDR	0	/* Get PE addr	*/
> +#define EEH_OPT_GET_PE_MODE	1	/* Get PE mode	*/
> +#define EEH_PE_MODE_NONE	0	/* Invalid mode	*/
> +#define EEH_PE_MODE_NOT_SHARED	1	/* Not shared	*/
> +#define EEH_PE_MODE_SHARED	2	/* Shared mode	*/
>   #define EEH_STATE_UNAVAILABLE	(1 << 0)	/* State unavailable	*/
>   #define EEH_STATE_NOT_SUPPORT	(1 << 1)	/* EEH not supported	*/
>   #define EEH_STATE_RESET_ACTIVE	(1 << 2)	/* Active reset		*/
> @@ -198,6 +203,11 @@ enum {
>   #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA		*/
>   #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled		*/
>   #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled		*/
> +#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
> +#define EEH_PE_STATE_RESET		1		/* PE reset	*/
> +#define EEH_PE_STATE_STOPPED_IO_DMA	2		/* Stopped	*/
> +#define EEH_PE_STATE_STOPPED_DMA	4		/* Stopped DMA	*/
> +#define EEH_PE_STATE_UNAVAIL		5		/* Unavailable	*/
>   #define EEH_RESET_DEACTIVATE	0	/* Deactivate the PE reset	*/
>   #define EEH_RESET_HOT		1	/* Hot reset			*/
>   #define EEH_RESET_FUNDAMENTAL	3	/* Fundamental reset		*/
> @@ -305,6 +315,13 @@ void eeh_add_device_late(struct pci_dev *);
>   void eeh_add_device_tree_late(struct pci_bus *);
>   void eeh_add_sysfs_files(struct pci_bus *);
>   void eeh_remove_device(struct pci_dev *);
> +int eeh_dev_open(struct pci_dev *pdev);
> +void eeh_dev_release(struct pci_dev *pdev);
> +int eeh_pe_set_option(struct pci_dev *pdev, int option);
> +int eeh_pe_get_addr(struct pci_dev *pdev, int option);
> +int eeh_pe_get_state(struct pci_dev *pdev);
> +int eeh_pe_reset(struct pci_dev *pdev, int option);
> +int eeh_pe_configure(struct pci_dev *pdev);
>   
>   /**
>    * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 9c6b899..b90a474 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -108,6 +108,9 @@ struct eeh_ops *eeh_ops = NULL;
>   /* Lock to avoid races due to multiple reports of an error */
>   DEFINE_RAW_SPINLOCK(confirm_error_lock);
>   
> +/* Lock to protect passed flags */
> +static DEFINE_MUTEX(eeh_dev_mutex);
> +
>   /* Buffer for reporting pci register dumps. Its here in BSS, and
>    * not dynamically alloced, so that it ends up in RMO where RTAS
>    * can access it.
> @@ -1098,6 +1101,324 @@ void eeh_remove_device(struct pci_dev *dev)
>   	edev->mode &= ~EEH_DEV_SYSFS;
>   }
>   
> +/**
> + * eeh_dev_open - Mark EEH device and PE as passed through
> + * @pdev: PCI device
> + *
> + * Mark the indicated EEH device and PE as passed through.
> + * In the result, the EEH errors detected on the PE won't be
> + * reported. The owner of the device will be responsible for
> + * detection and recovery.
> + */
> +int eeh_dev_open(struct pci_dev *pdev)
> +{
> +	struct eeh_dev *edev;
> +
> +	mutex_lock(&eeh_dev_mutex);
> +
> +	/* No PCI device ? */
> +	if (!pdev) {
> +		mutex_unlock(&eeh_dev_mutex);
> +		return -ENODEV;
> +	}
> +
> +	/* No EEH device ? */
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !edev->pe) {
> +		mutex_unlock(&eeh_dev_mutex);
> +		return -ENODEV;
> +	}
> +
> +	eeh_dev_set_passed(edev, true);
> +	eeh_pe_set_passed(edev->pe, true);
> +	mutex_unlock(&eeh_dev_mutex);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(eeh_dev_open);
> +
> +/**
> + * eeh_dev_release - Reclaim the ownership of EEH device
> + * @pdev: PCI device
> + *
> + * Reclaim ownership of EEH device, potentially the corresponding
> + * PE. In the result, the EEH errors detected on the PE will be
> + * reported and handled as usual.
> + */
> +void eeh_dev_release(struct pci_dev *pdev)
> +{
> +	bool release_pe = true;
> +	struct eeh_pe *pe = NULL;
> +	struct eeh_dev *tmp, *edev;
> +
> +	mutex_lock(&eeh_dev_mutex);
> +
> +	/* No PCI device ? */
> +	if (!pdev) {
> +		mutex_unlock(&eeh_dev_mutex);
> +		return;
> +	}
> +
> +	/* No EEH device ? */
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !eeh_dev_passed(edev) ||
> +	    !edev->pe || !eeh_pe_passed(pe)) {
> +		mutex_unlock(&eeh_dev_mutex);
> +		return;
> +	}
> +
> +	/* Release device */
> +	pe = edev->pe;
> +	eeh_dev_set_passed(edev, false);
> +
> +	/* Release PE */
> +	eeh_pe_for_each_dev(pe, edev, tmp) {
> +		if (eeh_dev_passed(edev)) {
> +			release_pe = false;
> +			break;
> +		}
> +	}
> +
> +	if (release_pe)
> +		eeh_pe_set_passed(pe, false);
> +
> +	mutex_unlock(&eeh_dev_mutex);
> +}
> +EXPORT_SYMBOL(eeh_dev_release);
> +
> +static int eeh_dev_check(struct pci_dev *pdev,
> +			 struct eeh_dev **pedev,
> +			 struct eeh_pe **ppe)
> +{
> +	struct eeh_dev *edev;
> +
> +	/* No device ? */
> +	if (!pdev)
> +		return -ENODEV;
> +
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !eeh_dev_passed(edev) ||
> +	    !edev->pe || !eeh_pe_passed(edev->pe))
> +		return -ENODEV;
> +
> +	if (pedev)
> +		*pedev = edev;
> +	if (ppe)
> +		*ppe = edev->pe;
> +
> +	return 0;
> +}
> +
> +/**
> + * eeh_pe_set_option - Set options for the indicated PE
> + * @pdev: PCI device
> + * @option: requested option
> + *
> + * The routine is called to enable or disable EEH functionality
> + * on the indicated PE, to enable IO or DMA for the frozen PE.
> + */
> +int eeh_pe_set_option(struct pci_dev *pdev, int option)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	switch (option) {
> +	case EEH_OPT_DISABLE:
> +	case EEH_OPT_ENABLE:

This deserves a comment

> +		break;
> +	case EEH_OPT_THAW_MMIO:
> +	case EEH_OPT_THAW_DMA:
> +		if (!eeh_ops || !eeh_ops->set_option) {
> +			ret = -ENOENT;
> +			break;
> +		}
> +
> +		ret = eeh_ops->set_option(pe, option);
> +		break;
> +	default:
> +		pr_debug("%s: Option %d out of range (%d, %d)\n",
> +			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_set_option);
> +
> +/**
> + * eeh_pe_get_addr - Retrieve the PE address or sharing mode
> + * @pdev: PCI device
> + * @option: option
> + *
> + * Retrieve the PE address or sharing mode.
> + */
> +int eeh_pe_get_addr(struct pci_dev *pdev, int option)
> +{
> +	struct pci_bus *bus;
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)

Probably safer to write this as ret < 0. Positive return values are 
success now.

> +		return ret;
> +
> +	switch (option) {
> +	case EEH_OPT_GET_PE_ADDR:
> +		bus = eeh_pe_bus_get(pe);
> +		if (!bus) {
> +			ret = -ENODEV;
> +			break;
> +		}
> +
> +		/* PE address has format "00BBSS00" */
> +		ret = bus->number << 16;
> +		break;
> +	case EEH_OPT_GET_PE_MODE:
> +		/* Wa always have shared PE */

Wa?

> +		ret = EEH_PE_MODE_SHARED;
> +		break;
> +	default:
> +		pr_debug("%s: option %d out of range (0, 1)\n",
> +			__func__, option);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_get_addr);
> +
> +/**
> + * eeh_pe_get_state - Retrieve PE's state
> + * @pdev: PCI device
> + *
> + * Retrieve the PE's state, which includes 3 aspects: enabled
> + * DMA, enabled IO and asserted reset.
> + */
> +int eeh_pe_get_state(struct pci_dev *pdev)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int result, ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	if (!eeh_ops || !eeh_ops->get_state)
> +		return -ENOENT;
> +
> +	result = eeh_ops->get_state(pe, NULL);
> +	if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +	     (result & EEH_STATE_DMA_ENABLED) &&
> +	     (result & EEH_STATE_MMIO_ENABLED))
> +		ret = EEH_PE_STATE_NORMAL;
> +	else if (result & EEH_STATE_RESET_ACTIVE)
> +		ret = EEH_PE_STATE_RESET;
> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +		 !(result & EEH_STATE_DMA_ENABLED) &&
> +		 !(result & EEH_STATE_MMIO_ENABLED))
> +		ret = EEH_PE_STATE_STOPPED_IO_DMA;
> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +		 (result & EEH_STATE_DMA_ENABLED) &&
> +		 !(result & EEH_STATE_MMIO_ENABLED))
> +		ret = EEH_PE_STATE_STOPPED_DMA;
> +	else
> +		ret = EEH_PE_STATE_UNAVAIL;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_get_state);
> +
> +/**
> + * eeh_pe_reset - Issue PE reset according to specified type
> + * @pdev: PCI device
> + * @option: reset type
> + *
> + * The routine is called to reset the specified PE with the
> + * indicated type, either fundamental reset or hot reset.
> + * PE reset is the most important part for error recovery.
> + */
> +int eeh_pe_reset(struct pci_dev *pdev, int option)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
> +		return -ENOENT;
> +
> +	switch (option) {
> +	case EEH_RESET_DEACTIVATE:
> +		ret = eeh_ops->reset(pe, option);
> +		if (ret)
> +			break;
> +
> +		/*
> +		 * The PE is still in frozen state and we need clear

to

> +		 * that. It's good to clear frozen state after deassert
> +		 * to avoid messy IO access during reset, which might
> +		 * cause recursive frozen PE.
> +		 */
> +		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
> +		if (!ret)
> +			ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
> +		if (!ret)
> +			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
> +		break;
> +	case EEH_RESET_HOT:
> +	case EEH_RESET_FUNDAMENTAL:
> +		ret = eeh_ops->reset(pe, option);
> +		break;
> +	default:
> +		pr_debug("%s: Unsupported option %d\n",
> +			__func__, option);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_reset);
> +
> +/**
> + * eeh_pe_configure - Configure PCI bridges after PE reset
> + * @pdev: PCI device
> + *
> + * The routine is called to restore the PCI config space for
> + * those PCI devices, especially PCI bridges affected by PE
> + * reset issued previously.

So would it make sense to combine this with the reset call?

> + */
> +int eeh_pe_configure(struct pci_dev *pdev)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	/* Restore config space for the affected devices */
> +	eeh_pe_restore_bars(pe);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_configure);
> +
>   static int proc_eeh_show(struct seq_file *m, void *v)
>   {
>   	if (!eeh_enabled()) {
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 7ba0424..301ac18 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -25,6 +25,9 @@
>   #include <linux/types.h>
>   #include <linux/uaccess.h>
>   #include <linux/vfio.h>
> +#ifdef CONFIG_EEH
> +#include <asm/eeh.h>
> +#endif
>   
>   #include "vfio_pci_private.h"
>   
> @@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>   	pci_restore_state(pdev);
>   }
>   
> +static void vfio_eeh_pci_release(struct pci_dev *pdev)
> +{
> +#ifdef CONFIG_EEH
> +	eeh_dev_release(pdev);
> +#endif
> +}
> +
>   static void vfio_pci_release(void *device_data)
>   {
>   	struct vfio_pci_device *vdev = device_data;
>   
> -	if (atomic_dec_and_test(&vdev->refcnt))
> +	if (atomic_dec_and_test(&vdev->refcnt)) {
> +		vfio_eeh_pci_release(vdev->pdev);
>   		vfio_pci_disable(vdev);
> +	}
>   
>   	module_put(THIS_MODULE);
>   }
>   
> +static int vfio_eeh_pci_open(struct pci_dev *pdev)
> +{
> +	int ret = 0;
> +
> +#ifdef CONFIG_EEH
> +	ret = eeh_dev_open(pdev);
> +#endif
> +	return ret;
> +}
> +
>   static int vfio_pci_open(void *device_data)
>   {
>   	struct vfio_pci_device *vdev = device_data;
> +	int ret;
>   
>   	if (!try_module_get(THIS_MODULE))
>   		return -ENODEV;
>   
>   	if (atomic_inc_return(&vdev->refcnt) == 1) {
> -		int ret = vfio_pci_enable(vdev);
> -		if (ret) {
> -			module_put(THIS_MODULE);
> -			return ret;
> -		}
> +		ret = vfio_pci_enable(vdev);
> +		if (ret)
> +			goto error;
> +
> +		ret = vfio_eeh_pci_open(vdev->pdev);
> +		if (ret)
> +			goto error;
>   	}
>   
>   	return 0;
> +error:
> +	module_put(THIS_MODULE);
> +	return ret;
>   }
>   
>   static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> @@ -321,6 +349,91 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>   	return walk.ret;
>   }
>   
> +static int vfio_eeh_pci_ioctl(struct pci_dev *pdev,
> +			      unsigned int cmd,
> +			      unsigned long arg)
> +{
> +	unsigned long minsz;
> +	int ret = 0;
> +
> +#ifdef CONFIG_EEH
> +	switch (cmd) {
> +	case VFIO_EEH_PE_SET_OPTION: {
> +		struct vfio_eeh_pe_set_option option;
> +
> +		minsz = offsetofend(struct vfio_eeh_pe_set_option, option);
> +		if (copy_from_user(&option, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (option.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = eeh_pe_set_option(pdev, option.option);
> +		break;
> +	}
> +	case VFIO_EEH_PE_GET_ADDR: {
> +		struct vfio_eeh_pe_get_addr addr;
> +
> +		minsz = offsetofend(struct vfio_eeh_pe_get_addr, info);
> +		if (copy_from_user(&addr, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (addr.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = eeh_pe_get_addr(pdev, addr.option);
> +		if (ret >= 0) {
> +			addr.info = ret;
> +			if (copy_to_user((void __user *)arg, &addr, minsz))
> +				return -EFAULT;
> +			ret = 0;
> +		}
> +
> +		break;
> +	}
> +	case VFIO_EEH_PE_GET_STATE: {
> +		struct vfio_eeh_pe_get_state state;
> +
> +		minsz = offsetofend(struct vfio_eeh_pe_get_state, state);
> +		if (copy_from_user(&state, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (state.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = eeh_pe_get_state(pdev);
> +		if (ret >= 0) {
> +			state.state = ret;
> +			if (copy_to_user((void __user *)arg, &state, minsz))
> +				return -EFAULT;
> +			ret = 0;
> +		}
> +		break;
> +	}
> +	case VFIO_EEH_PE_RESET: {
> +		struct vfio_eeh_pe_reset reset;
> +
> +		minsz = offsetofend(struct vfio_eeh_pe_reset, option);
> +		if (copy_from_user(&reset, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (reset.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = eeh_pe_reset(pdev, reset.option);
> +		break;
> +	}
> +	case VFIO_EEH_PE_CONFIGURE:
> +		ret = eeh_pe_configure(pdev);
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		pr_debug("%s: Cannot handle command %d\n",
> +			__func__, cmd);
> +	}
> +#else
> +	ret = -ENOENT;
> +#endif
> +
> +	return ret;
> +}
> +
>   static long vfio_pci_ioctl(void *device_data,
>   			   unsigned int cmd, unsigned long arg)
>   {
> @@ -682,6 +795,12 @@ hot_reset_release:
>   
>   		kfree(groups);
>   		return ret;
> +	} else if (cmd == VFIO_EEH_PE_SET_OPTION ||
> +		   cmd == VFIO_EEH_PE_GET_ADDR ||
> +		   cmd == VFIO_EEH_PE_GET_STATE ||
> +		   cmd == VFIO_EEH_PE_RESET ||
> +		   cmd == VFIO_EEH_PE_CONFIGURE) {
> +			return vfio_eeh_pci_ioctl(vdev->pdev, cmd, arg);
>   	}
>   
>   	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index cb9023d..ef55682 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>   
>   #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>   
> +/*
> + * EEH functionality can be enabled or disabled on one specific device.
> + * Also, the DMA or IO frozen state can be removed from the frozen PE
> + * if required.
> + */
> +struct vfio_eeh_pe_set_option {
> +	__u32 argsz;

What is this argsz thing? Is this your way of maintaining backwards 
compatibility when we introduce new fields? A new field will change the 
ioctl number, so I don't think that makes a lot of sense :).

Just make the ioctl have a u32 as incoming argument. No fancy structs, 
no complicated code.

The same applies for a number of structs below.

> +	__u32 option;
> +};
> +
> +#define VFIO_EEH_PE_SET_OPTION		_IO(VFIO_TYPE, VFIO_BASE + 21)
> +
> +/*
> + * Each EEH PE should have unique address to be identified. The command
> + * helps to retrieve the address and the sharing mode of the PE.
> + */
> +struct vfio_eeh_pe_get_addr {
> +	__u32 argsz;
> +	__u32 option;
> +	__u32 info;

Any particular reason you need the info field? Can't the return value of 
the ioctl hold this? Then you only have a single u32 argument left to 
the ioctl again.


Alex

> +};
> +
> +#define VFIO_EEH_PE_GET_ADDR		_IO(VFIO_TYPE, VFIO_BASE + 22)
> +
> +/*
> + * EEH PE might have been frozen because of PCI errors. Also, it might
> + * be experiencing reset for error revoery. The following command helps
> + * to get the state.
> + */
> +struct vfio_eeh_pe_get_state {
> +	__u32 argsz;
> +	__u32 state;
> +};
> +
> +#define VFIO_EEH_PE_GET_STATE		_IO(VFIO_TYPE, VFIO_BASE + 23)
> +
> +/*
> + * Reset is the major step to recover problematic PE. The following
> + * command helps on that.
> + */
> +struct vfio_eeh_pe_reset {
> +	__u32 argsz;
> +	__u32 option;
> +};
> +
> +#define VFIO_EEH_PE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 24)
> +
> +/*
> + * One of the steps for recovery after PE reset is to configure the
> + * PCI bridges affected by the PE reset.
> + */
> +#define VFIO_EEH_PE_CONFIGURE		_IO(VFIO_TYPE, VFIO_BASE + 25)
> +
>   /* ***************************************************************** */
>   
>   #endif /* _UAPIVFIO_H */
Gavin Shan May 23, 2014, 12:17 a.m. UTC | #2
On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
>
>On 22.05.14 10:23, Gavin Shan wrote:
>>The patch adds new IOCTL commands for VFIO PCI device to support
>>EEH functionality for PCI devices, which have been passed through
>>from host to somebody else via VFIO.
>>
>>Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>This already looks a *lot* more sane than the previous versions.
>We're slowly getting there I think ;).
>
>Ben, could you please check through the exported EEH interface itself
>and verify whether it does all the lockings correctly, uses the API
>properly and doesn't allow for a user space program to blow up the
>system?
>
>
>>---
>>  Documentation/vfio.txt         |  88 ++++++++++-
>>  arch/powerpc/include/asm/eeh.h |  17 +++
>>  arch/powerpc/kernel/eeh.c      | 321 +++++++++++++++++++++++++++++++++++++++++
>>  drivers/vfio/pci/vfio_pci.c    | 131 ++++++++++++++++-
>>  include/uapi/linux/vfio.h      |  53 +++++++
>>  5 files changed, 603 insertions(+), 7 deletions(-)
>>
>>diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>>index b9ca023..dd13db6 100644
>>--- a/Documentation/vfio.txt
>>+++ b/Documentation/vfio.txt
>>@@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
>>  an excellent performance which has limitations such as inability to do
>>  locked pages accounting in real time.
>>-So 3 additional ioctls have been added:
>>+4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
>>+which works on the basis of additional ioctl commands.
>>+
>>+So 8 additional ioctls have been added:
>>  	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>>  		of the DMA window on the PCI bus.
>>@@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
>>  	VFIO_IOMMU_DISABLE - disables the container.
>>+	VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the
>
>functionality
>

Typo. Will fix in next revision.

>>+		specified device. Also, it can be used to remove IO or DMA
>>+		stopped state on the frozen PE.
>>+
>>+	VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
>>+		PE or query PE sharing mode.
>
>What is PE?
>

will add description about that to Documentation/vfio.txt.

>>+
>>+	VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
>>+
>>+	VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
>>+		error recovering.
>>+
>>+	VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
>>+		one of the major steps for error recoverying.
>>  The code flow from the example above should be slightly changed:
>>@@ -346,6 +363,75 @@ The code flow from the example above should be slightly changed:
>>  	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>>  	.....
>>+Based on the initial example we have, the following piece of code could be
>>+reference for EEH setup and error handling:
>>+
>>+	struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
>>+	struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
>>+	struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
>>+	struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
>>+	struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
>>+
>>+	....
>>+
>>+	/* Get a file descriptor for the device */
>>+	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
>>+
>>+	/* Enable the EEH functionality on the device */
>>+	option.option = EEH_OPT_ENABLE;
>>+	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
>>+
>>+	/* Retrieve PE address and create and maintain PE by yourself */
>>+	addr.option = EEH_OPT_GET_PE_ADDR;
>>+	ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
>>+
>>+	/* Assure EEH is supported on the PE and make PE functional */
>>+	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
>>+
>>+	/* Save device's state. pci_save_state() would be good enough
>>+	 * as an example.
>>+	 */
>>+
>>+	/* Test and setup the device */
>>+	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
>>+
>>+	....
>>+
>>+	/* When 0xFF's returned from reading PCI config space or IO BARs
>>+	 * of the PCI device. Check the PE state to see if that has been
>>+	 * frozen.
>>+	 */
>>+	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
>>+
>>+	/* Waiting for pending PCI transactions to be completed and don't
>>+	 * produce any more PCI traffic from/to the affected PE until
>>+	 * recovery is finished.
>>+	 */
>>+
>>+	/* Enable IO for the affected PE and collect logs. Usually, the
>>+	 * standard part of PCI config space, AER registers are dumped
>>+	 * as logs for further analysis.
>>+	 */
>>+	option.option = EEH_OPT_THAW_MMIO;
>>+	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
>>+
>>+	/* Issue PE reset */
>>+	reset.option = EEH_RESET_HOT;
>>+	ioctl(device, VFIO_EEH_PE_RESET, &reset);
>>+
>>+	/* Configure the PCI bridges for the affected PE */
>>+	ioctl(device, VFIO_EEH_PE_CONFIGURE, NULL);
>>+
>>+	/* Restored state we saved at initialization time. pci_restore_state()
>>+	 * is good enough as an example.
>>+	 */
>>+
>>+	/* Hopefully, error is recovered successfully. Now, you can resume to
>>+	 * start PCI traffic to/from the affected PE.
>>+	 */
>>+
>>+	....
>>+
>>  -------------------------------------------------------------------------------
>>  [1] VFIO was originally an acronym for "Virtual Function I/O" in its
>>diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>>index 34a2d83..dd5f1cf 100644
>>--- a/arch/powerpc/include/asm/eeh.h
>>+++ b/arch/powerpc/include/asm/eeh.h
>>@@ -191,6 +191,11 @@ enum {
>>  #define EEH_OPT_ENABLE		1	/* EEH enable	*/
>>  #define EEH_OPT_THAW_MMIO	2	/* MMIO enable	*/
>>  #define EEH_OPT_THAW_DMA	3	/* DMA enable	*/
>>+#define EEH_OPT_GET_PE_ADDR	0	/* Get PE addr	*/
>>+#define EEH_OPT_GET_PE_MODE	1	/* Get PE mode	*/
>>+#define EEH_PE_MODE_NONE	0	/* Invalid mode	*/
>>+#define EEH_PE_MODE_NOT_SHARED	1	/* Not shared	*/
>>+#define EEH_PE_MODE_SHARED	2	/* Shared mode	*/
>>  #define EEH_STATE_UNAVAILABLE	(1 << 0)	/* State unavailable	*/
>>  #define EEH_STATE_NOT_SUPPORT	(1 << 1)	/* EEH not supported	*/
>>  #define EEH_STATE_RESET_ACTIVE	(1 << 2)	/* Active reset		*/
>>@@ -198,6 +203,11 @@ enum {
>>  #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA		*/
>>  #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled		*/
>>  #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled		*/
>>+#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
>>+#define EEH_PE_STATE_RESET		1		/* PE reset	*/
>>+#define EEH_PE_STATE_STOPPED_IO_DMA	2		/* Stopped	*/
>>+#define EEH_PE_STATE_STOPPED_DMA	4		/* Stopped DMA	*/
>>+#define EEH_PE_STATE_UNAVAIL		5		/* Unavailable	*/
>>  #define EEH_RESET_DEACTIVATE	0	/* Deactivate the PE reset	*/
>>  #define EEH_RESET_HOT		1	/* Hot reset			*/
>>  #define EEH_RESET_FUNDAMENTAL	3	/* Fundamental reset		*/
>>@@ -305,6 +315,13 @@ void eeh_add_device_late(struct pci_dev *);
>>  void eeh_add_device_tree_late(struct pci_bus *);
>>  void eeh_add_sysfs_files(struct pci_bus *);
>>  void eeh_remove_device(struct pci_dev *);
>>+int eeh_dev_open(struct pci_dev *pdev);
>>+void eeh_dev_release(struct pci_dev *pdev);
>>+int eeh_pe_set_option(struct pci_dev *pdev, int option);
>>+int eeh_pe_get_addr(struct pci_dev *pdev, int option);
>>+int eeh_pe_get_state(struct pci_dev *pdev);
>>+int eeh_pe_reset(struct pci_dev *pdev, int option);
>>+int eeh_pe_configure(struct pci_dev *pdev);
>>  /**
>>   * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
>>diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
>>index 9c6b899..b90a474 100644
>>--- a/arch/powerpc/kernel/eeh.c
>>+++ b/arch/powerpc/kernel/eeh.c
>>@@ -108,6 +108,9 @@ struct eeh_ops *eeh_ops = NULL;
>>  /* Lock to avoid races due to multiple reports of an error */
>>  DEFINE_RAW_SPINLOCK(confirm_error_lock);
>>+/* Lock to protect passed flags */
>>+static DEFINE_MUTEX(eeh_dev_mutex);
>>+
>>  /* Buffer for reporting pci register dumps. Its here in BSS, and
>>   * not dynamically alloced, so that it ends up in RMO where RTAS
>>   * can access it.
>>@@ -1098,6 +1101,324 @@ void eeh_remove_device(struct pci_dev *dev)
>>  	edev->mode &= ~EEH_DEV_SYSFS;
>>  }
>>+/**
>>+ * eeh_dev_open - Mark EEH device and PE as passed through
>>+ * @pdev: PCI device
>>+ *
>>+ * Mark the indicated EEH device and PE as passed through.
>>+ * In the result, the EEH errors detected on the PE won't be
>>+ * reported. The owner of the device will be responsible for
>>+ * detection and recovery.
>>+ */
>>+int eeh_dev_open(struct pci_dev *pdev)
>>+{
>>+	struct eeh_dev *edev;
>>+
>>+	mutex_lock(&eeh_dev_mutex);
>>+
>>+	/* No PCI device ? */
>>+	if (!pdev) {
>>+		mutex_unlock(&eeh_dev_mutex);
>>+		return -ENODEV;
>>+	}
>>+
>>+	/* No EEH device ? */
>>+	edev = pci_dev_to_eeh_dev(pdev);
>>+	if (!edev || !edev->pe) {
>>+		mutex_unlock(&eeh_dev_mutex);
>>+		return -ENODEV;
>>+	}
>>+
>>+	eeh_dev_set_passed(edev, true);
>>+	eeh_pe_set_passed(edev->pe, true);
>>+	mutex_unlock(&eeh_dev_mutex);
>>+
>>+	return 0;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_dev_open);
>>+
>>+/**
>>+ * eeh_dev_release - Reclaim the ownership of EEH device
>>+ * @pdev: PCI device
>>+ *
>>+ * Reclaim ownership of EEH device, potentially the corresponding
>>+ * PE. In the result, the EEH errors detected on the PE will be
>>+ * reported and handled as usual.
>>+ */
>>+void eeh_dev_release(struct pci_dev *pdev)
>>+{
>>+	bool release_pe = true;
>>+	struct eeh_pe *pe = NULL;
>>+	struct eeh_dev *tmp, *edev;
>>+
>>+	mutex_lock(&eeh_dev_mutex);
>>+
>>+	/* No PCI device ? */
>>+	if (!pdev) {
>>+		mutex_unlock(&eeh_dev_mutex);
>>+		return;
>>+	}
>>+
>>+	/* No EEH device ? */
>>+	edev = pci_dev_to_eeh_dev(pdev);
>>+	if (!edev || !eeh_dev_passed(edev) ||
>>+	    !edev->pe || !eeh_pe_passed(pe)) {
>>+		mutex_unlock(&eeh_dev_mutex);
>>+		return;
>>+	}
>>+
>>+	/* Release device */
>>+	pe = edev->pe;
>>+	eeh_dev_set_passed(edev, false);
>>+
>>+	/* Release PE */
>>+	eeh_pe_for_each_dev(pe, edev, tmp) {
>>+		if (eeh_dev_passed(edev)) {
>>+			release_pe = false;
>>+			break;
>>+		}
>>+	}
>>+
>>+	if (release_pe)
>>+		eeh_pe_set_passed(pe, false);
>>+
>>+	mutex_unlock(&eeh_dev_mutex);
>>+}
>>+EXPORT_SYMBOL(eeh_dev_release);
>>+
>>+static int eeh_dev_check(struct pci_dev *pdev,
>>+			 struct eeh_dev **pedev,
>>+			 struct eeh_pe **ppe)
>>+{
>>+	struct eeh_dev *edev;
>>+
>>+	/* No device ? */
>>+	if (!pdev)
>>+		return -ENODEV;
>>+
>>+	edev = pci_dev_to_eeh_dev(pdev);
>>+	if (!edev || !eeh_dev_passed(edev) ||
>>+	    !edev->pe || !eeh_pe_passed(edev->pe))
>>+		return -ENODEV;
>>+
>>+	if (pedev)
>>+		*pedev = edev;
>>+	if (ppe)
>>+		*ppe = edev->pe;
>>+
>>+	return 0;
>>+}
>>+
>>+/**
>>+ * eeh_pe_set_option - Set options for the indicated PE
>>+ * @pdev: PCI device
>>+ * @option: requested option
>>+ *
>>+ * The routine is called to enable or disable EEH functionality
>>+ * on the indicated PE, to enable IO or DMA for the frozen PE.
>>+ */
>>+int eeh_pe_set_option(struct pci_dev *pdev, int option)
>>+{
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_dev_check(pdev, &edev, &pe);
>>+	if (ret)
>>+		return ret;
>>+
>>+	switch (option) {
>>+	case EEH_OPT_DISABLE:
>>+	case EEH_OPT_ENABLE:
>
>This deserves a comment
>

Yep. will add it, thanks.

>>+		break;
>>+	case EEH_OPT_THAW_MMIO:
>>+	case EEH_OPT_THAW_DMA:
>>+		if (!eeh_ops || !eeh_ops->set_option) {
>>+			ret = -ENOENT;
>>+			break;
>>+		}
>>+
>>+		ret = eeh_ops->set_option(pe, option);
>>+		break;
>>+	default:
>>+		pr_debug("%s: Option %d out of range (%d, %d)\n",
>>+			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
>>+		ret = -EINVAL;
>>+	}
>>+
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_pe_set_option);
>>+
>>+/**
>>+ * eeh_pe_get_addr - Retrieve the PE address or sharing mode
>>+ * @pdev: PCI device
>>+ * @option: option
>>+ *
>>+ * Retrieve the PE address or sharing mode.
>>+ */
>>+int eeh_pe_get_addr(struct pci_dev *pdev, int option)
>>+{
>>+	struct pci_bus *bus;
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_dev_check(pdev, &edev, &pe);
>>+	if (ret)
>
>Probably safer to write this as ret < 0. Positive return values are
>success now.
>

Ok. Will fix it.

>>+		return ret;
>>+
>>+	switch (option) {
>>+	case EEH_OPT_GET_PE_ADDR:
>>+		bus = eeh_pe_bus_get(pe);
>>+		if (!bus) {
>>+			ret = -ENODEV;
>>+			break;
>>+		}
>>+
>>+		/* PE address has format "00BBSS00" */
>>+		ret = bus->number << 16;
>>+		break;
>>+	case EEH_OPT_GET_PE_MODE:
>>+		/* Wa always have shared PE */
>
>Wa?
>

Basically, PE could have one single PCI device (function), or a PCI bus
(including subordinate PCI devices). we always have the later case currently.
And it's called PE in "shared mode" and I called it as "shared PE" :-)

>>+		ret = EEH_PE_MODE_SHARED;
>>+		break;
>>+	default:
>>+		pr_debug("%s: option %d out of range (0, 1)\n",
>>+			__func__, option);
>>+		ret = -EINVAL;
>>+	}
>>+
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_pe_get_addr);
>>+
>>+/**
>>+ * eeh_pe_get_state - Retrieve PE's state
>>+ * @pdev: PCI device
>>+ *
>>+ * Retrieve the PE's state, which includes 3 aspects: enabled
>>+ * DMA, enabled IO and asserted reset.
>>+ */
>>+int eeh_pe_get_state(struct pci_dev *pdev)
>>+{
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int result, ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_dev_check(pdev, &edev, &pe);
>>+	if (ret)
>>+		return ret;
>>+
>>+	if (!eeh_ops || !eeh_ops->get_state)
>>+		return -ENOENT;
>>+
>>+	result = eeh_ops->get_state(pe, NULL);
>>+	if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>+	     (result & EEH_STATE_DMA_ENABLED) &&
>>+	     (result & EEH_STATE_MMIO_ENABLED))
>>+		ret = EEH_PE_STATE_NORMAL;
>>+	else if (result & EEH_STATE_RESET_ACTIVE)
>>+		ret = EEH_PE_STATE_RESET;
>>+	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>+		 !(result & EEH_STATE_DMA_ENABLED) &&
>>+		 !(result & EEH_STATE_MMIO_ENABLED))
>>+		ret = EEH_PE_STATE_STOPPED_IO_DMA;
>>+	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>+		 (result & EEH_STATE_DMA_ENABLED) &&
>>+		 !(result & EEH_STATE_MMIO_ENABLED))
>>+		ret = EEH_PE_STATE_STOPPED_DMA;
>>+	else
>>+		ret = EEH_PE_STATE_UNAVAIL;
>>+
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_pe_get_state);
>>+
>>+/**
>>+ * eeh_pe_reset - Issue PE reset according to specified type
>>+ * @pdev: PCI device
>>+ * @option: reset type
>>+ *
>>+ * The routine is called to reset the specified PE with the
>>+ * indicated type, either fundamental reset or hot reset.
>>+ * PE reset is the most important part for error recovery.
>>+ */
>>+int eeh_pe_reset(struct pci_dev *pdev, int option)
>>+{
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_dev_check(pdev, &edev, &pe);
>>+	if (ret)
>>+		return ret;
>>+
>>+	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
>>+		return -ENOENT;
>>+
>>+	switch (option) {
>>+	case EEH_RESET_DEACTIVATE:
>>+		ret = eeh_ops->reset(pe, option);
>>+		if (ret)
>>+			break;
>>+
>>+		/*
>>+		 * The PE is still in frozen state and we need clear
>
>to
>

Will fix, thanks.

>>+		 * that. It's good to clear frozen state after deassert
>>+		 * to avoid messy IO access during reset, which might
>>+		 * cause recursive frozen PE.
>>+		 */
>>+		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
>>+		if (!ret)
>>+			ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
>>+		if (!ret)
>>+			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
>>+		break;
>>+	case EEH_RESET_HOT:
>>+	case EEH_RESET_FUNDAMENTAL:
>>+		ret = eeh_ops->reset(pe, option);
>>+		break;
>>+	default:
>>+		pr_debug("%s: Unsupported option %d\n",
>>+			__func__, option);
>>+		ret = -EINVAL;
>>+	}
>>+
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_pe_reset);
>>+
>>+/**
>>+ * eeh_pe_configure - Configure PCI bridges after PE reset
>>+ * @pdev: PCI device
>>+ *
>>+ * The routine is called to restore the PCI config space for
>>+ * those PCI devices, especially PCI bridges affected by PE
>>+ * reset issued previously.
>
>So would it make sense to combine this with the reset call?
>

I hope to keep it as it's one of the major steps to do error
recovery. Lets keep it.

>>+ */
>>+int eeh_pe_configure(struct pci_dev *pdev)
>>+{
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_dev_check(pdev, &edev, &pe);
>>+	if (ret)
>>+		return ret;
>>+
>>+	/* Restore config space for the affected devices */
>>+	eeh_pe_restore_bars(pe);
>>+
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_pe_configure);
>>+
>>  static int proc_eeh_show(struct seq_file *m, void *v)
>>  {
>>  	if (!eeh_enabled()) {
>>diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>>index 7ba0424..301ac18 100644
>>--- a/drivers/vfio/pci/vfio_pci.c
>>+++ b/drivers/vfio/pci/vfio_pci.c
>>@@ -25,6 +25,9 @@
>>  #include <linux/types.h>
>>  #include <linux/uaccess.h>
>>  #include <linux/vfio.h>
>>+#ifdef CONFIG_EEH
>>+#include <asm/eeh.h>
>>+#endif
>>  #include "vfio_pci_private.h"
>>@@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>>  	pci_restore_state(pdev);
>>  }
>>+static void vfio_eeh_pci_release(struct pci_dev *pdev)
>>+{
>>+#ifdef CONFIG_EEH
>>+	eeh_dev_release(pdev);
>>+#endif
>>+}
>>+
>>  static void vfio_pci_release(void *device_data)
>>  {
>>  	struct vfio_pci_device *vdev = device_data;
>>-	if (atomic_dec_and_test(&vdev->refcnt))
>>+	if (atomic_dec_and_test(&vdev->refcnt)) {
>>+		vfio_eeh_pci_release(vdev->pdev);
>>  		vfio_pci_disable(vdev);
>>+	}
>>  	module_put(THIS_MODULE);
>>  }
>>+static int vfio_eeh_pci_open(struct pci_dev *pdev)
>>+{
>>+	int ret = 0;
>>+
>>+#ifdef CONFIG_EEH
>>+	ret = eeh_dev_open(pdev);
>>+#endif
>>+	return ret;
>>+}
>>+
>>  static int vfio_pci_open(void *device_data)
>>  {
>>  	struct vfio_pci_device *vdev = device_data;
>>+	int ret;
>>  	if (!try_module_get(THIS_MODULE))
>>  		return -ENODEV;
>>  	if (atomic_inc_return(&vdev->refcnt) == 1) {
>>-		int ret = vfio_pci_enable(vdev);
>>-		if (ret) {
>>-			module_put(THIS_MODULE);
>>-			return ret;
>>-		}
>>+		ret = vfio_pci_enable(vdev);
>>+		if (ret)
>>+			goto error;
>>+
>>+		ret = vfio_eeh_pci_open(vdev->pdev);
>>+		if (ret)
>>+			goto error;
>>  	}
>>  	return 0;
>>+error:
>>+	module_put(THIS_MODULE);
>>+	return ret;
>>  }
>>  static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
>>@@ -321,6 +349,91 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>>  	return walk.ret;
>>  }
>>+static int vfio_eeh_pci_ioctl(struct pci_dev *pdev,
>>+			      unsigned int cmd,
>>+			      unsigned long arg)
>>+{
>>+	unsigned long minsz;
>>+	int ret = 0;
>>+
>>+#ifdef CONFIG_EEH
>>+	switch (cmd) {
>>+	case VFIO_EEH_PE_SET_OPTION: {
>>+		struct vfio_eeh_pe_set_option option;
>>+
>>+		minsz = offsetofend(struct vfio_eeh_pe_set_option, option);
>>+		if (copy_from_user(&option, (void __user *)arg, minsz))
>>+			return -EFAULT;
>>+		if (option.argsz < minsz)
>>+			return -EINVAL;
>>+
>>+		ret = eeh_pe_set_option(pdev, option.option);
>>+		break;
>>+	}
>>+	case VFIO_EEH_PE_GET_ADDR: {
>>+		struct vfio_eeh_pe_get_addr addr;
>>+
>>+		minsz = offsetofend(struct vfio_eeh_pe_get_addr, info);
>>+		if (copy_from_user(&addr, (void __user *)arg, minsz))
>>+			return -EFAULT;
>>+		if (addr.argsz < minsz)
>>+			return -EINVAL;
>>+
>>+		ret = eeh_pe_get_addr(pdev, addr.option);
>>+		if (ret >= 0) {
>>+			addr.info = ret;
>>+			if (copy_to_user((void __user *)arg, &addr, minsz))
>>+				return -EFAULT;
>>+			ret = 0;
>>+		}
>>+
>>+		break;
>>+	}
>>+	case VFIO_EEH_PE_GET_STATE: {
>>+		struct vfio_eeh_pe_get_state state;
>>+
>>+		minsz = offsetofend(struct vfio_eeh_pe_get_state, state);
>>+		if (copy_from_user(&state, (void __user *)arg, minsz))
>>+			return -EFAULT;
>>+		if (state.argsz < minsz)
>>+			return -EINVAL;
>>+
>>+		ret = eeh_pe_get_state(pdev);
>>+		if (ret >= 0) {
>>+			state.state = ret;
>>+			if (copy_to_user((void __user *)arg, &state, minsz))
>>+				return -EFAULT;
>>+			ret = 0;
>>+		}
>>+		break;
>>+	}
>>+	case VFIO_EEH_PE_RESET: {
>>+		struct vfio_eeh_pe_reset reset;
>>+
>>+		minsz = offsetofend(struct vfio_eeh_pe_reset, option);
>>+		if (copy_from_user(&reset, (void __user *)arg, minsz))
>>+			return -EFAULT;
>>+		if (reset.argsz < minsz)
>>+			return -EINVAL;
>>+
>>+		ret = eeh_pe_reset(pdev, reset.option);
>>+		break;
>>+	}
>>+	case VFIO_EEH_PE_CONFIGURE:
>>+		ret = eeh_pe_configure(pdev);
>>+		break;
>>+	default:
>>+		ret = -EINVAL;
>>+		pr_debug("%s: Cannot handle command %d\n",
>>+			__func__, cmd);
>>+	}
>>+#else
>>+	ret = -ENOENT;
>>+#endif
>>+
>>+	return ret;
>>+}
>>+
>>  static long vfio_pci_ioctl(void *device_data,
>>  			   unsigned int cmd, unsigned long arg)
>>  {
>>@@ -682,6 +795,12 @@ hot_reset_release:
>>  		kfree(groups);
>>  		return ret;
>>+	} else if (cmd == VFIO_EEH_PE_SET_OPTION ||
>>+		   cmd == VFIO_EEH_PE_GET_ADDR ||
>>+		   cmd == VFIO_EEH_PE_GET_STATE ||
>>+		   cmd == VFIO_EEH_PE_RESET ||
>>+		   cmd == VFIO_EEH_PE_CONFIGURE) {
>>+			return vfio_eeh_pci_ioctl(vdev->pdev, cmd, arg);
>>  	}
>>  	return -ENOTTY;
>>diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>index cb9023d..ef55682 100644
>>--- a/include/uapi/linux/vfio.h
>>+++ b/include/uapi/linux/vfio.h
>>@@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>>+/*
>>+ * EEH functionality can be enabled or disabled on one specific device.
>>+ * Also, the DMA or IO frozen state can be removed from the frozen PE
>>+ * if required.
>>+ */
>>+struct vfio_eeh_pe_set_option {
>>+	__u32 argsz;
>
>What is this argsz thing? Is this your way of maintaining backwards
>compatibility when we introduce new fields? A new field will change
>the ioctl number, so I don't think that makes a lot of sense :).
>
>Just make the ioctl have a u32 as incoming argument. No fancy
>structs, no complicated code.
>
>The same applies for a number of structs below.
>

ok. Will do in next revision.

>>+	__u32 option;
>>+};
>>+
>>+#define VFIO_EEH_PE_SET_OPTION		_IO(VFIO_TYPE, VFIO_BASE + 21)
>>+
>>+/*
>>+ * Each EEH PE should have unique address to be identified. The command
>>+ * helps to retrieve the address and the sharing mode of the PE.
>>+ */
>>+struct vfio_eeh_pe_get_addr {
>>+	__u32 argsz;
>>+	__u32 option;
>>+	__u32 info;
>
>Any particular reason you need the info field? Can't the return value
>of the ioctl hold this? Then you only have a single u32 argument left
>to the ioctl again.
>

ok. Will do in next revision.

>>+};
>>+
>>+#define VFIO_EEH_PE_GET_ADDR		_IO(VFIO_TYPE, VFIO_BASE + 22)
>>+
>>+/*
>>+ * EEH PE might have been frozen because of PCI errors. Also, it might
>>+ * be experiencing reset for error revoery. The following command helps
>>+ * to get the state.
>>+ */
>>+struct vfio_eeh_pe_get_state {
>>+	__u32 argsz;
>>+	__u32 state;
>>+};
>>+
>>+#define VFIO_EEH_PE_GET_STATE		_IO(VFIO_TYPE, VFIO_BASE + 23)
>>+
>>+/*
>>+ * Reset is the major step to recover problematic PE. The following
>>+ * command helps on that.
>>+ */
>>+struct vfio_eeh_pe_reset {
>>+	__u32 argsz;
>>+	__u32 option;
>>+};
>>+
>>+#define VFIO_EEH_PE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 24)
>>+
>>+/*
>>+ * One of the steps for recovery after PE reset is to configure the
>>+ * PCI bridges affected by the PE reset.
>>+ */
>>+#define VFIO_EEH_PE_CONFIGURE		_IO(VFIO_TYPE, VFIO_BASE + 25)
>>+
>>  /* ***************************************************************** */
>>  #endif /* _UAPIVFIO_H */

Thanks,
Gavin
Gavin Shan May 23, 2014, 12:37 a.m. UTC | #3
On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
>On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
>>On 22.05.14 10:23, Gavin Shan wrote:

.../...

>>>diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>>index cb9023d..ef55682 100644
>>>--- a/include/uapi/linux/vfio.h
>>>+++ b/include/uapi/linux/vfio.h
>>>@@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>>>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>>>+/*
>>>+ * EEH functionality can be enabled or disabled on one specific device.
>>>+ * Also, the DMA or IO frozen state can be removed from the frozen PE
>>>+ * if required.
>>>+ */
>>>+struct vfio_eeh_pe_set_option {
>>>+	__u32 argsz;
>>
>>What is this argsz thing? Is this your way of maintaining backwards
>>compatibility when we introduce new fields? A new field will change
>>the ioctl number, so I don't think that makes a lot of sense :).
>>
>>Just make the ioctl have a u32 as incoming argument. No fancy
>>structs, no complicated code.
>>
>>The same applies for a number of structs below.
>>
>
>ok. Will do in next revision.
>

Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
always has "argsz". I guess it was used as checker by Alex.W. Do you really
want remove "argsz" ?

>>>+	__u32 option;
>>>+};
>>>+
>>>+#define VFIO_EEH_PE_SET_OPTION		_IO(VFIO_TYPE, VFIO_BASE + 21)
>>>+
>>>+/*
>>>+ * Each EEH PE should have unique address to be identified. The command
>>>+ * helps to retrieve the address and the sharing mode of the PE.
>>>+ */
>>>+struct vfio_eeh_pe_get_addr {
>>>+	__u32 argsz;
>>>+	__u32 option;
>>>+	__u32 info;
>>
>>Any particular reason you need the info field? Can't the return value
>>of the ioctl hold this? Then you only have a single u32 argument left
>>to the ioctl again.
>>
>
>ok. Will do in next revision.
>

If we eventually remove "argsz" and let ioctl() return value to hold
information (or negative number for errors), we don't need any data
struct because the 3rd parameter of ioctl() would be used as input
and I only need one input parameter. Do you want see this ?

Hopefully, Alex.W saw this and hasn't objections :)

Thanks,
Gavin
Alex Williamson May 23, 2014, 3:10 a.m. UTC | #4
On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
> The patch adds new IOCTL commands for VFIO PCI device to support
> EEH functionality for PCI devices, which have been passed through
> from host to somebody else via VFIO.
> 
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> ---
>  Documentation/vfio.txt         |  88 ++++++++++-
>  arch/powerpc/include/asm/eeh.h |  17 +++
>  arch/powerpc/kernel/eeh.c      | 321 +++++++++++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/vfio_pci.c    | 131 ++++++++++++++++-
>  include/uapi/linux/vfio.h      |  53 +++++++
>  5 files changed, 603 insertions(+), 7 deletions(-)

Maybe a chicken and egg problem, but it seems like we could split the
platform code and vfio code into separate patches.

> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..dd13db6 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
>  an excellent performance which has limitations such as inability to do
>  locked pages accounting in real time.
>  
> -So 3 additional ioctls have been added:
> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
> +which works on the basis of additional ioctl commands.
> +
> +So 8 additional ioctls have been added:
>  
>  	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>  		of the DMA window on the PCI bus.
> @@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
>  
>  	VFIO_IOMMU_DISABLE - disables the container.
>  
> +	VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the
> +		specified device. Also, it can be used to remove IO or DMA
> +		stopped state on the frozen PE.
> +
> +	VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
> +		PE or query PE sharing mode.
> +
> +	VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
> +
> +	VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
> +		error recovering.
> +
> +	VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
> +		one of the major steps for error recoverying.
>  
>  The code flow from the example above should be slightly changed:
>  
> @@ -346,6 +363,75 @@ The code flow from the example above should be slightly changed:
>  	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>  	.....
>  
> +Based on the initial example we have, the following piece of code could be
> +reference for EEH setup and error handling:
> +
> +	struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
> +	struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
> +	struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
> +	struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
> +	struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
> +
> +	....
> +
> +	/* Get a file descriptor for the device */
> +	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
> +
> +	/* Enable the EEH functionality on the device */
> +	option.option = EEH_OPT_ENABLE;
> +	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
> +
> +	/* Retrieve PE address and create and maintain PE by yourself */
> +	addr.option = EEH_OPT_GET_PE_ADDR;
> +	ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
> +
> +	/* Assure EEH is supported on the PE and make PE functional */
> +	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
> +
> +	/* Save device's state. pci_save_state() would be good enough
> +	 * as an example.
> +	 */
> +
> +	/* Test and setup the device */
> +	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
> +
> +	....
> +
> +	/* When 0xFF's returned from reading PCI config space or IO BARs
> +	 * of the PCI device. Check the PE state to see if that has been
> +	 * frozen.
> +	 */
> +	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);

There's no notification, the user needs to observe the return value an
poll?  Should we be enabling an eventfd to notify the user of the state
change?

> +
> +	/* Waiting for pending PCI transactions to be completed and don't
> +	 * produce any more PCI traffic from/to the affected PE until
> +	 * recovery is finished.
> +	 */
> +
> +	/* Enable IO for the affected PE and collect logs. Usually, the
> +	 * standard part of PCI config space, AER registers are dumped
> +	 * as logs for further analysis.
> +	 */
> +	option.option = EEH_OPT_THAW_MMIO;
> +	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);

How does the guest learn about the error?  Does it need to?
> +
> +	/* Issue PE reset */
> +	reset.option = EEH_RESET_HOT;
> +	ioctl(device, VFIO_EEH_PE_RESET, &reset);
> +
> +	/* Configure the PCI bridges for the affected PE */
> +	ioctl(device, VFIO_EEH_PE_CONFIGURE, NULL);
> +

I'm not sure I see why we've split these into separate ioctls.  FWIW,
the one ioctl we currently have for reset that takes no options is
probably going to be the first to get deprecated because of it.

> +	/* Restored state we saved at initialization time. pci_restore_state()
> +	 * is good enough as an example.
> +	 */
> +
> +	/* Hopefully, error is recovered successfully. Now, you can resume to
> +	 * start PCI traffic to/from the affected PE.
> +	 */
> +
> +	....
> +
>  -------------------------------------------------------------------------------
>  
>  [1] VFIO was originally an acronym for "Virtual Function I/O" in its
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 34a2d83..dd5f1cf 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -191,6 +191,11 @@ enum {
>  #define EEH_OPT_ENABLE		1	/* EEH enable	*/
>  #define EEH_OPT_THAW_MMIO	2	/* MMIO enable	*/
>  #define EEH_OPT_THAW_DMA	3	/* DMA enable	*/
> +#define EEH_OPT_GET_PE_ADDR	0	/* Get PE addr	*/
> +#define EEH_OPT_GET_PE_MODE	1	/* Get PE mode	*/
> +#define EEH_PE_MODE_NONE	0	/* Invalid mode	*/
> +#define EEH_PE_MODE_NOT_SHARED	1	/* Not shared	*/
> +#define EEH_PE_MODE_SHARED	2	/* Shared mode	*/
>  #define EEH_STATE_UNAVAILABLE	(1 << 0)	/* State unavailable	*/
>  #define EEH_STATE_NOT_SUPPORT	(1 << 1)	/* EEH not supported	*/
>  #define EEH_STATE_RESET_ACTIVE	(1 << 2)	/* Active reset		*/
> @@ -198,6 +203,11 @@ enum {
>  #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA		*/
>  #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled		*/
>  #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled		*/
> +#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
> +#define EEH_PE_STATE_RESET		1		/* PE reset	*/
> +#define EEH_PE_STATE_STOPPED_IO_DMA	2		/* Stopped	*/
> +#define EEH_PE_STATE_STOPPED_DMA	4		/* Stopped DMA	*/
> +#define EEH_PE_STATE_UNAVAIL		5		/* Unavailable	*/
>  #define EEH_RESET_DEACTIVATE	0	/* Deactivate the PE reset	*/
>  #define EEH_RESET_HOT		1	/* Hot reset			*/
>  #define EEH_RESET_FUNDAMENTAL	3	/* Fundamental reset		*/
> @@ -305,6 +315,13 @@ void eeh_add_device_late(struct pci_dev *);
>  void eeh_add_device_tree_late(struct pci_bus *);
>  void eeh_add_sysfs_files(struct pci_bus *);
>  void eeh_remove_device(struct pci_dev *);
> +int eeh_dev_open(struct pci_dev *pdev);
> +void eeh_dev_release(struct pci_dev *pdev);
> +int eeh_pe_set_option(struct pci_dev *pdev, int option);
> +int eeh_pe_get_addr(struct pci_dev *pdev, int option);
> +int eeh_pe_get_state(struct pci_dev *pdev);
> +int eeh_pe_reset(struct pci_dev *pdev, int option);
> +int eeh_pe_configure(struct pci_dev *pdev);
>  
>  /**
>   * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 9c6b899..b90a474 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -108,6 +108,9 @@ struct eeh_ops *eeh_ops = NULL;
>  /* Lock to avoid races due to multiple reports of an error */
>  DEFINE_RAW_SPINLOCK(confirm_error_lock);
>  
> +/* Lock to protect passed flags */
> +static DEFINE_MUTEX(eeh_dev_mutex);
> +
>  /* Buffer for reporting pci register dumps. Its here in BSS, and
>   * not dynamically alloced, so that it ends up in RMO where RTAS
>   * can access it.
> @@ -1098,6 +1101,324 @@ void eeh_remove_device(struct pci_dev *dev)
>  	edev->mode &= ~EEH_DEV_SYSFS;
>  }
>  
> +/**
> + * eeh_dev_open - Mark EEH device and PE as passed through
> + * @pdev: PCI device
> + *
> + * Mark the indicated EEH device and PE as passed through.
> + * In the result, the EEH errors detected on the PE won't be
> + * reported. The owner of the device will be responsible for
> + * detection and recovery.
> + */
> +int eeh_dev_open(struct pci_dev *pdev)
> +{
> +	struct eeh_dev *edev;
> +
> +	mutex_lock(&eeh_dev_mutex);
> +
> +	/* No PCI device ? */
> +	if (!pdev) {
> +		mutex_unlock(&eeh_dev_mutex);
> +		return -ENODEV;
> +	}
> +
> +	/* No EEH device ? */
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !edev->pe) {
> +		mutex_unlock(&eeh_dev_mutex);
> +		return -ENODEV;
> +	}
> +
> +	eeh_dev_set_passed(edev, true);
> +	eeh_pe_set_passed(edev->pe, true);
> +	mutex_unlock(&eeh_dev_mutex);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(eeh_dev_open);
> +
> +/**
> + * eeh_dev_release - Reclaim the ownership of EEH device
> + * @pdev: PCI device
> + *
> + * Reclaim ownership of EEH device, potentially the corresponding
> + * PE. In the result, the EEH errors detected on the PE will be
> + * reported and handled as usual.
> + */
> +void eeh_dev_release(struct pci_dev *pdev)
> +{
> +	bool release_pe = true;
> +	struct eeh_pe *pe = NULL;
> +	struct eeh_dev *tmp, *edev;
> +
> +	mutex_lock(&eeh_dev_mutex);
> +
> +	/* No PCI device ? */
> +	if (!pdev) {
> +		mutex_unlock(&eeh_dev_mutex);
> +		return;
> +	}
> +
> +	/* No EEH device ? */
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !eeh_dev_passed(edev) ||
> +	    !edev->pe || !eeh_pe_passed(pe)) {
> +		mutex_unlock(&eeh_dev_mutex);
> +		return;
> +	}
> +
> +	/* Release device */
> +	pe = edev->pe;
> +	eeh_dev_set_passed(edev, false);
> +
> +	/* Release PE */
> +	eeh_pe_for_each_dev(pe, edev, tmp) {
> +		if (eeh_dev_passed(edev)) {
> +			release_pe = false;
> +			break;
> +		}
> +	}
> +
> +	if (release_pe)
> +		eeh_pe_set_passed(pe, false);
> +
> +	mutex_unlock(&eeh_dev_mutex);
> +}
> +EXPORT_SYMBOL(eeh_dev_release);
> +
> +static int eeh_dev_check(struct pci_dev *pdev,
> +			 struct eeh_dev **pedev,
> +			 struct eeh_pe **ppe)
> +{
> +	struct eeh_dev *edev;
> +
> +	/* No device ? */
> +	if (!pdev)
> +		return -ENODEV;
> +
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !eeh_dev_passed(edev) ||
> +	    !edev->pe || !eeh_pe_passed(edev->pe))
> +		return -ENODEV;
> +
> +	if (pedev)
> +		*pedev = edev;
> +	if (ppe)
> +		*ppe = edev->pe;
> +
> +	return 0;
> +}
> +
> +/**
> + * eeh_pe_set_option - Set options for the indicated PE
> + * @pdev: PCI device
> + * @option: requested option
> + *
> + * The routine is called to enable or disable EEH functionality
> + * on the indicated PE, to enable IO or DMA for the frozen PE.
> + */
> +int eeh_pe_set_option(struct pci_dev *pdev, int option)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	switch (option) {
> +	case EEH_OPT_DISABLE:
> +	case EEH_OPT_ENABLE:
> +		break;
> +	case EEH_OPT_THAW_MMIO:
> +	case EEH_OPT_THAW_DMA:
> +		if (!eeh_ops || !eeh_ops->set_option) {
> +			ret = -ENOENT;
> +			break;
> +		}
> +
> +		ret = eeh_ops->set_option(pe, option);
> +		break;
> +	default:
> +		pr_debug("%s: Option %d out of range (%d, %d)\n",
> +			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_set_option);
> +
> +/**
> + * eeh_pe_get_addr - Retrieve the PE address or sharing mode
> + * @pdev: PCI device
> + * @option: option
> + *
> + * Retrieve the PE address or sharing mode.
> + */
> +int eeh_pe_get_addr(struct pci_dev *pdev, int option)
> +{
> +	struct pci_bus *bus;
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	switch (option) {
> +	case EEH_OPT_GET_PE_ADDR:
> +		bus = eeh_pe_bus_get(pe);
> +		if (!bus) {
> +			ret = -ENODEV;
> +			break;
> +		}
> +
> +		/* PE address has format "00BBSS00" */
> +		ret = bus->number << 16;
> +		break;
> +	case EEH_OPT_GET_PE_MODE:
> +		/* Wa always have shared PE */
> +		ret = EEH_PE_MODE_SHARED;
> +		break;
> +	default:
> +		pr_debug("%s: option %d out of range (0, 1)\n",
> +			__func__, option);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_get_addr);
> +
> +/**
> + * eeh_pe_get_state - Retrieve PE's state
> + * @pdev: PCI device
> + *
> + * Retrieve the PE's state, which includes 3 aspects: enabled
> + * DMA, enabled IO and asserted reset.
> + */
> +int eeh_pe_get_state(struct pci_dev *pdev)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int result, ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	if (!eeh_ops || !eeh_ops->get_state)
> +		return -ENOENT;
> +
> +	result = eeh_ops->get_state(pe, NULL);
> +	if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +	     (result & EEH_STATE_DMA_ENABLED) &&
> +	     (result & EEH_STATE_MMIO_ENABLED))
> +		ret = EEH_PE_STATE_NORMAL;
> +	else if (result & EEH_STATE_RESET_ACTIVE)
> +		ret = EEH_PE_STATE_RESET;
> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +		 !(result & EEH_STATE_DMA_ENABLED) &&
> +		 !(result & EEH_STATE_MMIO_ENABLED))
> +		ret = EEH_PE_STATE_STOPPED_IO_DMA;
> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +		 (result & EEH_STATE_DMA_ENABLED) &&
> +		 !(result & EEH_STATE_MMIO_ENABLED))
> +		ret = EEH_PE_STATE_STOPPED_DMA;
> +	else
> +		ret = EEH_PE_STATE_UNAVAIL;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_get_state);
> +
> +/**
> + * eeh_pe_reset - Issue PE reset according to specified type
> + * @pdev: PCI device
> + * @option: reset type
> + *
> + * The routine is called to reset the specified PE with the
> + * indicated type, either fundamental reset or hot reset.
> + * PE reset is the most important part for error recovery.
> + */
> +int eeh_pe_reset(struct pci_dev *pdev, int option)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
> +		return -ENOENT;
> +
> +	switch (option) {
> +	case EEH_RESET_DEACTIVATE:
> +		ret = eeh_ops->reset(pe, option);
> +		if (ret)
> +			break;
> +
> +		/*
> +		 * The PE is still in frozen state and we need clear
> +		 * that. It's good to clear frozen state after deassert
> +		 * to avoid messy IO access during reset, which might
> +		 * cause recursive frozen PE.
> +		 */
> +		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
> +		if (!ret)
> +			ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
> +		if (!ret)
> +			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
> +		break;
> +	case EEH_RESET_HOT:
> +	case EEH_RESET_FUNDAMENTAL:
> +		ret = eeh_ops->reset(pe, option);
> +		break;
> +	default:
> +		pr_debug("%s: Unsupported option %d\n",
> +			__func__, option);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_reset);
> +
> +/**
> + * eeh_pe_configure - Configure PCI bridges after PE reset
> + * @pdev: PCI device
> + *
> + * The routine is called to restore the PCI config space for
> + * those PCI devices, especially PCI bridges affected by PE
> + * reset issued previously.
> + */
> +int eeh_pe_configure(struct pci_dev *pdev)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_dev_check(pdev, &edev, &pe);
> +	if (ret)
> +		return ret;
> +
> +	/* Restore config space for the affected devices */
> +	eeh_pe_restore_bars(pe);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_pe_configure);
> +
>  static int proc_eeh_show(struct seq_file *m, void *v)
>  {
>  	if (!eeh_enabled()) {
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 7ba0424..301ac18 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -25,6 +25,9 @@
>  #include <linux/types.h>
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
> +#ifdef CONFIG_EEH
> +#include <asm/eeh.h>
> +#endif

Can we make a vfio_pci_eeh file that properly stubs out anything called
from common code?  I don't want to see all these inline ifdefs.

>  
>  #include "vfio_pci_private.h"
>  
> @@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  	pci_restore_state(pdev);
>  }
>  
> +static void vfio_eeh_pci_release(struct pci_dev *pdev)
> +{
> +#ifdef CONFIG_EEH
> +	eeh_dev_release(pdev);
> +#endif
> +}
> +
>  static void vfio_pci_release(void *device_data)
>  {
>  	struct vfio_pci_device *vdev = device_data;
>  
> -	if (atomic_dec_and_test(&vdev->refcnt))
> +	if (atomic_dec_and_test(&vdev->refcnt)) {
> +		vfio_eeh_pci_release(vdev->pdev);
>  		vfio_pci_disable(vdev);
> +	}
>  
>  	module_put(THIS_MODULE);
>  }
>  
> +static int vfio_eeh_pci_open(struct pci_dev *pdev)
> +{
> +	int ret = 0;
> +
> +#ifdef CONFIG_EEH
> +	ret = eeh_dev_open(pdev);
> +#endif
> +	return ret;
> +}
> +
>  static int vfio_pci_open(void *device_data)
>  {
>  	struct vfio_pci_device *vdev = device_data;
> +	int ret;
>  
>  	if (!try_module_get(THIS_MODULE))
>  		return -ENODEV;
>  
>  	if (atomic_inc_return(&vdev->refcnt) == 1) {
> -		int ret = vfio_pci_enable(vdev);
> -		if (ret) {
> -			module_put(THIS_MODULE);
> -			return ret;
> -		}
> +		ret = vfio_pci_enable(vdev);
> +		if (ret)
> +			goto error;
> +
> +		ret = vfio_eeh_pci_open(vdev->pdev);
> +		if (ret)
> +			goto error;
>  	}
>  
>  	return 0;
> +error:
> +	module_put(THIS_MODULE);
> +	return ret;
>  }
>  
>  static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> @@ -321,6 +349,91 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>  	return walk.ret;
>  }
>  
> +static int vfio_eeh_pci_ioctl(struct pci_dev *pdev,
> +			      unsigned int cmd,
> +			      unsigned long arg)
> +{
> +	unsigned long minsz;
> +	int ret = 0;
> +
> +#ifdef CONFIG_EEH
> +	switch (cmd) {
> +	case VFIO_EEH_PE_SET_OPTION: {
> +		struct vfio_eeh_pe_set_option option;
> +
> +		minsz = offsetofend(struct vfio_eeh_pe_set_option, option);
> +		if (copy_from_user(&option, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (option.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = eeh_pe_set_option(pdev, option.option);
> +		break;
> +	}
> +	case VFIO_EEH_PE_GET_ADDR: {
> +		struct vfio_eeh_pe_get_addr addr;
> +
> +		minsz = offsetofend(struct vfio_eeh_pe_get_addr, info);
> +		if (copy_from_user(&addr, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (addr.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = eeh_pe_get_addr(pdev, addr.option);
> +		if (ret >= 0) {
> +			addr.info = ret;
> +			if (copy_to_user((void __user *)arg, &addr, minsz))
> +				return -EFAULT;
> +			ret = 0;
> +		}
> +
> +		break;
> +	}
> +	case VFIO_EEH_PE_GET_STATE: {
> +		struct vfio_eeh_pe_get_state state;
> +
> +		minsz = offsetofend(struct vfio_eeh_pe_get_state, state);
> +		if (copy_from_user(&state, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (state.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = eeh_pe_get_state(pdev);
> +		if (ret >= 0) {
> +			state.state = ret;
> +			if (copy_to_user((void __user *)arg, &state, minsz))
> +				return -EFAULT;
> +			ret = 0;
> +		}
> +		break;
> +	}
> +	case VFIO_EEH_PE_RESET: {
> +		struct vfio_eeh_pe_reset reset;
> +
> +		minsz = offsetofend(struct vfio_eeh_pe_reset, option);
> +		if (copy_from_user(&reset, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (reset.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = eeh_pe_reset(pdev, reset.option);
> +		break;
> +	}
> +	case VFIO_EEH_PE_CONFIGURE:
> +		ret = eeh_pe_configure(pdev);
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		pr_debug("%s: Cannot handle command %d\n",
> +			__func__, cmd);
> +	}
> +#else
> +	ret = -ENOENT;
> +#endif
> +
> +	return ret;
> +}
> +
>  static long vfio_pci_ioctl(void *device_data,
>  			   unsigned int cmd, unsigned long arg)
>  {
> @@ -682,6 +795,12 @@ hot_reset_release:
>  
>  		kfree(groups);
>  		return ret;
> +	} else if (cmd == VFIO_EEH_PE_SET_OPTION ||
> +		   cmd == VFIO_EEH_PE_GET_ADDR ||
> +		   cmd == VFIO_EEH_PE_GET_STATE ||
> +		   cmd == VFIO_EEH_PE_RESET ||
> +		   cmd == VFIO_EEH_PE_CONFIGURE) {
> +			return vfio_eeh_pci_ioctl(vdev->pdev, cmd, arg);
>  	}
>  
>  	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index cb9023d..ef55682 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>  
>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>  
> +/*
> + * EEH functionality can be enabled or disabled on one specific device.
> + * Also, the DMA or IO frozen state can be removed from the frozen PE
> + * if required.
> + */
> +struct vfio_eeh_pe_set_option {
> +	__u32 argsz;
> +	__u32 option;
> +};
> +
> +#define VFIO_EEH_PE_SET_OPTION		_IO(VFIO_TYPE, VFIO_BASE + 21)

What proposed ioctls are making you jump to 21?

argsz is probably not as useful without a flags field.  Otherwise the
caller can't indicate what the extra space is.

> +
> +/*
> + * Each EEH PE should have unique address to be identified. The command
> + * helps to retrieve the address and the sharing mode of the PE.
> + */
> +struct vfio_eeh_pe_get_addr {
> +	__u32 argsz;
> +	__u32 option;
> +	__u32 info;
> +};
> +
> +#define VFIO_EEH_PE_GET_ADDR		_IO(VFIO_TYPE, VFIO_BASE + 22)
> +
> +/*
> + * EEH PE might have been frozen because of PCI errors. Also, it might
> + * be experiencing reset for error revoery. The following command helps
> + * to get the state.
> + */
> +struct vfio_eeh_pe_get_state {
> +	__u32 argsz;
> +	__u32 state;
> +};
> +
> +#define VFIO_EEH_PE_GET_STATE		_IO(VFIO_TYPE, VFIO_BASE + 23)
> +
> +/*
> + * Reset is the major step to recover problematic PE. The following
> + * command helps on that.
> + */
> +struct vfio_eeh_pe_reset {
> +	__u32 argsz;
> +	__u32 option;
> +};
> +
> +#define VFIO_EEH_PE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 24)
> +
> +/*
> + * One of the steps for recovery after PE reset is to configure the
> + * PCI bridges affected by the PE reset.
> + */
> +#define VFIO_EEH_PE_CONFIGURE		_IO(VFIO_TYPE, VFIO_BASE + 25)

What can the user do differently by making these separate ioctls?

> +
>  /* ***************************************************************** */
>  
>  #endif /* _UAPIVFIO_H */
Alex Williamson May 23, 2014, 3:23 a.m. UTC | #5
On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
> >On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
> >>On 22.05.14 10:23, Gavin Shan wrote:
> 
> .../...
> 
> >>>diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >>>index cb9023d..ef55682 100644
> >>>--- a/include/uapi/linux/vfio.h
> >>>+++ b/include/uapi/linux/vfio.h
> >>>@@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
> >>>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> >>>+/*
> >>>+ * EEH functionality can be enabled or disabled on one specific device.
> >>>+ * Also, the DMA or IO frozen state can be removed from the frozen PE
> >>>+ * if required.
> >>>+ */
> >>>+struct vfio_eeh_pe_set_option {
> >>>+	__u32 argsz;
> >>
> >>What is this argsz thing? Is this your way of maintaining backwards
> >>compatibility when we introduce new fields? A new field will change
> >>the ioctl number, so I don't think that makes a lot of sense :).
> >>
> >>Just make the ioctl have a u32 as incoming argument. No fancy
> >>structs, no complicated code.
> >>
> >>The same applies for a number of structs below.
> >>
> >
> >ok. Will do in next revision.
> >
> 
> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
> always has "argsz". I guess it was used as checker by Alex.W. Do you really
> want remove "argsz" ?


IIRC, this was actually a suggestion incorporated from David Gibson, but
using _IO with an argsz and flags field we can maintain compatibility
without bumping the ioctl number.  It really only makes sense if we have
a flags field so we can identify what additional information is being
provided.  Flags can be used as a bitmap of trailing structures or as
revision if we want a set of trailing structures that may change over
time.  Unless you can come up with a good argument against it that would
prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
prefer to keep it.  As I noted in a previous comment, the one ioctl we
have for reset that doesn't take any options is likely going to be the
first ioctl that we need to entirely replace.  If we don't keep argsz,
it seems like we probably need a flags field and reserved structures.

> >>>+	__u32 option;
> >>>+};
> >>>+
> >>>+#define VFIO_EEH_PE_SET_OPTION		_IO(VFIO_TYPE, VFIO_BASE + 21)
> >>>+
> >>>+/*
> >>>+ * Each EEH PE should have unique address to be identified. The command
> >>>+ * helps to retrieve the address and the sharing mode of the PE.
> >>>+ */
> >>>+struct vfio_eeh_pe_get_addr {
> >>>+	__u32 argsz;
> >>>+	__u32 option;
> >>>+	__u32 info;
> >>
> >>Any particular reason you need the info field? Can't the return value
> >>of the ioctl hold this? Then you only have a single u32 argument left
> >>to the ioctl again.
> >>
> >
> >ok. Will do in next revision.
> >
> 
> If we eventually remove "argsz" and let ioctl() return value to hold
> information (or negative number for errors), we don't need any data
> struct because the 3rd parameter of ioctl() would be used as input
> and I only need one input parameter. Do you want see this ?
> 
> Hopefully, Alex.W saw this and hasn't objections :)

I'm not sure why we're pushing for the minimal data set to pass to an
ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,

Alex
Gavin Shan May 23, 2014, 4:37 a.m. UTC | #6
On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
>> The patch adds new IOCTL commands for VFIO PCI device to support
>> EEH functionality for PCI devices, which have been passed through
>> from host to somebody else via VFIO.
>> 
>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> ---
>>  Documentation/vfio.txt         |  88 ++++++++++-
>>  arch/powerpc/include/asm/eeh.h |  17 +++
>>  arch/powerpc/kernel/eeh.c      | 321 +++++++++++++++++++++++++++++++++++++++++
>>  drivers/vfio/pci/vfio_pci.c    | 131 ++++++++++++++++-
>>  include/uapi/linux/vfio.h      |  53 +++++++
>>  5 files changed, 603 insertions(+), 7 deletions(-)
>
>Maybe a chicken and egg problem, but it seems like we could split the
>platform code and vfio code into separate patches.
>

Ok. I'll keep egg/chicken separated in next revision.

>> 
>> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>> index b9ca023..dd13db6 100644
>> --- a/Documentation/vfio.txt
>> +++ b/Documentation/vfio.txt
>> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
>>  an excellent performance which has limitations such as inability to do
>>  locked pages accounting in real time.
>>  
>> -So 3 additional ioctls have been added:
>> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
>> +which works on the basis of additional ioctl commands.
>> +
>> +So 8 additional ioctls have been added:
>>  
>>  	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>>  		of the DMA window on the PCI bus.
>> @@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
>>  
>>  	VFIO_IOMMU_DISABLE - disables the container.
>>  
>> +	VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the
>> +		specified device. Also, it can be used to remove IO or DMA
>> +		stopped state on the frozen PE.
>> +
>> +	VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
>> +		PE or query PE sharing mode.
>> +
>> +	VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
>> +
>> +	VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
>> +		error recovering.
>> +
>> +	VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
>> +		one of the major steps for error recoverying.
>>  
>>  The code flow from the example above should be slightly changed:
>>  
>> @@ -346,6 +363,75 @@ The code flow from the example above should be slightly changed:
>>  	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>>  	.....
>>  
>> +Based on the initial example we have, the following piece of code could be
>> +reference for EEH setup and error handling:
>> +
>> +	struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
>> +	struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
>> +	struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
>> +	struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
>> +	struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
>> +
>> +	....
>> +
>> +	/* Get a file descriptor for the device */
>> +	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
>> +
>> +	/* Enable the EEH functionality on the device */
>> +	option.option = EEH_OPT_ENABLE;
>> +	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
>> +
>> +	/* Retrieve PE address and create and maintain PE by yourself */
>> +	addr.option = EEH_OPT_GET_PE_ADDR;
>> +	ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
>> +
>> +	/* Assure EEH is supported on the PE and make PE functional */
>> +	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
>> +
>> +	/* Save device's state. pci_save_state() would be good enough
>> +	 * as an example.
>> +	 */
>> +
>> +	/* Test and setup the device */
>> +	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
>> +
>> +	....
>> +
>> +	/* When 0xFF's returned from reading PCI config space or IO BARs
>> +	 * of the PCI device. Check the PE state to see if that has been
>> +	 * frozen.
>> +	 */
>> +	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
>
>There's no notification, the user needs to observe the return value an
>poll?  Should we be enabling an eventfd to notify the user of the state
>change?
>

Yes. The user needs to monitor the return value. we should have one notification,
but it's for later as we discussed :-)

>> +
>> +	/* Waiting for pending PCI transactions to be completed and don't
>> +	 * produce any more PCI traffic from/to the affected PE until
>> +	 * recovery is finished.
>> +	 */
>> +
>> +	/* Enable IO for the affected PE and collect logs. Usually, the
>> +	 * standard part of PCI config space, AER registers are dumped
>> +	 * as logs for further analysis.
>> +	 */
>> +	option.option = EEH_OPT_THAW_MMIO;
>> +	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
>
>How does the guest learn about the error?  Does it need to?

When guest detects 0xFF's from reading PCI config space or IO, it's going
check the device (PE) state. If the device (PE) has been put into frozen
state, the recovery will be started.

>> +
>> +	/* Issue PE reset */
>> +	reset.option = EEH_RESET_HOT;
>> +	ioctl(device, VFIO_EEH_PE_RESET, &reset);
>> +
>> +	/* Configure the PCI bridges for the affected PE */
>> +	ioctl(device, VFIO_EEH_PE_CONFIGURE, NULL);
>> +
>
>I'm not sure I see why we've split these into separate ioctls.  FWIW,
>the one ioctl we currently have for reset that takes no options is
>probably going to be the first to get deprecated because of it.
>
>> +	/* Restored state we saved at initialization time. pci_restore_state()
>> +	 * is good enough as an example.
>> +	 */
>> +
>> +	/* Hopefully, error is recovered successfully. Now, you can resume to
>> +	 * start PCI traffic to/from the affected PE.
>> +	 */
>> +
>> +	....
>> +
>>  -------------------------------------------------------------------------------
>>  
>>  [1] VFIO was originally an acronym for "Virtual Function I/O" in its
>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>> index 34a2d83..dd5f1cf 100644
>> --- a/arch/powerpc/include/asm/eeh.h
>> +++ b/arch/powerpc/include/asm/eeh.h
>> @@ -191,6 +191,11 @@ enum {
>>  #define EEH_OPT_ENABLE		1	/* EEH enable	*/
>>  #define EEH_OPT_THAW_MMIO	2	/* MMIO enable	*/
>>  #define EEH_OPT_THAW_DMA	3	/* DMA enable	*/
>> +#define EEH_OPT_GET_PE_ADDR	0	/* Get PE addr	*/
>> +#define EEH_OPT_GET_PE_MODE	1	/* Get PE mode	*/
>> +#define EEH_PE_MODE_NONE	0	/* Invalid mode	*/
>> +#define EEH_PE_MODE_NOT_SHARED	1	/* Not shared	*/
>> +#define EEH_PE_MODE_SHARED	2	/* Shared mode	*/
>>  #define EEH_STATE_UNAVAILABLE	(1 << 0)	/* State unavailable	*/
>>  #define EEH_STATE_NOT_SUPPORT	(1 << 1)	/* EEH not supported	*/
>>  #define EEH_STATE_RESET_ACTIVE	(1 << 2)	/* Active reset		*/
>> @@ -198,6 +203,11 @@ enum {
>>  #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA		*/
>>  #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled		*/
>>  #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled		*/
>> +#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
>> +#define EEH_PE_STATE_RESET		1		/* PE reset	*/
>> +#define EEH_PE_STATE_STOPPED_IO_DMA	2		/* Stopped	*/
>> +#define EEH_PE_STATE_STOPPED_DMA	4		/* Stopped DMA	*/
>> +#define EEH_PE_STATE_UNAVAIL		5		/* Unavailable	*/
>>  #define EEH_RESET_DEACTIVATE	0	/* Deactivate the PE reset	*/
>>  #define EEH_RESET_HOT		1	/* Hot reset			*/
>>  #define EEH_RESET_FUNDAMENTAL	3	/* Fundamental reset		*/
>> @@ -305,6 +315,13 @@ void eeh_add_device_late(struct pci_dev *);
>>  void eeh_add_device_tree_late(struct pci_bus *);
>>  void eeh_add_sysfs_files(struct pci_bus *);
>>  void eeh_remove_device(struct pci_dev *);
>> +int eeh_dev_open(struct pci_dev *pdev);
>> +void eeh_dev_release(struct pci_dev *pdev);
>> +int eeh_pe_set_option(struct pci_dev *pdev, int option);
>> +int eeh_pe_get_addr(struct pci_dev *pdev, int option);
>> +int eeh_pe_get_state(struct pci_dev *pdev);
>> +int eeh_pe_reset(struct pci_dev *pdev, int option);
>> +int eeh_pe_configure(struct pci_dev *pdev);
>>  
>>  /**
>>   * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
>> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
>> index 9c6b899..b90a474 100644
>> --- a/arch/powerpc/kernel/eeh.c
>> +++ b/arch/powerpc/kernel/eeh.c
>> @@ -108,6 +108,9 @@ struct eeh_ops *eeh_ops = NULL;
>>  /* Lock to avoid races due to multiple reports of an error */
>>  DEFINE_RAW_SPINLOCK(confirm_error_lock);
>>  
>> +/* Lock to protect passed flags */
>> +static DEFINE_MUTEX(eeh_dev_mutex);
>> +
>>  /* Buffer for reporting pci register dumps. Its here in BSS, and
>>   * not dynamically alloced, so that it ends up in RMO where RTAS
>>   * can access it.
>> @@ -1098,6 +1101,324 @@ void eeh_remove_device(struct pci_dev *dev)
>>  	edev->mode &= ~EEH_DEV_SYSFS;
>>  }
>>  
>> +/**
>> + * eeh_dev_open - Mark EEH device and PE as passed through
>> + * @pdev: PCI device
>> + *
>> + * Mark the indicated EEH device and PE as passed through.
>> + * In the result, the EEH errors detected on the PE won't be
>> + * reported. The owner of the device will be responsible for
>> + * detection and recovery.
>> + */
>> +int eeh_dev_open(struct pci_dev *pdev)
>> +{
>> +	struct eeh_dev *edev;
>> +
>> +	mutex_lock(&eeh_dev_mutex);
>> +
>> +	/* No PCI device ? */
>> +	if (!pdev) {
>> +		mutex_unlock(&eeh_dev_mutex);
>> +		return -ENODEV;
>> +	}
>> +
>> +	/* No EEH device ? */
>> +	edev = pci_dev_to_eeh_dev(pdev);
>> +	if (!edev || !edev->pe) {
>> +		mutex_unlock(&eeh_dev_mutex);
>> +		return -ENODEV;
>> +	}
>> +
>> +	eeh_dev_set_passed(edev, true);
>> +	eeh_pe_set_passed(edev->pe, true);
>> +	mutex_unlock(&eeh_dev_mutex);
>> +
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(eeh_dev_open);
>> +
>> +/**
>> + * eeh_dev_release - Reclaim the ownership of EEH device
>> + * @pdev: PCI device
>> + *
>> + * Reclaim ownership of EEH device, potentially the corresponding
>> + * PE. In the result, the EEH errors detected on the PE will be
>> + * reported and handled as usual.
>> + */
>> +void eeh_dev_release(struct pci_dev *pdev)
>> +{
>> +	bool release_pe = true;
>> +	struct eeh_pe *pe = NULL;
>> +	struct eeh_dev *tmp, *edev;
>> +
>> +	mutex_lock(&eeh_dev_mutex);
>> +
>> +	/* No PCI device ? */
>> +	if (!pdev) {
>> +		mutex_unlock(&eeh_dev_mutex);
>> +		return;
>> +	}
>> +
>> +	/* No EEH device ? */
>> +	edev = pci_dev_to_eeh_dev(pdev);
>> +	if (!edev || !eeh_dev_passed(edev) ||
>> +	    !edev->pe || !eeh_pe_passed(pe)) {
>> +		mutex_unlock(&eeh_dev_mutex);
>> +		return;
>> +	}
>> +
>> +	/* Release device */
>> +	pe = edev->pe;
>> +	eeh_dev_set_passed(edev, false);
>> +
>> +	/* Release PE */
>> +	eeh_pe_for_each_dev(pe, edev, tmp) {
>> +		if (eeh_dev_passed(edev)) {
>> +			release_pe = false;
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (release_pe)
>> +		eeh_pe_set_passed(pe, false);
>> +
>> +	mutex_unlock(&eeh_dev_mutex);
>> +}
>> +EXPORT_SYMBOL(eeh_dev_release);
>> +
>> +static int eeh_dev_check(struct pci_dev *pdev,
>> +			 struct eeh_dev **pedev,
>> +			 struct eeh_pe **ppe)
>> +{
>> +	struct eeh_dev *edev;
>> +
>> +	/* No device ? */
>> +	if (!pdev)
>> +		return -ENODEV;
>> +
>> +	edev = pci_dev_to_eeh_dev(pdev);
>> +	if (!edev || !eeh_dev_passed(edev) ||
>> +	    !edev->pe || !eeh_pe_passed(edev->pe))
>> +		return -ENODEV;
>> +
>> +	if (pedev)
>> +		*pedev = edev;
>> +	if (ppe)
>> +		*ppe = edev->pe;
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * eeh_pe_set_option - Set options for the indicated PE
>> + * @pdev: PCI device
>> + * @option: requested option
>> + *
>> + * The routine is called to enable or disable EEH functionality
>> + * on the indicated PE, to enable IO or DMA for the frozen PE.
>> + */
>> +int eeh_pe_set_option(struct pci_dev *pdev, int option)
>> +{
>> +	struct eeh_dev *edev;
>> +	struct eeh_pe *pe;
>> +	int ret = 0;
>> +
>> +	/* Device existing ? */
>> +	ret = eeh_dev_check(pdev, &edev, &pe);
>> +	if (ret)
>> +		return ret;
>> +
>> +	switch (option) {
>> +	case EEH_OPT_DISABLE:
>> +	case EEH_OPT_ENABLE:
>> +		break;
>> +	case EEH_OPT_THAW_MMIO:
>> +	case EEH_OPT_THAW_DMA:
>> +		if (!eeh_ops || !eeh_ops->set_option) {
>> +			ret = -ENOENT;
>> +			break;
>> +		}
>> +
>> +		ret = eeh_ops->set_option(pe, option);
>> +		break;
>> +	default:
>> +		pr_debug("%s: Option %d out of range (%d, %d)\n",
>> +			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
>> +		ret = -EINVAL;
>> +	}
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(eeh_pe_set_option);
>> +
>> +/**
>> + * eeh_pe_get_addr - Retrieve the PE address or sharing mode
>> + * @pdev: PCI device
>> + * @option: option
>> + *
>> + * Retrieve the PE address or sharing mode.
>> + */
>> +int eeh_pe_get_addr(struct pci_dev *pdev, int option)
>> +{
>> +	struct pci_bus *bus;
>> +	struct eeh_dev *edev;
>> +	struct eeh_pe *pe;
>> +	int ret = 0;
>> +
>> +	/* Device existing ? */
>> +	ret = eeh_dev_check(pdev, &edev, &pe);
>> +	if (ret)
>> +		return ret;
>> +
>> +	switch (option) {
>> +	case EEH_OPT_GET_PE_ADDR:
>> +		bus = eeh_pe_bus_get(pe);
>> +		if (!bus) {
>> +			ret = -ENODEV;
>> +			break;
>> +		}
>> +
>> +		/* PE address has format "00BBSS00" */
>> +		ret = bus->number << 16;
>> +		break;
>> +	case EEH_OPT_GET_PE_MODE:
>> +		/* Wa always have shared PE */
>> +		ret = EEH_PE_MODE_SHARED;
>> +		break;
>> +	default:
>> +		pr_debug("%s: option %d out of range (0, 1)\n",
>> +			__func__, option);
>> +		ret = -EINVAL;
>> +	}
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(eeh_pe_get_addr);
>> +
>> +/**
>> + * eeh_pe_get_state - Retrieve PE's state
>> + * @pdev: PCI device
>> + *
>> + * Retrieve the PE's state, which includes 3 aspects: enabled
>> + * DMA, enabled IO and asserted reset.
>> + */
>> +int eeh_pe_get_state(struct pci_dev *pdev)
>> +{
>> +	struct eeh_dev *edev;
>> +	struct eeh_pe *pe;
>> +	int result, ret = 0;
>> +
>> +	/* Device existing ? */
>> +	ret = eeh_dev_check(pdev, &edev, &pe);
>> +	if (ret)
>> +		return ret;
>> +
>> +	if (!eeh_ops || !eeh_ops->get_state)
>> +		return -ENOENT;
>> +
>> +	result = eeh_ops->get_state(pe, NULL);
>> +	if (!(result & EEH_STATE_RESET_ACTIVE) &&
>> +	     (result & EEH_STATE_DMA_ENABLED) &&
>> +	     (result & EEH_STATE_MMIO_ENABLED))
>> +		ret = EEH_PE_STATE_NORMAL;
>> +	else if (result & EEH_STATE_RESET_ACTIVE)
>> +		ret = EEH_PE_STATE_RESET;
>> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
>> +		 !(result & EEH_STATE_DMA_ENABLED) &&
>> +		 !(result & EEH_STATE_MMIO_ENABLED))
>> +		ret = EEH_PE_STATE_STOPPED_IO_DMA;
>> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
>> +		 (result & EEH_STATE_DMA_ENABLED) &&
>> +		 !(result & EEH_STATE_MMIO_ENABLED))
>> +		ret = EEH_PE_STATE_STOPPED_DMA;
>> +	else
>> +		ret = EEH_PE_STATE_UNAVAIL;
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(eeh_pe_get_state);
>> +
>> +/**
>> + * eeh_pe_reset - Issue PE reset according to specified type
>> + * @pdev: PCI device
>> + * @option: reset type
>> + *
>> + * The routine is called to reset the specified PE with the
>> + * indicated type, either fundamental reset or hot reset.
>> + * PE reset is the most important part for error recovery.
>> + */
>> +int eeh_pe_reset(struct pci_dev *pdev, int option)
>> +{
>> +	struct eeh_dev *edev;
>> +	struct eeh_pe *pe;
>> +	int ret = 0;
>> +
>> +	/* Device existing ? */
>> +	ret = eeh_dev_check(pdev, &edev, &pe);
>> +	if (ret)
>> +		return ret;
>> +
>> +	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
>> +		return -ENOENT;
>> +
>> +	switch (option) {
>> +	case EEH_RESET_DEACTIVATE:
>> +		ret = eeh_ops->reset(pe, option);
>> +		if (ret)
>> +			break;
>> +
>> +		/*
>> +		 * The PE is still in frozen state and we need clear
>> +		 * that. It's good to clear frozen state after deassert
>> +		 * to avoid messy IO access during reset, which might
>> +		 * cause recursive frozen PE.
>> +		 */
>> +		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
>> +		if (!ret)
>> +			ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
>> +		if (!ret)
>> +			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
>> +		break;
>> +	case EEH_RESET_HOT:
>> +	case EEH_RESET_FUNDAMENTAL:
>> +		ret = eeh_ops->reset(pe, option);
>> +		break;
>> +	default:
>> +		pr_debug("%s: Unsupported option %d\n",
>> +			__func__, option);
>> +		ret = -EINVAL;
>> +	}
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(eeh_pe_reset);
>> +
>> +/**
>> + * eeh_pe_configure - Configure PCI bridges after PE reset
>> + * @pdev: PCI device
>> + *
>> + * The routine is called to restore the PCI config space for
>> + * those PCI devices, especially PCI bridges affected by PE
>> + * reset issued previously.
>> + */
>> +int eeh_pe_configure(struct pci_dev *pdev)
>> +{
>> +	struct eeh_dev *edev;
>> +	struct eeh_pe *pe;
>> +	int ret = 0;
>> +
>> +	/* Device existing ? */
>> +	ret = eeh_dev_check(pdev, &edev, &pe);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Restore config space for the affected devices */
>> +	eeh_pe_restore_bars(pe);
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(eeh_pe_configure);
>> +
>>  static int proc_eeh_show(struct seq_file *m, void *v)
>>  {
>>  	if (!eeh_enabled()) {
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index 7ba0424..301ac18 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -25,6 +25,9 @@
>>  #include <linux/types.h>
>>  #include <linux/uaccess.h>
>>  #include <linux/vfio.h>
>> +#ifdef CONFIG_EEH
>> +#include <asm/eeh.h>
>> +#endif
>
>Can we make a vfio_pci_eeh file that properly stubs out anything called
>from common code?  I don't want to see all these inline ifdefs.
>

well. do you want see drivers/vfio/vfio-pci/vfio_pci_eeh.c and export
following functins for vfio_pci.c to use?

vfio_pci_eeh_open()
vfio_pci_eeh_release()
vfio_pci_eeh_ioctl()


>>  
>>  #include "vfio_pci_private.h"
>>  
>> @@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>>  	pci_restore_state(pdev);
>>  }
>>  
>> +static void vfio_eeh_pci_release(struct pci_dev *pdev)
>> +{
>> +#ifdef CONFIG_EEH
>> +	eeh_dev_release(pdev);
>> +#endif
>> +}
>> +
>>  static void vfio_pci_release(void *device_data)
>>  {
>>  	struct vfio_pci_device *vdev = device_data;
>>  
>> -	if (atomic_dec_and_test(&vdev->refcnt))
>> +	if (atomic_dec_and_test(&vdev->refcnt)) {
>> +		vfio_eeh_pci_release(vdev->pdev);
>>  		vfio_pci_disable(vdev);
>> +	}
>>  
>>  	module_put(THIS_MODULE);
>>  }
>>  
>> +static int vfio_eeh_pci_open(struct pci_dev *pdev)
>> +{
>> +	int ret = 0;
>> +
>> +#ifdef CONFIG_EEH
>> +	ret = eeh_dev_open(pdev);
>> +#endif
>> +	return ret;
>> +}
>> +
>>  static int vfio_pci_open(void *device_data)
>>  {
>>  	struct vfio_pci_device *vdev = device_data;
>> +	int ret;
>>  
>>  	if (!try_module_get(THIS_MODULE))
>>  		return -ENODEV;
>>  
>>  	if (atomic_inc_return(&vdev->refcnt) == 1) {
>> -		int ret = vfio_pci_enable(vdev);
>> -		if (ret) {
>> -			module_put(THIS_MODULE);
>> -			return ret;
>> -		}
>> +		ret = vfio_pci_enable(vdev);
>> +		if (ret)
>> +			goto error;
>> +
>> +		ret = vfio_eeh_pci_open(vdev->pdev);
>> +		if (ret)
>> +			goto error;
>>  	}
>>  
>>  	return 0;
>> +error:
>> +	module_put(THIS_MODULE);
>> +	return ret;
>>  }
>>  
>>  static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
>> @@ -321,6 +349,91 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>>  	return walk.ret;
>>  }
>>  
>> +static int vfio_eeh_pci_ioctl(struct pci_dev *pdev,
>> +			      unsigned int cmd,
>> +			      unsigned long arg)
>> +{
>> +	unsigned long minsz;
>> +	int ret = 0;
>> +
>> +#ifdef CONFIG_EEH
>> +	switch (cmd) {
>> +	case VFIO_EEH_PE_SET_OPTION: {
>> +		struct vfio_eeh_pe_set_option option;
>> +
>> +		minsz = offsetofend(struct vfio_eeh_pe_set_option, option);
>> +		if (copy_from_user(&option, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +		if (option.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		ret = eeh_pe_set_option(pdev, option.option);
>> +		break;
>> +	}
>> +	case VFIO_EEH_PE_GET_ADDR: {
>> +		struct vfio_eeh_pe_get_addr addr;
>> +
>> +		minsz = offsetofend(struct vfio_eeh_pe_get_addr, info);
>> +		if (copy_from_user(&addr, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +		if (addr.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		ret = eeh_pe_get_addr(pdev, addr.option);
>> +		if (ret >= 0) {
>> +			addr.info = ret;
>> +			if (copy_to_user((void __user *)arg, &addr, minsz))
>> +				return -EFAULT;
>> +			ret = 0;
>> +		}
>> +
>> +		break;
>> +	}
>> +	case VFIO_EEH_PE_GET_STATE: {
>> +		struct vfio_eeh_pe_get_state state;
>> +
>> +		minsz = offsetofend(struct vfio_eeh_pe_get_state, state);
>> +		if (copy_from_user(&state, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +		if (state.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		ret = eeh_pe_get_state(pdev);
>> +		if (ret >= 0) {
>> +			state.state = ret;
>> +			if (copy_to_user((void __user *)arg, &state, minsz))
>> +				return -EFAULT;
>> +			ret = 0;
>> +		}
>> +		break;
>> +	}
>> +	case VFIO_EEH_PE_RESET: {
>> +		struct vfio_eeh_pe_reset reset;
>> +
>> +		minsz = offsetofend(struct vfio_eeh_pe_reset, option);
>> +		if (copy_from_user(&reset, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +		if (reset.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		ret = eeh_pe_reset(pdev, reset.option);
>> +		break;
>> +	}
>> +	case VFIO_EEH_PE_CONFIGURE:
>> +		ret = eeh_pe_configure(pdev);
>> +		break;
>> +	default:
>> +		ret = -EINVAL;
>> +		pr_debug("%s: Cannot handle command %d\n",
>> +			__func__, cmd);
>> +	}
>> +#else
>> +	ret = -ENOENT;
>> +#endif
>> +
>> +	return ret;
>> +}
>> +
>>  static long vfio_pci_ioctl(void *device_data,
>>  			   unsigned int cmd, unsigned long arg)
>>  {
>> @@ -682,6 +795,12 @@ hot_reset_release:
>>  
>>  		kfree(groups);
>>  		return ret;
>> +	} else if (cmd == VFIO_EEH_PE_SET_OPTION ||
>> +		   cmd == VFIO_EEH_PE_GET_ADDR ||
>> +		   cmd == VFIO_EEH_PE_GET_STATE ||
>> +		   cmd == VFIO_EEH_PE_RESET ||
>> +		   cmd == VFIO_EEH_PE_CONFIGURE) {
>> +			return vfio_eeh_pci_ioctl(vdev->pdev, cmd, arg);
>>  	}
>>  
>>  	return -ENOTTY;
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index cb9023d..ef55682 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>>  
>>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>>  
>> +/*
>> + * EEH functionality can be enabled or disabled on one specific device.
>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
>> + * if required.
>> + */
>> +struct vfio_eeh_pe_set_option {
>> +	__u32 argsz;
>> +	__u32 option;
>> +};
>> +
>> +#define VFIO_EEH_PE_SET_OPTION		_IO(VFIO_TYPE, VFIO_BASE + 21)
>
>What proposed ioctls are making you jump to 21?
>
>argsz is probably not as useful without a flags field.  Otherwise the
>caller can't indicate what the extra space is.
>

The QEMU patches are based on Alexey's additional feature ("ddw"), which
consumed several ioctl commands.

ok. So you also prefer to remove "argsz"?

>> +
>> +/*
>> + * Each EEH PE should have unique address to be identified. The command
>> + * helps to retrieve the address and the sharing mode of the PE.
>> + */
>> +struct vfio_eeh_pe_get_addr {
>> +	__u32 argsz;
>> +	__u32 option;
>> +	__u32 info;
>> +};
>> +
>> +#define VFIO_EEH_PE_GET_ADDR		_IO(VFIO_TYPE, VFIO_BASE + 22)
>> +
>> +/*
>> + * EEH PE might have been frozen because of PCI errors. Also, it might
>> + * be experiencing reset for error revoery. The following command helps
>> + * to get the state.
>> + */
>> +struct vfio_eeh_pe_get_state {
>> +	__u32 argsz;
>> +	__u32 state;
>> +};
>> +
>> +#define VFIO_EEH_PE_GET_STATE		_IO(VFIO_TYPE, VFIO_BASE + 23)
>> +
>> +/*
>> + * Reset is the major step to recover problematic PE. The following
>> + * command helps on that.
>> + */
>> +struct vfio_eeh_pe_reset {
>> +	__u32 argsz;
>> +	__u32 option;
>> +};
>> +
>> +#define VFIO_EEH_PE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 24)
>> +
>> +/*
>> + * One of the steps for recovery after PE reset is to configure the
>> + * PCI bridges affected by the PE reset.
>> + */
>> +#define VFIO_EEH_PE_CONFIGURE		_IO(VFIO_TYPE, VFIO_BASE + 25)
>
>What can the user do differently by making these separate ioctls?
>

hrm, I didn't understood as well. Alex.G could have the explaination.

>> +
>>  /* ***************************************************************** */
>>  
>>  #endif /* _UAPIVFIO_H */

Thanks,
Gavin
Benjamin Herrenschmidt May 23, 2014, 5 a.m. UTC | #7
On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
> >There's no notification, the user needs to observe the return value an
> >poll?  Should we be enabling an eventfd to notify the user of the state
> >change?
> >
> 
> Yes. The user needs to monitor the return value. we should have one notification,
> but it's for later as we discussed :-)

 ../..

> >How does the guest learn about the error?  Does it need to?
> 
> When guest detects 0xFF's from reading PCI config space or IO, it's going
> check the device (PE) state. If the device (PE) has been put into frozen
> state, the recovery will be started.

Quick recap for Alex W (we discussed that with Alex G).

While a notification looks like a worthwhile addition in the long run, it
is not sufficient and not used today and I prefer that we keep that as something
to add later for those two main reasons:

 - First, the kernel itself isn't always notified. For example, if we implement
on top of an RTAS backend (PR KVM under pHyp) or if we are on top of PowerNV but
the error is a PHB "fence" (the entire PCI Host bridge gets fenced out in hardware
due to an internal error), then we get no notification. Only polling of the
hardware or firmware will tell us. Since we don't want to have a polling timer
in the kernel, that means that the userspace client of VFIO (or alternatively
the KVM guest) is the one that polls.

 - Second, this is how our primary user expects it: The primary (and only initial)
user of this will be qemu/KVM for PAPR guests and they don't have a notification
mechanism. Instead they query the EEH state after detecting an all 1's return from
MMIO or config space. This is how PAPR specifies it so we are just implementing the
spec here :-)

Because of these, I think we shouldn't worry too much about notification at
this stage.

Cheers,
Ben.
Alexander Graf May 23, 2014, 6:52 a.m. UTC | #8
> Am 23.05.2014 um 05:23 schrieb Alex Williamson <alex.williamson@redhat.com>:
> 
>> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
>>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
>>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
>>>> On 22.05.14 10:23, Gavin Shan wrote:
>> 
>> .../...
>> 
>>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>>>> index cb9023d..ef55682 100644
>>>>> --- a/include/uapi/linux/vfio.h
>>>>> +++ b/include/uapi/linux/vfio.h
>>>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>>>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO    _IO(VFIO_TYPE, VFIO_BASE + 12)
>>>>> +/*
>>>>> + * EEH functionality can be enabled or disabled on one specific device.
>>>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
>>>>> + * if required.
>>>>> + */
>>>>> +struct vfio_eeh_pe_set_option {
>>>>> +    __u32 argsz;
>>>> 
>>>> What is this argsz thing? Is this your way of maintaining backwards
>>>> compatibility when we introduce new fields? A new field will change
>>>> the ioctl number, so I don't think that makes a lot of sense :).
>>>> 
>>>> Just make the ioctl have a u32 as incoming argument. No fancy
>>>> structs, no complicated code.
>>>> 
>>>> The same applies for a number of structs below.
>>> 
>>> ok. Will do in next revision.
>> 
>> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
>> always has "argsz". I guess it was used as checker by Alex.W. Do you really
>> want remove "argsz" ?
> 
> 
> IIRC, this was actually a suggestion incorporated from David Gibson, but
> using _IO with an argsz and flags field we can maintain compatibility
> without bumping the ioctl number.  It really only makes sense if we have
> a flags field so we can identify what additional information is being
> provided.  Flags can be used as a bitmap of trailing structures or as
> revision if we want a set of trailing structures that may change over
> time.  Unless you can come up with a good argument against it that would
> prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
> prefer to keep it.  As I noted in a previous comment, the one ioctl we
> have for reset that doesn't take any options is likely going to be the
> first ioctl that we need to entirely replace.  If we don't keep argsz,
> it seems like we probably need a flags field and reserved structures.
> 
>>>>> +    __u32 option;
>>>>> +};
>>>>> +
>>>>> +#define VFIO_EEH_PE_SET_OPTION        _IO(VFIO_TYPE, VFIO_BASE + 21)
>>>>> +
>>>>> +/*
>>>>> + * Each EEH PE should have unique address to be identified. The command
>>>>> + * helps to retrieve the address and the sharing mode of the PE.
>>>>> + */
>>>>> +struct vfio_eeh_pe_get_addr {
>>>>> +    __u32 argsz;
>>>>> +    __u32 option;
>>>>> +    __u32 info;
>>>> 
>>>> Any particular reason you need the info field? Can't the return value
>>>> of the ioctl hold this? Then you only have a single u32 argument left
>>>> to the ioctl again.
>>> 
>>> ok. Will do in next revision.
>> 
>> If we eventually remove "argsz" and let ioctl() return value to hold
>> information (or negative number for errors), we don't need any data
>> struct because the 3rd parameter of ioctl() would be used as input
>> and I only need one input parameter. Do you want see this ?
>> 
>> Hopefully, Alex.W saw this and hasn't objections :)
> 
> I'm not sure why we're pushing for the minimal data set to pass to an
> ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,
> 

The ioctl number includes sizeof(payload). So if a new parameter gets added, that would be a different ioctl number.

If you want to maintain backwards compatibility ioctl number wise in the kernel, you'll have to have a "flags" field to indicate whether new data is available and a "pad" field, prefarably in a union, that ensures the size of the struct doesn't change.

I'm not sure it's really necessary here to have identical ioctl numbers if we add parameters, since we can always just define a new ioctl with a bigger payload that can then become the default handler and a shim backwards compatible handler with the old number.

But if you think it is important, let's do it for real, not just halfway.


Alex
Alexander Graf May 23, 2014, 6:55 a.m. UTC | #9
> Am 23.05.2014 um 06:37 schrieb Gavin Shan <gwshan@linux.vnet.ibm.com>:
> 
>> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>>> On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
>>> The patch adds new IOCTL commands for VFIO PCI device to support
>>> EEH functionality for PCI devices, which have been passed through
>>> from host to somebody else via VFIO.
>>> 
>>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>> ---
>>> Documentation/vfio.txt         |  88 ++++++++++-
>>> arch/powerpc/include/asm/eeh.h |  17 +++
>>> arch/powerpc/kernel/eeh.c      | 321 +++++++++++++++++++++++++++++++++++++++++
>>> drivers/vfio/pci/vfio_pci.c    | 131 ++++++++++++++++-
>>> include/uapi/linux/vfio.h      |  53 +++++++
>>> 5 files changed, 603 insertions(+), 7 deletions(-)
>> 
>> Maybe a chicken and egg problem, but it seems like we could split the
>> platform code and vfio code into separate patches.
> 
> Ok. I'll keep egg/chicken separated in next revision.
> 
>>> 
>>> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>>> index b9ca023..dd13db6 100644
>>> --- a/Documentation/vfio.txt
>>> +++ b/Documentation/vfio.txt
>>> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
>>> an excellent performance which has limitations such as inability to do
>>> locked pages accounting in real time.
>>> 
>>> -So 3 additional ioctls have been added:
>>> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
>>> +which works on the basis of additional ioctl commands.
>>> +
>>> +So 8 additional ioctls have been added:
>>> 
>>>    VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>>>        of the DMA window on the PCI bus.
>>> @@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
>>> 
>>>    VFIO_IOMMU_DISABLE - disables the container.
>>> 
>>> +    VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the
>>> +        specified device. Also, it can be used to remove IO or DMA
>>> +        stopped state on the frozen PE.
>>> +
>>> +    VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
>>> +        PE or query PE sharing mode.
>>> +
>>> +    VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
>>> +
>>> +    VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
>>> +        error recovering.
>>> +
>>> +    VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
>>> +        one of the major steps for error recoverying.
>>> 
>>> The code flow from the example above should be slightly changed:
>>> 
>>> @@ -346,6 +363,75 @@ The code flow from the example above should be slightly changed:
>>>    ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>>>    .....
>>> 
>>> +Based on the initial example we have, the following piece of code could be
>>> +reference for EEH setup and error handling:
>>> +
>>> +    struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
>>> +    struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
>>> +    struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
>>> +    struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
>>> +    struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
>>> +
>>> +    ....
>>> +
>>> +    /* Get a file descriptor for the device */
>>> +    device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
>>> +
>>> +    /* Enable the EEH functionality on the device */
>>> +    option.option = EEH_OPT_ENABLE;
>>> +    ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
>>> +
>>> +    /* Retrieve PE address and create and maintain PE by yourself */
>>> +    addr.option = EEH_OPT_GET_PE_ADDR;
>>> +    ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
>>> +
>>> +    /* Assure EEH is supported on the PE and make PE functional */
>>> +    ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
>>> +
>>> +    /* Save device's state. pci_save_state() would be good enough
>>> +     * as an example.
>>> +     */
>>> +
>>> +    /* Test and setup the device */
>>> +    ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
>>> +
>>> +    ....
>>> +
>>> +    /* When 0xFF's returned from reading PCI config space or IO BARs
>>> +     * of the PCI device. Check the PE state to see if that has been
>>> +     * frozen.
>>> +     */
>>> +    ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
>> 
>> There's no notification, the user needs to observe the return value an
>> poll?  Should we be enabling an eventfd to notify the user of the state
>> change?
> 
> Yes. The user needs to monitor the return value. we should have one notification,
> but it's for later as we discussed :-)
> 
>>> +
>>> +    /* Waiting for pending PCI transactions to be completed and don't
>>> +     * produce any more PCI traffic from/to the affected PE until
>>> +     * recovery is finished.
>>> +     */
>>> +
>>> +    /* Enable IO for the affected PE and collect logs. Usually, the
>>> +     * standard part of PCI config space, AER registers are dumped
>>> +     * as logs for further analysis.
>>> +     */
>>> +    option.option = EEH_OPT_THAW_MMIO;
>>> +    ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
>> 
>> How does the guest learn about the error?  Does it need to?
> 
> When guest detects 0xFF's from reading PCI config space or IO, it's going
> check the device (PE) state. If the device (PE) has been put into frozen
> state, the recovery will be started.
> 
>>> +
>>> +    /* Issue PE reset */
>>> +    reset.option = EEH_RESET_HOT;
>>> +    ioctl(device, VFIO_EEH_PE_RESET, &reset);
>>> +
>>> +    /* Configure the PCI bridges for the affected PE */
>>> +    ioctl(device, VFIO_EEH_PE_CONFIGURE, NULL);
>>> +
>> 
>> I'm not sure I see why we've split these into separate ioctls.  FWIW,
>> the one ioctl we currently have for reset that takes no options is
>> probably going to be the first to get deprecated because of it.
>> 
>>> +    /* Restored state we saved at initialization time. pci_restore_state()
>>> +     * is good enough as an example.
>>> +     */
>>> +
>>> +    /* Hopefully, error is recovered successfully. Now, you can resume to
>>> +     * start PCI traffic to/from the affected PE.
>>> +     */
>>> +
>>> +    ....
>>> +
>>> -------------------------------------------------------------------------------
>>> 
>>> [1] VFIO was originally an acronym for "Virtual Function I/O" in its
>>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>>> index 34a2d83..dd5f1cf 100644
>>> --- a/arch/powerpc/include/asm/eeh.h
>>> +++ b/arch/powerpc/include/asm/eeh.h
>>> @@ -191,6 +191,11 @@ enum {
>>> #define EEH_OPT_ENABLE        1    /* EEH enable    */
>>> #define EEH_OPT_THAW_MMIO    2    /* MMIO enable    */
>>> #define EEH_OPT_THAW_DMA    3    /* DMA enable    */
>>> +#define EEH_OPT_GET_PE_ADDR    0    /* Get PE addr    */
>>> +#define EEH_OPT_GET_PE_MODE    1    /* Get PE mode    */
>>> +#define EEH_PE_MODE_NONE    0    /* Invalid mode    */
>>> +#define EEH_PE_MODE_NOT_SHARED    1    /* Not shared    */
>>> +#define EEH_PE_MODE_SHARED    2    /* Shared mode    */
>>> #define EEH_STATE_UNAVAILABLE    (1 << 0)    /* State unavailable    */
>>> #define EEH_STATE_NOT_SUPPORT    (1 << 1)    /* EEH not supported    */
>>> #define EEH_STATE_RESET_ACTIVE    (1 << 2)    /* Active reset        */
>>> @@ -198,6 +203,11 @@ enum {
>>> #define EEH_STATE_DMA_ACTIVE    (1 << 4)    /* Active DMA        */
>>> #define EEH_STATE_MMIO_ENABLED    (1 << 5)    /* MMIO enabled        */
>>> #define EEH_STATE_DMA_ENABLED    (1 << 6)    /* DMA enabled        */
>>> +#define EEH_PE_STATE_NORMAL        0    /* Normal state        */
>>> +#define EEH_PE_STATE_RESET        1        /* PE reset    */
>>> +#define EEH_PE_STATE_STOPPED_IO_DMA    2        /* Stopped    */
>>> +#define EEH_PE_STATE_STOPPED_DMA    4        /* Stopped DMA    */
>>> +#define EEH_PE_STATE_UNAVAIL        5        /* Unavailable    */
>>> #define EEH_RESET_DEACTIVATE    0    /* Deactivate the PE reset    */
>>> #define EEH_RESET_HOT        1    /* Hot reset            */
>>> #define EEH_RESET_FUNDAMENTAL    3    /* Fundamental reset        */
>>> @@ -305,6 +315,13 @@ void eeh_add_device_late(struct pci_dev *);
>>> void eeh_add_device_tree_late(struct pci_bus *);
>>> void eeh_add_sysfs_files(struct pci_bus *);
>>> void eeh_remove_device(struct pci_dev *);
>>> +int eeh_dev_open(struct pci_dev *pdev);
>>> +void eeh_dev_release(struct pci_dev *pdev);
>>> +int eeh_pe_set_option(struct pci_dev *pdev, int option);
>>> +int eeh_pe_get_addr(struct pci_dev *pdev, int option);
>>> +int eeh_pe_get_state(struct pci_dev *pdev);
>>> +int eeh_pe_reset(struct pci_dev *pdev, int option);
>>> +int eeh_pe_configure(struct pci_dev *pdev);
>>> 
>>> /**
>>>  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
>>> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
>>> index 9c6b899..b90a474 100644
>>> --- a/arch/powerpc/kernel/eeh.c
>>> +++ b/arch/powerpc/kernel/eeh.c
>>> @@ -108,6 +108,9 @@ struct eeh_ops *eeh_ops = NULL;
>>> /* Lock to avoid races due to multiple reports of an error */
>>> DEFINE_RAW_SPINLOCK(confirm_error_lock);
>>> 
>>> +/* Lock to protect passed flags */
>>> +static DEFINE_MUTEX(eeh_dev_mutex);
>>> +
>>> /* Buffer for reporting pci register dumps. Its here in BSS, and
>>>  * not dynamically alloced, so that it ends up in RMO where RTAS
>>>  * can access it.
>>> @@ -1098,6 +1101,324 @@ void eeh_remove_device(struct pci_dev *dev)
>>>    edev->mode &= ~EEH_DEV_SYSFS;
>>> }
>>> 
>>> +/**
>>> + * eeh_dev_open - Mark EEH device and PE as passed through
>>> + * @pdev: PCI device
>>> + *
>>> + * Mark the indicated EEH device and PE as passed through.
>>> + * In the result, the EEH errors detected on the PE won't be
>>> + * reported. The owner of the device will be responsible for
>>> + * detection and recovery.
>>> + */
>>> +int eeh_dev_open(struct pci_dev *pdev)
>>> +{
>>> +    struct eeh_dev *edev;
>>> +
>>> +    mutex_lock(&eeh_dev_mutex);
>>> +
>>> +    /* No PCI device ? */
>>> +    if (!pdev) {
>>> +        mutex_unlock(&eeh_dev_mutex);
>>> +        return -ENODEV;
>>> +    }
>>> +
>>> +    /* No EEH device ? */
>>> +    edev = pci_dev_to_eeh_dev(pdev);
>>> +    if (!edev || !edev->pe) {
>>> +        mutex_unlock(&eeh_dev_mutex);
>>> +        return -ENODEV;
>>> +    }
>>> +
>>> +    eeh_dev_set_passed(edev, true);
>>> +    eeh_pe_set_passed(edev->pe, true);
>>> +    mutex_unlock(&eeh_dev_mutex);
>>> +
>>> +    return 0;
>>> +}
>>> +EXPORT_SYMBOL_GPL(eeh_dev_open);
>>> +
>>> +/**
>>> + * eeh_dev_release - Reclaim the ownership of EEH device
>>> + * @pdev: PCI device
>>> + *
>>> + * Reclaim ownership of EEH device, potentially the corresponding
>>> + * PE. In the result, the EEH errors detected on the PE will be
>>> + * reported and handled as usual.
>>> + */
>>> +void eeh_dev_release(struct pci_dev *pdev)
>>> +{
>>> +    bool release_pe = true;
>>> +    struct eeh_pe *pe = NULL;
>>> +    struct eeh_dev *tmp, *edev;
>>> +
>>> +    mutex_lock(&eeh_dev_mutex);
>>> +
>>> +    /* No PCI device ? */
>>> +    if (!pdev) {
>>> +        mutex_unlock(&eeh_dev_mutex);
>>> +        return;
>>> +    }
>>> +
>>> +    /* No EEH device ? */
>>> +    edev = pci_dev_to_eeh_dev(pdev);
>>> +    if (!edev || !eeh_dev_passed(edev) ||
>>> +        !edev->pe || !eeh_pe_passed(pe)) {
>>> +        mutex_unlock(&eeh_dev_mutex);
>>> +        return;
>>> +    }
>>> +
>>> +    /* Release device */
>>> +    pe = edev->pe;
>>> +    eeh_dev_set_passed(edev, false);
>>> +
>>> +    /* Release PE */
>>> +    eeh_pe_for_each_dev(pe, edev, tmp) {
>>> +        if (eeh_dev_passed(edev)) {
>>> +            release_pe = false;
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    if (release_pe)
>>> +        eeh_pe_set_passed(pe, false);
>>> +
>>> +    mutex_unlock(&eeh_dev_mutex);
>>> +}
>>> +EXPORT_SYMBOL(eeh_dev_release);
>>> +
>>> +static int eeh_dev_check(struct pci_dev *pdev,
>>> +             struct eeh_dev **pedev,
>>> +             struct eeh_pe **ppe)
>>> +{
>>> +    struct eeh_dev *edev;
>>> +
>>> +    /* No device ? */
>>> +    if (!pdev)
>>> +        return -ENODEV;
>>> +
>>> +    edev = pci_dev_to_eeh_dev(pdev);
>>> +    if (!edev || !eeh_dev_passed(edev) ||
>>> +        !edev->pe || !eeh_pe_passed(edev->pe))
>>> +        return -ENODEV;
>>> +
>>> +    if (pedev)
>>> +        *pedev = edev;
>>> +    if (ppe)
>>> +        *ppe = edev->pe;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +/**
>>> + * eeh_pe_set_option - Set options for the indicated PE
>>> + * @pdev: PCI device
>>> + * @option: requested option
>>> + *
>>> + * The routine is called to enable or disable EEH functionality
>>> + * on the indicated PE, to enable IO or DMA for the frozen PE.
>>> + */
>>> +int eeh_pe_set_option(struct pci_dev *pdev, int option)
>>> +{
>>> +    struct eeh_dev *edev;
>>> +    struct eeh_pe *pe;
>>> +    int ret = 0;
>>> +
>>> +    /* Device existing ? */
>>> +    ret = eeh_dev_check(pdev, &edev, &pe);
>>> +    if (ret)
>>> +        return ret;
>>> +
>>> +    switch (option) {
>>> +    case EEH_OPT_DISABLE:
>>> +    case EEH_OPT_ENABLE:
>>> +        break;
>>> +    case EEH_OPT_THAW_MMIO:
>>> +    case EEH_OPT_THAW_DMA:
>>> +        if (!eeh_ops || !eeh_ops->set_option) {
>>> +            ret = -ENOENT;
>>> +            break;
>>> +        }
>>> +
>>> +        ret = eeh_ops->set_option(pe, option);
>>> +        break;
>>> +    default:
>>> +        pr_debug("%s: Option %d out of range (%d, %d)\n",
>>> +            __func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
>>> +        ret = -EINVAL;
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(eeh_pe_set_option);
>>> +
>>> +/**
>>> + * eeh_pe_get_addr - Retrieve the PE address or sharing mode
>>> + * @pdev: PCI device
>>> + * @option: option
>>> + *
>>> + * Retrieve the PE address or sharing mode.
>>> + */
>>> +int eeh_pe_get_addr(struct pci_dev *pdev, int option)
>>> +{
>>> +    struct pci_bus *bus;
>>> +    struct eeh_dev *edev;
>>> +    struct eeh_pe *pe;
>>> +    int ret = 0;
>>> +
>>> +    /* Device existing ? */
>>> +    ret = eeh_dev_check(pdev, &edev, &pe);
>>> +    if (ret)
>>> +        return ret;
>>> +
>>> +    switch (option) {
>>> +    case EEH_OPT_GET_PE_ADDR:
>>> +        bus = eeh_pe_bus_get(pe);
>>> +        if (!bus) {
>>> +            ret = -ENODEV;
>>> +            break;
>>> +        }
>>> +
>>> +        /* PE address has format "00BBSS00" */
>>> +        ret = bus->number << 16;
>>> +        break;
>>> +    case EEH_OPT_GET_PE_MODE:
>>> +        /* Wa always have shared PE */
>>> +        ret = EEH_PE_MODE_SHARED;
>>> +        break;
>>> +    default:
>>> +        pr_debug("%s: option %d out of range (0, 1)\n",
>>> +            __func__, option);
>>> +        ret = -EINVAL;
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(eeh_pe_get_addr);
>>> +
>>> +/**
>>> + * eeh_pe_get_state - Retrieve PE's state
>>> + * @pdev: PCI device
>>> + *
>>> + * Retrieve the PE's state, which includes 3 aspects: enabled
>>> + * DMA, enabled IO and asserted reset.
>>> + */
>>> +int eeh_pe_get_state(struct pci_dev *pdev)
>>> +{
>>> +    struct eeh_dev *edev;
>>> +    struct eeh_pe *pe;
>>> +    int result, ret = 0;
>>> +
>>> +    /* Device existing ? */
>>> +    ret = eeh_dev_check(pdev, &edev, &pe);
>>> +    if (ret)
>>> +        return ret;
>>> +
>>> +    if (!eeh_ops || !eeh_ops->get_state)
>>> +        return -ENOENT;
>>> +
>>> +    result = eeh_ops->get_state(pe, NULL);
>>> +    if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>> +         (result & EEH_STATE_DMA_ENABLED) &&
>>> +         (result & EEH_STATE_MMIO_ENABLED))
>>> +        ret = EEH_PE_STATE_NORMAL;
>>> +    else if (result & EEH_STATE_RESET_ACTIVE)
>>> +        ret = EEH_PE_STATE_RESET;
>>> +    else if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>> +         !(result & EEH_STATE_DMA_ENABLED) &&
>>> +         !(result & EEH_STATE_MMIO_ENABLED))
>>> +        ret = EEH_PE_STATE_STOPPED_IO_DMA;
>>> +    else if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>> +         (result & EEH_STATE_DMA_ENABLED) &&
>>> +         !(result & EEH_STATE_MMIO_ENABLED))
>>> +        ret = EEH_PE_STATE_STOPPED_DMA;
>>> +    else
>>> +        ret = EEH_PE_STATE_UNAVAIL;
>>> +
>>> +    return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(eeh_pe_get_state);
>>> +
>>> +/**
>>> + * eeh_pe_reset - Issue PE reset according to specified type
>>> + * @pdev: PCI device
>>> + * @option: reset type
>>> + *
>>> + * The routine is called to reset the specified PE with the
>>> + * indicated type, either fundamental reset or hot reset.
>>> + * PE reset is the most important part for error recovery.
>>> + */
>>> +int eeh_pe_reset(struct pci_dev *pdev, int option)
>>> +{
>>> +    struct eeh_dev *edev;
>>> +    struct eeh_pe *pe;
>>> +    int ret = 0;
>>> +
>>> +    /* Device existing ? */
>>> +    ret = eeh_dev_check(pdev, &edev, &pe);
>>> +    if (ret)
>>> +        return ret;
>>> +
>>> +    if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
>>> +        return -ENOENT;
>>> +
>>> +    switch (option) {
>>> +    case EEH_RESET_DEACTIVATE:
>>> +        ret = eeh_ops->reset(pe, option);
>>> +        if (ret)
>>> +            break;
>>> +
>>> +        /*
>>> +         * The PE is still in frozen state and we need clear
>>> +         * that. It's good to clear frozen state after deassert
>>> +         * to avoid messy IO access during reset, which might
>>> +         * cause recursive frozen PE.
>>> +         */
>>> +        ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
>>> +        if (!ret)
>>> +            ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
>>> +        if (!ret)
>>> +            eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
>>> +        break;
>>> +    case EEH_RESET_HOT:
>>> +    case EEH_RESET_FUNDAMENTAL:
>>> +        ret = eeh_ops->reset(pe, option);
>>> +        break;
>>> +    default:
>>> +        pr_debug("%s: Unsupported option %d\n",
>>> +            __func__, option);
>>> +        ret = -EINVAL;
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(eeh_pe_reset);
>>> +
>>> +/**
>>> + * eeh_pe_configure - Configure PCI bridges after PE reset
>>> + * @pdev: PCI device
>>> + *
>>> + * The routine is called to restore the PCI config space for
>>> + * those PCI devices, especially PCI bridges affected by PE
>>> + * reset issued previously.
>>> + */
>>> +int eeh_pe_configure(struct pci_dev *pdev)
>>> +{
>>> +    struct eeh_dev *edev;
>>> +    struct eeh_pe *pe;
>>> +    int ret = 0;
>>> +
>>> +    /* Device existing ? */
>>> +    ret = eeh_dev_check(pdev, &edev, &pe);
>>> +    if (ret)
>>> +        return ret;
>>> +
>>> +    /* Restore config space for the affected devices */
>>> +    eeh_pe_restore_bars(pe);
>>> +
>>> +    return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(eeh_pe_configure);
>>> +
>>> static int proc_eeh_show(struct seq_file *m, void *v)
>>> {
>>>    if (!eeh_enabled()) {
>>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>>> index 7ba0424..301ac18 100644
>>> --- a/drivers/vfio/pci/vfio_pci.c
>>> +++ b/drivers/vfio/pci/vfio_pci.c
>>> @@ -25,6 +25,9 @@
>>> #include <linux/types.h>
>>> #include <linux/uaccess.h>
>>> #include <linux/vfio.h>
>>> +#ifdef CONFIG_EEH
>>> +#include <asm/eeh.h>
>>> +#endif
>> 
>> Can we make a vfio_pci_eeh file that properly stubs out anything called
>> from common code?  I don't want to see all these inline ifdefs.
> 
> well. do you want see drivers/vfio/vfio-pci/vfio_pci_eeh.c and export
> following functins for vfio_pci.c to use?
> 
> vfio_pci_eeh_open()
> vfio_pci_eeh_release()
> vfio_pci_eeh_ioctl()
> 
> 
>>> 
>>> #include "vfio_pci_private.h"
>>> 
>>> @@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>>>    pci_restore_state(pdev);
>>> }
>>> 
>>> +static void vfio_eeh_pci_release(struct pci_dev *pdev)
>>> +{
>>> +#ifdef CONFIG_EEH
>>> +    eeh_dev_release(pdev);
>>> +#endif
>>> +}
>>> +
>>> static void vfio_pci_release(void *device_data)
>>> {
>>>    struct vfio_pci_device *vdev = device_data;
>>> 
>>> -    if (atomic_dec_and_test(&vdev->refcnt))
>>> +    if (atomic_dec_and_test(&vdev->refcnt)) {
>>> +        vfio_eeh_pci_release(vdev->pdev);
>>>        vfio_pci_disable(vdev);
>>> +    }
>>> 
>>>    module_put(THIS_MODULE);
>>> }
>>> 
>>> +static int vfio_eeh_pci_open(struct pci_dev *pdev)
>>> +{
>>> +    int ret = 0;
>>> +
>>> +#ifdef CONFIG_EEH
>>> +    ret = eeh_dev_open(pdev);
>>> +#endif
>>> +    return ret;
>>> +}
>>> +
>>> static int vfio_pci_open(void *device_data)
>>> {
>>>    struct vfio_pci_device *vdev = device_data;
>>> +    int ret;
>>> 
>>>    if (!try_module_get(THIS_MODULE))
>>>        return -ENODEV;
>>> 
>>>    if (atomic_inc_return(&vdev->refcnt) == 1) {
>>> -        int ret = vfio_pci_enable(vdev);
>>> -        if (ret) {
>>> -            module_put(THIS_MODULE);
>>> -            return ret;
>>> -        }
>>> +        ret = vfio_pci_enable(vdev);
>>> +        if (ret)
>>> +            goto error;
>>> +
>>> +        ret = vfio_eeh_pci_open(vdev->pdev);
>>> +        if (ret)
>>> +            goto error;
>>>    }
>>> 
>>>    return 0;
>>> +error:
>>> +    module_put(THIS_MODULE);
>>> +    return ret;
>>> }
>>> 
>>> static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
>>> @@ -321,6 +349,91 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>>>    return walk.ret;
>>> }
>>> 
>>> +static int vfio_eeh_pci_ioctl(struct pci_dev *pdev,
>>> +                  unsigned int cmd,
>>> +                  unsigned long arg)
>>> +{
>>> +    unsigned long minsz;
>>> +    int ret = 0;
>>> +
>>> +#ifdef CONFIG_EEH
>>> +    switch (cmd) {
>>> +    case VFIO_EEH_PE_SET_OPTION: {
>>> +        struct vfio_eeh_pe_set_option option;
>>> +
>>> +        minsz = offsetofend(struct vfio_eeh_pe_set_option, option);
>>> +        if (copy_from_user(&option, (void __user *)arg, minsz))
>>> +            return -EFAULT;
>>> +        if (option.argsz < minsz)
>>> +            return -EINVAL;
>>> +
>>> +        ret = eeh_pe_set_option(pdev, option.option);
>>> +        break;
>>> +    }
>>> +    case VFIO_EEH_PE_GET_ADDR: {
>>> +        struct vfio_eeh_pe_get_addr addr;
>>> +
>>> +        minsz = offsetofend(struct vfio_eeh_pe_get_addr, info);
>>> +        if (copy_from_user(&addr, (void __user *)arg, minsz))
>>> +            return -EFAULT;
>>> +        if (addr.argsz < minsz)
>>> +            return -EINVAL;
>>> +
>>> +        ret = eeh_pe_get_addr(pdev, addr.option);
>>> +        if (ret >= 0) {
>>> +            addr.info = ret;
>>> +            if (copy_to_user((void __user *)arg, &addr, minsz))
>>> +                return -EFAULT;
>>> +            ret = 0;
>>> +        }
>>> +
>>> +        break;
>>> +    }
>>> +    case VFIO_EEH_PE_GET_STATE: {
>>> +        struct vfio_eeh_pe_get_state state;
>>> +
>>> +        minsz = offsetofend(struct vfio_eeh_pe_get_state, state);
>>> +        if (copy_from_user(&state, (void __user *)arg, minsz))
>>> +            return -EFAULT;
>>> +        if (state.argsz < minsz)
>>> +            return -EINVAL;
>>> +
>>> +        ret = eeh_pe_get_state(pdev);
>>> +        if (ret >= 0) {
>>> +            state.state = ret;
>>> +            if (copy_to_user((void __user *)arg, &state, minsz))
>>> +                return -EFAULT;
>>> +            ret = 0;
>>> +        }
>>> +        break;
>>> +    }
>>> +    case VFIO_EEH_PE_RESET: {
>>> +        struct vfio_eeh_pe_reset reset;
>>> +
>>> +        minsz = offsetofend(struct vfio_eeh_pe_reset, option);
>>> +        if (copy_from_user(&reset, (void __user *)arg, minsz))
>>> +            return -EFAULT;
>>> +        if (reset.argsz < minsz)
>>> +            return -EINVAL;
>>> +
>>> +        ret = eeh_pe_reset(pdev, reset.option);
>>> +        break;
>>> +    }
>>> +    case VFIO_EEH_PE_CONFIGURE:
>>> +        ret = eeh_pe_configure(pdev);
>>> +        break;
>>> +    default:
>>> +        ret = -EINVAL;
>>> +        pr_debug("%s: Cannot handle command %d\n",
>>> +            __func__, cmd);
>>> +    }
>>> +#else
>>> +    ret = -ENOENT;
>>> +#endif
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> static long vfio_pci_ioctl(void *device_data,
>>>               unsigned int cmd, unsigned long arg)
>>> {
>>> @@ -682,6 +795,12 @@ hot_reset_release:
>>> 
>>>        kfree(groups);
>>>        return ret;
>>> +    } else if (cmd == VFIO_EEH_PE_SET_OPTION ||
>>> +           cmd == VFIO_EEH_PE_GET_ADDR ||
>>> +           cmd == VFIO_EEH_PE_GET_STATE ||
>>> +           cmd == VFIO_EEH_PE_RESET ||
>>> +           cmd == VFIO_EEH_PE_CONFIGURE) {
>>> +            return vfio_eeh_pci_ioctl(vdev->pdev, cmd, arg);
>>>    }
>>> 
>>>    return -ENOTTY;
>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>> index cb9023d..ef55682 100644
>>> --- a/include/uapi/linux/vfio.h
>>> +++ b/include/uapi/linux/vfio.h
>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>>> 
>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO    _IO(VFIO_TYPE, VFIO_BASE + 12)
>>> 
>>> +/*
>>> + * EEH functionality can be enabled or disabled on one specific device.
>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
>>> + * if required.
>>> + */
>>> +struct vfio_eeh_pe_set_option {
>>> +    __u32 argsz;
>>> +    __u32 option;
>>> +};
>>> +
>>> +#define VFIO_EEH_PE_SET_OPTION        _IO(VFIO_TYPE, VFIO_BASE + 21)
>> 
>> What proposed ioctls are making you jump to 21?
>> 
>> argsz is probably not as useful without a flags field.  Otherwise the
>> caller can't indicate what the extra space is.
> 
> The QEMU patches are based on Alexey's additional feature ("ddw"), which
> consumed several ioctl commands.
> 
> ok. So you also prefer to remove "argsz"?
> 
>>> +
>>> +/*
>>> + * Each EEH PE should have unique address to be identified. The command
>>> + * helps to retrieve the address and the sharing mode of the PE.
>>> + */
>>> +struct vfio_eeh_pe_get_addr {
>>> +    __u32 argsz;
>>> +    __u32 option;
>>> +    __u32 info;
>>> +};
>>> +
>>> +#define VFIO_EEH_PE_GET_ADDR        _IO(VFIO_TYPE, VFIO_BASE + 22)
>>> +
>>> +/*
>>> + * EEH PE might have been frozen because of PCI errors. Also, it might
>>> + * be experiencing reset for error revoery. The following command helps
>>> + * to get the state.
>>> + */
>>> +struct vfio_eeh_pe_get_state {
>>> +    __u32 argsz;
>>> +    __u32 state;
>>> +};
>>> +
>>> +#define VFIO_EEH_PE_GET_STATE        _IO(VFIO_TYPE, VFIO_BASE + 23)
>>> +
>>> +/*
>>> + * Reset is the major step to recover problematic PE. The following
>>> + * command helps on that.
>>> + */
>>> +struct vfio_eeh_pe_reset {
>>> +    __u32 argsz;
>>> +    __u32 option;
>>> +};
>>> +
>>> +#define VFIO_EEH_PE_RESET        _IO(VFIO_TYPE, VFIO_BASE + 24)
>>> +
>>> +/*
>>> + * One of the steps for recovery after PE reset is to configure the
>>> + * PCI bridges affected by the PE reset.
>>> + */
>>> +#define VFIO_EEH_PE_CONFIGURE        _IO(VFIO_TYPE, VFIO_BASE + 25)
>> 
>> What can the user do differently by making these separate ioctls?
> 
> hrm, I didn't understood as well. Alex.G could have the explaination.

Alex raised the same concern as me: why separate reset and configure? When we want to recover a device, we need a reset call anyway, right?


Alex

> 
>>> +
>>> /* ***************************************************************** */
>>> 
>>> #endif /* _UAPIVFIO_H */
> 
> Thanks,
> Gavin
>
Gavin Shan May 23, 2014, 7:37 a.m. UTC | #10
On Fri, May 23, 2014 at 08:55:15AM +0200, Alexander Graf wrote:
>> Am 23.05.2014 um 06:37 schrieb Gavin Shan <gwshan@linux.vnet.ibm.com>:
>>> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>>>> On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
>>>> The patch adds new IOCTL commands for VFIO PCI device to support
>>>> EEH functionality for PCI devices, which have been passed through
>>>> from host to somebody else via VFIO.

.../...

>>>> +
>>>> +/*
>>>> + * Reset is the major step to recover problematic PE. The following
>>>> + * command helps on that.
>>>> + */
>>>> +struct vfio_eeh_pe_reset {
>>>> +    __u32 argsz;
>>>> +    __u32 option;
>>>> +};
>>>> +
>>>> +#define VFIO_EEH_PE_RESET        _IO(VFIO_TYPE, VFIO_BASE + 24)
>>>> +
>>>> +/*
>>>> + * One of the steps for recovery after PE reset is to configure the
>>>> + * PCI bridges affected by the PE reset.
>>>> + */
>>>> +#define VFIO_EEH_PE_CONFIGURE        _IO(VFIO_TYPE, VFIO_BASE + 25)
>>> 
>>> What can the user do differently by making these separate ioctls?
>> 
>> hrm, I didn't understood as well. Alex.G could have the explaination.
>
>Alex raised the same concern as me: why separate reset and configure? When we want to recover a device, we need a reset call anyway, right?
>

Ok. With current ioctl commands, "reset+configure" is required to do
error recovery. Before the recovery, we also need call "configure"
in order to retrieve error log correctly.

Also, they corresponds to 2 separate RTAS services: "ibm,set-slot-reset"
and "ibm,configure-pe".

Thanks,
Gavin
Alexander Graf May 23, 2014, 9:58 a.m. UTC | #11
On 23.05.14 09:37, Gavin Shan wrote:
> On Fri, May 23, 2014 at 08:55:15AM +0200, Alexander Graf wrote:
>>> Am 23.05.2014 um 06:37 schrieb Gavin Shan <gwshan@linux.vnet.ibm.com>:
>>>> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>>>>> On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
>>>>> The patch adds new IOCTL commands for VFIO PCI device to support
>>>>> EEH functionality for PCI devices, which have been passed through
>>>>> from host to somebody else via VFIO.
> .../...
>
>>>>> +
>>>>> +/*
>>>>> + * Reset is the major step to recover problematic PE. The following
>>>>> + * command helps on that.
>>>>> + */
>>>>> +struct vfio_eeh_pe_reset {
>>>>> +    __u32 argsz;
>>>>> +    __u32 option;
>>>>> +};
>>>>> +
>>>>> +#define VFIO_EEH_PE_RESET        _IO(VFIO_TYPE, VFIO_BASE + 24)
>>>>> +
>>>>> +/*
>>>>> + * One of the steps for recovery after PE reset is to configure the
>>>>> + * PCI bridges affected by the PE reset.
>>>>> + */
>>>>> +#define VFIO_EEH_PE_CONFIGURE        _IO(VFIO_TYPE, VFIO_BASE + 25)
>>>> What can the user do differently by making these separate ioctls?
>>> hrm, I didn't understood as well. Alex.G could have the explaination.
>> Alex raised the same concern as me: why separate reset and configure? When we want to recover a device, we need a reset call anyway, right?
>>
> Ok. With current ioctl commands, "reset+configure" is required to do
> error recovery. Before the recovery, we also need call "configure"
> in order to retrieve error log correctly.

Well, the "configure" ioctl (which is a really bad name for what it does 
btw) currently only restores the BARs which doesn't sound like error log 
retrieval to me.

> Also, they corresponds to 2 separate RTAS services: "ibm,set-slot-reset"
> and "ibm,configure-pe".

Does a guest always issue both? What's the order it calls them in?


Alex
Gavin Shan May 23, 2014, 11:55 a.m. UTC | #12
On Fri, May 23, 2014 at 11:58:22AM +0200, Alexander Graf wrote:
>
>On 23.05.14 09:37, Gavin Shan wrote:
>>On Fri, May 23, 2014 at 08:55:15AM +0200, Alexander Graf wrote:
>>>>Am 23.05.2014 um 06:37 schrieb Gavin Shan <gwshan@linux.vnet.ibm.com>:
>>>>>On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>>>>>>On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
>>>>>>The patch adds new IOCTL commands for VFIO PCI device to support
>>>>>>EEH functionality for PCI devices, which have been passed through
>>>>>>from host to somebody else via VFIO.
>>.../...
>>
>>>>>>+
>>>>>>+/*
>>>>>>+ * Reset is the major step to recover problematic PE. The following
>>>>>>+ * command helps on that.
>>>>>>+ */
>>>>>>+struct vfio_eeh_pe_reset {
>>>>>>+    __u32 argsz;
>>>>>>+    __u32 option;
>>>>>>+};
>>>>>>+
>>>>>>+#define VFIO_EEH_PE_RESET        _IO(VFIO_TYPE, VFIO_BASE + 24)
>>>>>>+
>>>>>>+/*
>>>>>>+ * One of the steps for recovery after PE reset is to configure the
>>>>>>+ * PCI bridges affected by the PE reset.
>>>>>>+ */
>>>>>>+#define VFIO_EEH_PE_CONFIGURE        _IO(VFIO_TYPE, VFIO_BASE + 25)
>>>>>What can the user do differently by making these separate ioctls?
>>>>hrm, I didn't understood as well. Alex.G could have the explaination.
>>>Alex raised the same concern as me: why separate reset and configure? When we want to recover a device, we need a reset call anyway, right?
>>>
>>Ok. With current ioctl commands, "reset+configure" is required to do
>>error recovery. Before the recovery, we also need call "configure"
>>in order to retrieve error log correctly.
>
>Well, the "configure" ioctl (which is a really bad name for what it
>does btw) currently only restores the BARs which doesn't sound like
>error log retrieval to me.
>

Could you please suggest a better name? I had VFIO_EEH_PE_CONFIGURE because
it's for RTAS call "ibm,configure-pe".

>>Also, they corresponds to 2 separate RTAS services: "ibm,set-slot-reset"
>>and "ibm,configure-pe".
>
>Does a guest always issue both? What's the order it calls them in?
>

For one error, the following RTAS calls was called in general:

< stop device drivers, no PCI traffic expected during recovery >
ibm,set-eeh-option
ibm,configure-pe
< error log retrival >
ibm,set-slot-reset
ibm,read-slot-reset-state2
ibm,configure-pe
< resume device drivers >

We have other scenario. For example, PE reset failure and collect
the permanent log. Prior to that, "ibm,configure-pe" should be called.

Thanks,
Gavin


>
>Alex
>
Gavin Shan May 23, 2014, 11:58 a.m. UTC | #13
On Fri, May 23, 2014 at 08:52:23AM +0200, Alexander Graf wrote:
>
>
>> Am 23.05.2014 um 05:23 schrieb Alex Williamson <alex.williamson@redhat.com>:
>> 
>>> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
>>>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
>>>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
>>>>> On 22.05.14 10:23, Gavin Shan wrote:
>>> 
>>> .../...
>>> 
>>>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>>>>> index cb9023d..ef55682 100644
>>>>>> --- a/include/uapi/linux/vfio.h
>>>>>> +++ b/include/uapi/linux/vfio.h
>>>>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>>>>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO    _IO(VFIO_TYPE, VFIO_BASE + 12)
>>>>>> +/*
>>>>>> + * EEH functionality can be enabled or disabled on one specific device.
>>>>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
>>>>>> + * if required.
>>>>>> + */
>>>>>> +struct vfio_eeh_pe_set_option {
>>>>>> +    __u32 argsz;
>>>>> 
>>>>> What is this argsz thing? Is this your way of maintaining backwards
>>>>> compatibility when we introduce new fields? A new field will change
>>>>> the ioctl number, so I don't think that makes a lot of sense :).
>>>>> 
>>>>> Just make the ioctl have a u32 as incoming argument. No fancy
>>>>> structs, no complicated code.
>>>>> 
>>>>> The same applies for a number of structs below.
>>>> 
>>>> ok. Will do in next revision.
>>> 
>>> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
>>> always has "argsz". I guess it was used as checker by Alex.W. Do you really
>>> want remove "argsz" ?
>> 
>> 
>> IIRC, this was actually a suggestion incorporated from David Gibson, but
>> using _IO with an argsz and flags field we can maintain compatibility
>> without bumping the ioctl number.  It really only makes sense if we have
>> a flags field so we can identify what additional information is being
>> provided.  Flags can be used as a bitmap of trailing structures or as
>> revision if we want a set of trailing structures that may change over
>> time.  Unless you can come up with a good argument against it that would
>> prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
>> prefer to keep it.  As I noted in a previous comment, the one ioctl we
>> have for reset that doesn't take any options is likely going to be the
>> first ioctl that we need to entirely replace.  If we don't keep argsz,
>> it seems like we probably need a flags field and reserved structures.
>> 
>>>>>> +    __u32 option;
>>>>>> +};
>>>>>> +
>>>>>> +#define VFIO_EEH_PE_SET_OPTION        _IO(VFIO_TYPE, VFIO_BASE + 21)
>>>>>> +
>>>>>> +/*
>>>>>> + * Each EEH PE should have unique address to be identified. The command
>>>>>> + * helps to retrieve the address and the sharing mode of the PE.
>>>>>> + */
>>>>>> +struct vfio_eeh_pe_get_addr {
>>>>>> +    __u32 argsz;
>>>>>> +    __u32 option;
>>>>>> +    __u32 info;
>>>>> 
>>>>> Any particular reason you need the info field? Can't the return value
>>>>> of the ioctl hold this? Then you only have a single u32 argument left
>>>>> to the ioctl again.
>>>> 
>>>> ok. Will do in next revision.
>>> 
>>> If we eventually remove "argsz" and let ioctl() return value to hold
>>> information (or negative number for errors), we don't need any data
>>> struct because the 3rd parameter of ioctl() would be used as input
>>> and I only need one input parameter. Do you want see this ?
>>> 
>>> Hopefully, Alex.W saw this and hasn't objections :)
>> 
>> I'm not sure why we're pushing for the minimal data set to pass to an
>> ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,
>> 
>
>The ioctl number includes sizeof(payload). So if a new parameter gets added, that would be a different ioctl number.
>
>If you want to maintain backwards compatibility ioctl number wise in the kernel, you'll have to have a "flags" field to indicate whether new data is available and a "pad" field, prefarably in a union, that ensures the size of the struct doesn't change.
>
>I'm not sure it's really necessary here to have identical ioctl numbers if we add parameters, since we can always just define a new ioctl with a bigger payload that can then become the default handler and a shim backwards compatible handler with the old number.
>
>But if you think it is important, let's do it for real, not just halfway.
>

So I need add additional field "flags" ? Also, I need keep the return value from
ioctl() less or equal to 0 ? :-)

Thanks,
Gavin
Alexander Graf May 23, 2014, 11:58 a.m. UTC | #14
On 23.05.14 13:55, Gavin Shan wrote:
> On Fri, May 23, 2014 at 11:58:22AM +0200, Alexander Graf wrote:
>> On 23.05.14 09:37, Gavin Shan wrote:
>>> On Fri, May 23, 2014 at 08:55:15AM +0200, Alexander Graf wrote:
>>>>> Am 23.05.2014 um 06:37 schrieb Gavin Shan <gwshan@linux.vnet.ibm.com>:
>>>>>> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>>>>>>> On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
>>>>>>> The patch adds new IOCTL commands for VFIO PCI device to support
>>>>>>> EEH functionality for PCI devices, which have been passed through
>>>>>> >from host to somebody else via VFIO.
>>> .../...
>>>
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Reset is the major step to recover problematic PE. The following
>>>>>>> + * command helps on that.
>>>>>>> + */
>>>>>>> +struct vfio_eeh_pe_reset {
>>>>>>> +    __u32 argsz;
>>>>>>> +    __u32 option;
>>>>>>> +};
>>>>>>> +
>>>>>>> +#define VFIO_EEH_PE_RESET        _IO(VFIO_TYPE, VFIO_BASE + 24)
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * One of the steps for recovery after PE reset is to configure the
>>>>>>> + * PCI bridges affected by the PE reset.
>>>>>>> + */
>>>>>>> +#define VFIO_EEH_PE_CONFIGURE        _IO(VFIO_TYPE, VFIO_BASE + 25)
>>>>>> What can the user do differently by making these separate ioctls?
>>>>> hrm, I didn't understood as well. Alex.G could have the explaination.
>>>> Alex raised the same concern as me: why separate reset and configure? When we want to recover a device, we need a reset call anyway, right?
>>>>
>>> Ok. With current ioctl commands, "reset+configure" is required to do
>>> error recovery. Before the recovery, we also need call "configure"
>>> in order to retrieve error log correctly.
>> Well, the "configure" ioctl (which is a really bad name for what it
>> does btw) currently only restores the BARs which doesn't sound like
>> error log retrieval to me.
>>
> Could you please suggest a better name? I had VFIO_EEH_PE_CONFIGURE because
> it's for RTAS call "ibm,configure-pe".

VFIO_RESTORE_BARS maybe?

>>> Also, they corresponds to 2 separate RTAS services: "ibm,set-slot-reset"
>>> and "ibm,configure-pe".
>> Does a guest always issue both? What's the order it calls them in?
>>
> For one error, the following RTAS calls was called in general:
>
> < stop device drivers, no PCI traffic expected during recovery >
> ibm,set-eeh-option
> ibm,configure-pe
> < error log retrival >

I see. So the guest retrieves the log via BARs from the device? I guess 
I'm failing to see what "the log" is.


Alex

> ibm,set-slot-reset
> ibm,read-slot-reset-state2
> ibm,configure-pe
> < resume device drivers >
>
> We have other scenario. For example, PE reset failure and collect
> the permanent log. Prior to that, "ibm,configure-pe" should be called.
>
> Thanks,
> Gavin
>
>
>> Alex
>>
Alexander Graf May 23, 2014, 12:30 p.m. UTC | #15
On 23.05.14 13:58, Gavin Shan wrote:
> On Fri, May 23, 2014 at 08:52:23AM +0200, Alexander Graf wrote:
>>
>>> Am 23.05.2014 um 05:23 schrieb Alex Williamson <alex.williamson@redhat.com>:
>>>
>>>> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
>>>>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
>>>>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
>>>>>> On 22.05.14 10:23, Gavin Shan wrote:
>>>> .../...
>>>>
>>>>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>>>>>> index cb9023d..ef55682 100644
>>>>>>> --- a/include/uapi/linux/vfio.h
>>>>>>> +++ b/include/uapi/linux/vfio.h
>>>>>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>>>>>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO    _IO(VFIO_TYPE, VFIO_BASE + 12)
>>>>>>> +/*
>>>>>>> + * EEH functionality can be enabled or disabled on one specific device.
>>>>>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
>>>>>>> + * if required.
>>>>>>> + */
>>>>>>> +struct vfio_eeh_pe_set_option {
>>>>>>> +    __u32 argsz;
>>>>>> What is this argsz thing? Is this your way of maintaining backwards
>>>>>> compatibility when we introduce new fields? A new field will change
>>>>>> the ioctl number, so I don't think that makes a lot of sense :).
>>>>>>
>>>>>> Just make the ioctl have a u32 as incoming argument. No fancy
>>>>>> structs, no complicated code.
>>>>>>
>>>>>> The same applies for a number of structs below.
>>>>> ok. Will do in next revision.
>>>> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
>>>> always has "argsz". I guess it was used as checker by Alex.W. Do you really
>>>> want remove "argsz" ?
>>>
>>> IIRC, this was actually a suggestion incorporated from David Gibson, but
>>> using _IO with an argsz and flags field we can maintain compatibility
>>> without bumping the ioctl number.  It really only makes sense if we have
>>> a flags field so we can identify what additional information is being
>>> provided.  Flags can be used as a bitmap of trailing structures or as
>>> revision if we want a set of trailing structures that may change over
>>> time.  Unless you can come up with a good argument against it that would
>>> prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
>>> prefer to keep it.  As I noted in a previous comment, the one ioctl we
>>> have for reset that doesn't take any options is likely going to be the
>>> first ioctl that we need to entirely replace.  If we don't keep argsz,
>>> it seems like we probably need a flags field and reserved structures.
>>>
>>>>>>> +    __u32 option;
>>>>>>> +};
>>>>>>> +
>>>>>>> +#define VFIO_EEH_PE_SET_OPTION        _IO(VFIO_TYPE, VFIO_BASE + 21)
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Each EEH PE should have unique address to be identified. The command
>>>>>>> + * helps to retrieve the address and the sharing mode of the PE.
>>>>>>> + */
>>>>>>> +struct vfio_eeh_pe_get_addr {
>>>>>>> +    __u32 argsz;
>>>>>>> +    __u32 option;
>>>>>>> +    __u32 info;
>>>>>> Any particular reason you need the info field? Can't the return value
>>>>>> of the ioctl hold this? Then you only have a single u32 argument left
>>>>>> to the ioctl again.
>>>>> ok. Will do in next revision.
>>>> If we eventually remove "argsz" and let ioctl() return value to hold
>>>> information (or negative number for errors), we don't need any data
>>>> struct because the 3rd parameter of ioctl() would be used as input
>>>> and I only need one input parameter. Do you want see this ?
>>>>
>>>> Hopefully, Alex.W saw this and hasn't objections :)
>>> I'm not sure why we're pushing for the minimal data set to pass to an
>>> ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,
>>>
>> The ioctl number includes sizeof(payload). So if a new parameter gets added, that would be a different ioctl number.
>>
>> If you want to maintain backwards compatibility ioctl number wise in the kernel, you'll have to have a "flags" field to indicate whether new data is available and a "pad" field, prefarably in a union, that ensures the size of the struct doesn't change.
>>
>> I'm not sure it's really necessary here to have identical ioctl numbers if we add parameters, since we can always just define a new ioctl with a bigger payload that can then become the default handler and a shim backwards compatible handler with the old number.
>>
>> But if you think it is important, let's do it for real, not just halfway.
>>
> So I need add additional field "flags" ?

This is Alex' call on how he prefers the interface to look like. The 
struct size that you pass into an ioctl is part of the ioctl number, so 
putting that into an argsz field only makes sense when you do ugly 
things like

struct foo *x;
uint32_t *y;

x = malloc(sizeof(*x) + sizeof(*y));
y = (void*)&x[1];

ioctl(..., x);

But I'd prefer not to have such nasty code really. It breaks assumptions 
on *anything* that wraps ioctls, such as strace or qemu's linux-user 
emulation.

>   Also, I need keep the return value from
> ioctl() less or equal to 0 ? :-)

Usually return values are

   < 0 means error
   == 0 means success
   > 0 means return payload


Alex
Gavin Shan May 23, 2014, 12:43 p.m. UTC | #16
On Fri, May 23, 2014 at 01:58:50PM +0200, Alexander Graf wrote:
>
>On 23.05.14 13:55, Gavin Shan wrote:
>>On Fri, May 23, 2014 at 11:58:22AM +0200, Alexander Graf wrote:
>>>On 23.05.14 09:37, Gavin Shan wrote:
>>>>On Fri, May 23, 2014 at 08:55:15AM +0200, Alexander Graf wrote:
>>>>>>Am 23.05.2014 um 06:37 schrieb Gavin Shan <gwshan@linux.vnet.ibm.com>:
>>>>>>>On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>>>>>>>>On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
>>>>>>>>The patch adds new IOCTL commands for VFIO PCI device to support
>>>>>>>>EEH functionality for PCI devices, which have been passed through
>>>>>>>>from host to somebody else via VFIO.
>>>>.../...
>>>>
>>>>>>>>+
>>>>>>>>+/*
>>>>>>>>+ * Reset is the major step to recover problematic PE. The following
>>>>>>>>+ * command helps on that.
>>>>>>>>+ */
>>>>>>>>+struct vfio_eeh_pe_reset {
>>>>>>>>+    __u32 argsz;
>>>>>>>>+    __u32 option;
>>>>>>>>+};
>>>>>>>>+
>>>>>>>>+#define VFIO_EEH_PE_RESET        _IO(VFIO_TYPE, VFIO_BASE + 24)
>>>>>>>>+
>>>>>>>>+/*
>>>>>>>>+ * One of the steps for recovery after PE reset is to configure the
>>>>>>>>+ * PCI bridges affected by the PE reset.
>>>>>>>>+ */
>>>>>>>>+#define VFIO_EEH_PE_CONFIGURE        _IO(VFIO_TYPE, VFIO_BASE + 25)
>>>>>>>What can the user do differently by making these separate ioctls?
>>>>>>hrm, I didn't understood as well. Alex.G could have the explaination.
>>>>>Alex raised the same concern as me: why separate reset and configure? When we want to recover a device, we need a reset call anyway, right?
>>>>>
>>>>Ok. With current ioctl commands, "reset+configure" is required to do
>>>>error recovery. Before the recovery, we also need call "configure"
>>>>in order to retrieve error log correctly.
>>>Well, the "configure" ioctl (which is a really bad name for what it
>>>does btw) currently only restores the BARs which doesn't sound like
>>>error log retrieval to me.
>>>
>>Could you please suggest a better name? I had VFIO_EEH_PE_CONFIGURE because
>>it's for RTAS call "ibm,configure-pe".
>
>VFIO_RESTORE_BARS maybe?
>

hrm, It's not better than the original one. Could we just
have VFIO_EEH_PE_CONFIGURE as all left ioctl command names
are stick to RTAS call names.

Also, I might add more logic to this function to improve
reliability. For example, if there're multiple PCI bridges
included in the PE, I need reset them one by one and ensure
their PCI link comes up. It's obviously not restoring BARs,
but configuring PE :-)

>>>>Also, they corresponds to 2 separate RTAS services: "ibm,set-slot-reset"
>>>>and "ibm,configure-pe".
>>>Does a guest always issue both? What's the order it calls them in?
>>>
>>For one error, the following RTAS calls was called in general:
>>
>>< stop device drivers, no PCI traffic expected during recovery >
>>ibm,set-eeh-option
>>ibm,configure-pe
>>< error log retrival >
>
>I see. So the guest retrieves the log via BARs from the device? I
>guess I'm failing to see what "the log" is.
>

well. It seems that I didn't describe it clearly enough. The ioctl
command was introduced to finish the function that should be done
with RTAS call "ibm,configure-pe", which is to configure PCI bridges's
config space correctly. Without that, it's possible that we can't
access the config space of the subordinate PCI devices of the PCI
bridges. So we should restore config space for PCI bridges. However,
we also need restore config space for normal PCI devices because
userland has some config space registers masked off and can't access
them all, so it's not reliable to restore config space for normal
PCI devices from userland.

So the restoring config space of PCI bridges is required, but restoring
config space for normal devices are a trick here.

>>ibm,set-slot-reset
>>ibm,read-slot-reset-state2
>>ibm,configure-pe
>>< resume device drivers >
>>

Thanks,
Gavin
Alexander Graf May 23, 2014, 12:49 p.m. UTC | #17
On 23.05.14 14:43, Gavin Shan wrote:
> On Fri, May 23, 2014 at 01:58:50PM +0200, Alexander Graf wrote:
>> On 23.05.14 13:55, Gavin Shan wrote:
>>> On Fri, May 23, 2014 at 11:58:22AM +0200, Alexander Graf wrote:
>>>> On 23.05.14 09:37, Gavin Shan wrote:
>>>>> On Fri, May 23, 2014 at 08:55:15AM +0200, Alexander Graf wrote:
>>>>>>> Am 23.05.2014 um 06:37 schrieb Gavin Shan <gwshan@linux.vnet.ibm.com>:
>>>>>>>> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>>>>>>>>> On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
>>>>>>>>> The patch adds new IOCTL commands for VFIO PCI device to support
>>>>>>>>> EEH functionality for PCI devices, which have been passed through
>>>>>>>> >from host to somebody else via VFIO.
>>>>> .../...
>>>>>
>>>>>>>>> +
>>>>>>>>> +/*
>>>>>>>>> + * Reset is the major step to recover problematic PE. The following
>>>>>>>>> + * command helps on that.
>>>>>>>>> + */
>>>>>>>>> +struct vfio_eeh_pe_reset {
>>>>>>>>> +    __u32 argsz;
>>>>>>>>> +    __u32 option;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +#define VFIO_EEH_PE_RESET        _IO(VFIO_TYPE, VFIO_BASE + 24)
>>>>>>>>> +
>>>>>>>>> +/*
>>>>>>>>> + * One of the steps for recovery after PE reset is to configure the
>>>>>>>>> + * PCI bridges affected by the PE reset.
>>>>>>>>> + */
>>>>>>>>> +#define VFIO_EEH_PE_CONFIGURE        _IO(VFIO_TYPE, VFIO_BASE + 25)
>>>>>>>> What can the user do differently by making these separate ioctls?
>>>>>>> hrm, I didn't understood as well. Alex.G could have the explaination.
>>>>>> Alex raised the same concern as me: why separate reset and configure? When we want to recover a device, we need a reset call anyway, right?
>>>>>>
>>>>> Ok. With current ioctl commands, "reset+configure" is required to do
>>>>> error recovery. Before the recovery, we also need call "configure"
>>>>> in order to retrieve error log correctly.
>>>> Well, the "configure" ioctl (which is a really bad name for what it
>>>> does btw) currently only restores the BARs which doesn't sound like
>>>> error log retrieval to me.
>>>>
>>> Could you please suggest a better name? I had VFIO_EEH_PE_CONFIGURE because
>>> it's for RTAS call "ibm,configure-pe".
>> VFIO_RESTORE_BARS maybe?
>>
> hrm, It's not better than the original one. Could we just
> have VFIO_EEH_PE_CONFIGURE as all left ioctl command names
> are stick to RTAS call names.
>
> Also, I might add more logic to this function to improve
> reliability. For example, if there're multiple PCI bridges
> included in the PE, I need reset them one by one and ensure
> their PCI link comes up. It's obviously not restoring BARs,
> but configuring PE :-)


VFIO_EEH_RECOVER?

>
>>>>> Also, they corresponds to 2 separate RTAS services: "ibm,set-slot-reset"
>>>>> and "ibm,configure-pe".
>>>> Does a guest always issue both? What's the order it calls them in?
>>>>
>>> For one error, the following RTAS calls was called in general:
>>>
>>> < stop device drivers, no PCI traffic expected during recovery >
>>> ibm,set-eeh-option
>>> ibm,configure-pe
>>> < error log retrival >
>> I see. So the guest retrieves the log via BARs from the device? I
>> guess I'm failing to see what "the log" is.
>>
> well. It seems that I didn't describe it clearly enough. The ioctl
> command was introduced to finish the function that should be done
> with RTAS call "ibm,configure-pe", which is to configure PCI bridges's
> config space correctly. Without that, it's possible that we can't
> access the config space of the subordinate PCI devices of the PCI
> bridges. So we should restore config space for PCI bridges. However,
> we also need restore config space for normal PCI devices because
> userland has some config space registers masked off and can't access
> them all, so it's not reliable to restore config space for normal
> PCI devices from userland.
>
> So the restoring config space of PCI bridges is required, but restoring
> config space for normal devices are a trick here.

So what if user space accesses config space while the device is broken? 
What if it accesses an mmap'ed BAR while the device is in broken state 
and BARs haven't been recovered yet?


Alex
Alex Williamson May 23, 2014, 12:51 p.m. UTC | #18
On Fri, 2014-05-23 at 08:52 +0200, Alexander Graf wrote:
> 
> > Am 23.05.2014 um 05:23 schrieb Alex Williamson <alex.williamson@redhat.com>:
> > 
> >> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
> >>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
> >>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
> >>>> On 22.05.14 10:23, Gavin Shan wrote:
> >> 
> >> .../...
> >> 
> >>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >>>>> index cb9023d..ef55682 100644
> >>>>> --- a/include/uapi/linux/vfio.h
> >>>>> +++ b/include/uapi/linux/vfio.h
> >>>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
> >>>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO    _IO(VFIO_TYPE, VFIO_BASE + 12)
> >>>>> +/*
> >>>>> + * EEH functionality can be enabled or disabled on one specific device.
> >>>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
> >>>>> + * if required.
> >>>>> + */
> >>>>> +struct vfio_eeh_pe_set_option {
> >>>>> +    __u32 argsz;
> >>>> 
> >>>> What is this argsz thing? Is this your way of maintaining backwards
> >>>> compatibility when we introduce new fields? A new field will change
> >>>> the ioctl number, so I don't think that makes a lot of sense :).
> >>>> 
> >>>> Just make the ioctl have a u32 as incoming argument. No fancy
> >>>> structs, no complicated code.
> >>>> 
> >>>> The same applies for a number of structs below.
> >>> 
> >>> ok. Will do in next revision.
> >> 
> >> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
> >> always has "argsz". I guess it was used as checker by Alex.W. Do you really
> >> want remove "argsz" ?
> > 
> > 
> > IIRC, this was actually a suggestion incorporated from David Gibson, but
> > using _IO with an argsz and flags field we can maintain compatibility
> > without bumping the ioctl number.  It really only makes sense if we have
> > a flags field so we can identify what additional information is being
> > provided.  Flags can be used as a bitmap of trailing structures or as
> > revision if we want a set of trailing structures that may change over
> > time.  Unless you can come up with a good argument against it that would
> > prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
> > prefer to keep it.  As I noted in a previous comment, the one ioctl we
> > have for reset that doesn't take any options is likely going to be the
> > first ioctl that we need to entirely replace.  If we don't keep argsz,
> > it seems like we probably need a flags field and reserved structures.
> > 
> >>>>> +    __u32 option;
> >>>>> +};
> >>>>> +
> >>>>> +#define VFIO_EEH_PE_SET_OPTION        _IO(VFIO_TYPE, VFIO_BASE + 21)
> >>>>> +
> >>>>> +/*
> >>>>> + * Each EEH PE should have unique address to be identified. The command
> >>>>> + * helps to retrieve the address and the sharing mode of the PE.
> >>>>> + */
> >>>>> +struct vfio_eeh_pe_get_addr {
> >>>>> +    __u32 argsz;
> >>>>> +    __u32 option;
> >>>>> +    __u32 info;
> >>>> 
> >>>> Any particular reason you need the info field? Can't the return value
> >>>> of the ioctl hold this? Then you only have a single u32 argument left
> >>>> to the ioctl again.
> >>> 
> >>> ok. Will do in next revision.
> >> 
> >> If we eventually remove "argsz" and let ioctl() return value to hold
> >> information (or negative number for errors), we don't need any data
> >> struct because the 3rd parameter of ioctl() would be used as input
> >> and I only need one input parameter. Do you want see this ?
> >> 
> >> Hopefully, Alex.W saw this and hasn't objections :)
> > 
> > I'm not sure why we're pushing for the minimal data set to pass to an
> > ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,
> > 
> 
> The ioctl number includes sizeof(payload). So if a new parameter gets
> added, that would be a different ioctl number.

Not when we use _IO

> If you want to maintain backwards compatibility ioctl number wise in
> the kernel, you'll have to have a "flags" field to indicate whether
> new data is available and a "pad" field, prefarably in a union, that
> ensures the size of the struct doesn't change.
> 
> I'm not sure it's really necessary here to have identical ioctl
> numbers if we add parameters, since we can always just define a new
> ioctl with a bigger payload that can then become the default handler
> and a shim backwards compatible handler with the old number.
> 
> But if you think it is important, let's do it for real, not just
> halfway.
> 
> 
> Alex
>
Alexander Graf May 23, 2014, 1:24 p.m. UTC | #19
On 23.05.14 14:51, Alex Williamson wrote:
> On Fri, 2014-05-23 at 08:52 +0200, Alexander Graf wrote:
>>> Am 23.05.2014 um 05:23 schrieb Alex Williamson <alex.williamson@redhat.com>:
>>>
>>>> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
>>>>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
>>>>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
>>>>>> On 22.05.14 10:23, Gavin Shan wrote:
>>>> .../...
>>>>
>>>>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>>>>>> index cb9023d..ef55682 100644
>>>>>>> --- a/include/uapi/linux/vfio.h
>>>>>>> +++ b/include/uapi/linux/vfio.h
>>>>>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
>>>>>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO    _IO(VFIO_TYPE, VFIO_BASE + 12)
>>>>>>> +/*
>>>>>>> + * EEH functionality can be enabled or disabled on one specific device.
>>>>>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
>>>>>>> + * if required.
>>>>>>> + */
>>>>>>> +struct vfio_eeh_pe_set_option {
>>>>>>> +    __u32 argsz;
>>>>>> What is this argsz thing? Is this your way of maintaining backwards
>>>>>> compatibility when we introduce new fields? A new field will change
>>>>>> the ioctl number, so I don't think that makes a lot of sense :).
>>>>>>
>>>>>> Just make the ioctl have a u32 as incoming argument. No fancy
>>>>>> structs, no complicated code.
>>>>>>
>>>>>> The same applies for a number of structs below.
>>>>> ok. Will do in next revision.
>>>> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
>>>> always has "argsz". I guess it was used as checker by Alex.W. Do you really
>>>> want remove "argsz" ?
>>>
>>> IIRC, this was actually a suggestion incorporated from David Gibson, but
>>> using _IO with an argsz and flags field we can maintain compatibility
>>> without bumping the ioctl number.  It really only makes sense if we have
>>> a flags field so we can identify what additional information is being
>>> provided.  Flags can be used as a bitmap of trailing structures or as
>>> revision if we want a set of trailing structures that may change over
>>> time.  Unless you can come up with a good argument against it that would
>>> prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
>>> prefer to keep it.  As I noted in a previous comment, the one ioctl we
>>> have for reset that doesn't take any options is likely going to be the
>>> first ioctl that we need to entirely replace.  If we don't keep argsz,
>>> it seems like we probably need a flags field and reserved structures.
>>>
>>>>>>> +    __u32 option;
>>>>>>> +};
>>>>>>> +
>>>>>>> +#define VFIO_EEH_PE_SET_OPTION        _IO(VFIO_TYPE, VFIO_BASE + 21)
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Each EEH PE should have unique address to be identified. The command
>>>>>>> + * helps to retrieve the address and the sharing mode of the PE.
>>>>>>> + */
>>>>>>> +struct vfio_eeh_pe_get_addr {
>>>>>>> +    __u32 argsz;
>>>>>>> +    __u32 option;
>>>>>>> +    __u32 info;
>>>>>> Any particular reason you need the info field? Can't the return value
>>>>>> of the ioctl hold this? Then you only have a single u32 argument left
>>>>>> to the ioctl again.
>>>>> ok. Will do in next revision.
>>>> If we eventually remove "argsz" and let ioctl() return value to hold
>>>> information (or negative number for errors), we don't need any data
>>>> struct because the 3rd parameter of ioctl() would be used as input
>>>> and I only need one input parameter. Do you want see this ?
>>>>
>>>> Hopefully, Alex.W saw this and hasn't objections :)
>>> I'm not sure why we're pushing for the minimal data set to pass to an
>>> ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,
>>>
>> The ioctl number includes sizeof(payload). So if a new parameter gets
>> added, that would be a different ioctl number.
> Not when we use _IO

I see. Now things start to make a little more sense :). But as I stated 
earlier, I'll leave the actual ioctl interface to your judgement.


Alex
Alex Williamson May 23, 2014, 2:29 p.m. UTC | #20
On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
> >On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
> >> The patch adds new IOCTL commands for VFIO PCI device to support
> >> EEH functionality for PCI devices, which have been passed through
> >> from host to somebody else via VFIO.
> >> 
> >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> >> ---
> >>  Documentation/vfio.txt         |  88 ++++++++++-
> >>  arch/powerpc/include/asm/eeh.h |  17 +++
> >>  arch/powerpc/kernel/eeh.c      | 321 +++++++++++++++++++++++++++++++++++++++++
> >>  drivers/vfio/pci/vfio_pci.c    | 131 ++++++++++++++++-
> >>  include/uapi/linux/vfio.h      |  53 +++++++
> >>  5 files changed, 603 insertions(+), 7 deletions(-)
> >
> >Maybe a chicken and egg problem, but it seems like we could split the
> >platform code and vfio code into separate patches.
> >
> 
> Ok. I'll keep egg/chicken separated in next revision.
> 
> >> 
> >> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> >> index b9ca023..dd13db6 100644
> >> --- a/Documentation/vfio.txt
> >> +++ b/Documentation/vfio.txt
> >> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
> >>  an excellent performance which has limitations such as inability to do
> >>  locked pages accounting in real time.
> >>  
> >> -So 3 additional ioctls have been added:
> >> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
> >> +which works on the basis of additional ioctl commands.
> >> +
> >> +So 8 additional ioctls have been added:
> >>  
> >>  	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
> >>  		of the DMA window on the PCI bus.
> >> @@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
> >>  
> >>  	VFIO_IOMMU_DISABLE - disables the container.
> >>  
> >> +	VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the
> >> +		specified device. Also, it can be used to remove IO or DMA
> >> +		stopped state on the frozen PE.
> >> +
> >> +	VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
> >> +		PE or query PE sharing mode.
> >> +
> >> +	VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
> >> +
> >> +	VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
> >> +		error recovering.
> >> +
> >> +	VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
> >> +		one of the major steps for error recoverying.
> >>  
> >>  The code flow from the example above should be slightly changed:
> >>  
> >> @@ -346,6 +363,75 @@ The code flow from the example above should be slightly changed:
> >>  	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
> >>  	.....
> >>  
> >> +Based on the initial example we have, the following piece of code could be
> >> +reference for EEH setup and error handling:
> >> +
> >> +	struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
> >> +	struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
> >> +	struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
> >> +	struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
> >> +	struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
> >> +
> >> +	....
> >> +
> >> +	/* Get a file descriptor for the device */
> >> +	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
> >> +
> >> +	/* Enable the EEH functionality on the device */
> >> +	option.option = EEH_OPT_ENABLE;
> >> +	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
> >> +
> >> +	/* Retrieve PE address and create and maintain PE by yourself */
> >> +	addr.option = EEH_OPT_GET_PE_ADDR;
> >> +	ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
> >> +
> >> +	/* Assure EEH is supported on the PE and make PE functional */
> >> +	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
> >> +
> >> +	/* Save device's state. pci_save_state() would be good enough
> >> +	 * as an example.
> >> +	 */
> >> +
> >> +	/* Test and setup the device */
> >> +	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
> >> +
> >> +	....
> >> +
> >> +	/* When 0xFF's returned from reading PCI config space or IO BARs
> >> +	 * of the PCI device. Check the PE state to see if that has been
> >> +	 * frozen.
> >> +	 */
> >> +	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
> >
> >There's no notification, the user needs to observe the return value an
> >poll?  Should we be enabling an eventfd to notify the user of the state
> >change?

Ok

> Yes. The user needs to monitor the return value. we should have one notification,
> but it's for later as we discussed :-)
> 
> >> +
> >> +	/* Waiting for pending PCI transactions to be completed and don't
> >> +	 * produce any more PCI traffic from/to the affected PE until
> >> +	 * recovery is finished.
> >> +	 */
> >> +
> >> +	/* Enable IO for the affected PE and collect logs. Usually, the
> >> +	 * standard part of PCI config space, AER registers are dumped
> >> +	 * as logs for further analysis.
> >> +	 */
> >> +	option.option = EEH_OPT_THAW_MMIO;
> >> +	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
> >
> >How does the guest learn about the error?  Does it need to?
> 
> When guest detects 0xFF's from reading PCI config space or IO, it's going
> check the device (PE) state. If the device (PE) has been put into frozen
> state, the recovery will be started.

No, sorry, I mean how does the user get information about the error?
The interface we have here is:
a) find that something bad has happened
b) kick it into working again
c) continue

How does the user figure out what happened and if it makes sense to
attempt to recover?  Where does the user learn that their disk is on
fire?

> >> +
> >> +	/* Issue PE reset */
> >> +	reset.option = EEH_RESET_HOT;
> >> +	ioctl(device, VFIO_EEH_PE_RESET, &reset);
> >> +
> >> +	/* Configure the PCI bridges for the affected PE */
> >> +	ioctl(device, VFIO_EEH_PE_CONFIGURE, NULL);
> >> +
> >
> >I'm not sure I see why we've split these into separate ioctls.  FWIW,
> >the one ioctl we currently have for reset that takes no options is
> >probably going to be the first to get deprecated because of it.
> >
> >> +	/* Restored state we saved at initialization time. pci_restore_state()
> >> +	 * is good enough as an example.
> >> +	 */
> >> +
> >> +	/* Hopefully, error is recovered successfully. Now, you can resume to
> >> +	 * start PCI traffic to/from the affected PE.
> >> +	 */
> >> +
> >> +	....
> >> +
> >>  -------------------------------------------------------------------------------
> >>  
> >>  [1] VFIO was originally an acronym for "Virtual Function I/O" in its
> >> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> >> index 34a2d83..dd5f1cf 100644
> >> --- a/arch/powerpc/include/asm/eeh.h
> >> +++ b/arch/powerpc/include/asm/eeh.h
> >> @@ -191,6 +191,11 @@ enum {
> >>  #define EEH_OPT_ENABLE		1	/* EEH enable	*/
> >>  #define EEH_OPT_THAW_MMIO	2	/* MMIO enable	*/
> >>  #define EEH_OPT_THAW_DMA	3	/* DMA enable	*/
> >> +#define EEH_OPT_GET_PE_ADDR	0	/* Get PE addr	*/
> >> +#define EEH_OPT_GET_PE_MODE	1	/* Get PE mode	*/
> >> +#define EEH_PE_MODE_NONE	0	/* Invalid mode	*/
> >> +#define EEH_PE_MODE_NOT_SHARED	1	/* Not shared	*/
> >> +#define EEH_PE_MODE_SHARED	2	/* Shared mode	*/
> >>  #define EEH_STATE_UNAVAILABLE	(1 << 0)	/* State unavailable	*/
> >>  #define EEH_STATE_NOT_SUPPORT	(1 << 1)	/* EEH not supported	*/
> >>  #define EEH_STATE_RESET_ACTIVE	(1 << 2)	/* Active reset		*/
> >> @@ -198,6 +203,11 @@ enum {
> >>  #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA		*/
> >>  #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled		*/
> >>  #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled		*/
> >> +#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
> >> +#define EEH_PE_STATE_RESET		1		/* PE reset	*/
> >> +#define EEH_PE_STATE_STOPPED_IO_DMA	2		/* Stopped	*/
> >> +#define EEH_PE_STATE_STOPPED_DMA	4		/* Stopped DMA	*/
> >> +#define EEH_PE_STATE_UNAVAIL		5		/* Unavailable	*/
> >>  #define EEH_RESET_DEACTIVATE	0	/* Deactivate the PE reset	*/
> >>  #define EEH_RESET_HOT		1	/* Hot reset			*/
> >>  #define EEH_RESET_FUNDAMENTAL	3	/* Fundamental reset		*/
> >> @@ -305,6 +315,13 @@ void eeh_add_device_late(struct pci_dev *);
> >>  void eeh_add_device_tree_late(struct pci_bus *);
> >>  void eeh_add_sysfs_files(struct pci_bus *);
> >>  void eeh_remove_device(struct pci_dev *);
> >> +int eeh_dev_open(struct pci_dev *pdev);
> >> +void eeh_dev_release(struct pci_dev *pdev);
> >> +int eeh_pe_set_option(struct pci_dev *pdev, int option);
> >> +int eeh_pe_get_addr(struct pci_dev *pdev, int option);
> >> +int eeh_pe_get_state(struct pci_dev *pdev);
> >> +int eeh_pe_reset(struct pci_dev *pdev, int option);
> >> +int eeh_pe_configure(struct pci_dev *pdev);
> >>  
> >>  /**
> >>   * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
> >> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> >> index 9c6b899..b90a474 100644
> >> --- a/arch/powerpc/kernel/eeh.c
> >> +++ b/arch/powerpc/kernel/eeh.c
> >> @@ -108,6 +108,9 @@ struct eeh_ops *eeh_ops = NULL;
> >>  /* Lock to avoid races due to multiple reports of an error */
> >>  DEFINE_RAW_SPINLOCK(confirm_error_lock);
> >>  
> >> +/* Lock to protect passed flags */
> >> +static DEFINE_MUTEX(eeh_dev_mutex);
> >> +
> >>  /* Buffer for reporting pci register dumps. Its here in BSS, and
> >>   * not dynamically alloced, so that it ends up in RMO where RTAS
> >>   * can access it.
> >> @@ -1098,6 +1101,324 @@ void eeh_remove_device(struct pci_dev *dev)
> >>  	edev->mode &= ~EEH_DEV_SYSFS;
> >>  }
> >>  
> >> +/**
> >> + * eeh_dev_open - Mark EEH device and PE as passed through
> >> + * @pdev: PCI device
> >> + *
> >> + * Mark the indicated EEH device and PE as passed through.
> >> + * In the result, the EEH errors detected on the PE won't be
> >> + * reported. The owner of the device will be responsible for
> >> + * detection and recovery.
> >> + */
> >> +int eeh_dev_open(struct pci_dev *pdev)
> >> +{
> >> +	struct eeh_dev *edev;
> >> +
> >> +	mutex_lock(&eeh_dev_mutex);
> >> +
> >> +	/* No PCI device ? */
> >> +	if (!pdev) {
> >> +		mutex_unlock(&eeh_dev_mutex);
> >> +		return -ENODEV;
> >> +	}
> >> +
> >> +	/* No EEH device ? */
> >> +	edev = pci_dev_to_eeh_dev(pdev);
> >> +	if (!edev || !edev->pe) {
> >> +		mutex_unlock(&eeh_dev_mutex);
> >> +		return -ENODEV;
> >> +	}
> >> +
> >> +	eeh_dev_set_passed(edev, true);
> >> +	eeh_pe_set_passed(edev->pe, true);
> >> +	mutex_unlock(&eeh_dev_mutex);
> >> +
> >> +	return 0;
> >> +}
> >> +EXPORT_SYMBOL_GPL(eeh_dev_open);
> >> +
> >> +/**
> >> + * eeh_dev_release - Reclaim the ownership of EEH device
> >> + * @pdev: PCI device
> >> + *
> >> + * Reclaim ownership of EEH device, potentially the corresponding
> >> + * PE. In the result, the EEH errors detected on the PE will be
> >> + * reported and handled as usual.
> >> + */
> >> +void eeh_dev_release(struct pci_dev *pdev)
> >> +{
> >> +	bool release_pe = true;
> >> +	struct eeh_pe *pe = NULL;
> >> +	struct eeh_dev *tmp, *edev;
> >> +
> >> +	mutex_lock(&eeh_dev_mutex);
> >> +
> >> +	/* No PCI device ? */
> >> +	if (!pdev) {
> >> +		mutex_unlock(&eeh_dev_mutex);
> >> +		return;
> >> +	}
> >> +
> >> +	/* No EEH device ? */
> >> +	edev = pci_dev_to_eeh_dev(pdev);
> >> +	if (!edev || !eeh_dev_passed(edev) ||
> >> +	    !edev->pe || !eeh_pe_passed(pe)) {
> >> +		mutex_unlock(&eeh_dev_mutex);
> >> +		return;
> >> +	}
> >> +
> >> +	/* Release device */
> >> +	pe = edev->pe;
> >> +	eeh_dev_set_passed(edev, false);
> >> +
> >> +	/* Release PE */
> >> +	eeh_pe_for_each_dev(pe, edev, tmp) {
> >> +		if (eeh_dev_passed(edev)) {
> >> +			release_pe = false;
> >> +			break;
> >> +		}
> >> +	}
> >> +
> >> +	if (release_pe)
> >> +		eeh_pe_set_passed(pe, false);
> >> +
> >> +	mutex_unlock(&eeh_dev_mutex);
> >> +}
> >> +EXPORT_SYMBOL(eeh_dev_release);
> >> +
> >> +static int eeh_dev_check(struct pci_dev *pdev,
> >> +			 struct eeh_dev **pedev,
> >> +			 struct eeh_pe **ppe)
> >> +{
> >> +	struct eeh_dev *edev;
> >> +
> >> +	/* No device ? */
> >> +	if (!pdev)
> >> +		return -ENODEV;
> >> +
> >> +	edev = pci_dev_to_eeh_dev(pdev);
> >> +	if (!edev || !eeh_dev_passed(edev) ||
> >> +	    !edev->pe || !eeh_pe_passed(edev->pe))
> >> +		return -ENODEV;
> >> +
> >> +	if (pedev)
> >> +		*pedev = edev;
> >> +	if (ppe)
> >> +		*ppe = edev->pe;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +/**
> >> + * eeh_pe_set_option - Set options for the indicated PE
> >> + * @pdev: PCI device
> >> + * @option: requested option
> >> + *
> >> + * The routine is called to enable or disable EEH functionality
> >> + * on the indicated PE, to enable IO or DMA for the frozen PE.
> >> + */
> >> +int eeh_pe_set_option(struct pci_dev *pdev, int option)
> >> +{
> >> +	struct eeh_dev *edev;
> >> +	struct eeh_pe *pe;
> >> +	int ret = 0;
> >> +
> >> +	/* Device existing ? */
> >> +	ret = eeh_dev_check(pdev, &edev, &pe);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	switch (option) {
> >> +	case EEH_OPT_DISABLE:
> >> +	case EEH_OPT_ENABLE:
> >> +		break;
> >> +	case EEH_OPT_THAW_MMIO:
> >> +	case EEH_OPT_THAW_DMA:
> >> +		if (!eeh_ops || !eeh_ops->set_option) {
> >> +			ret = -ENOENT;
> >> +			break;
> >> +		}
> >> +
> >> +		ret = eeh_ops->set_option(pe, option);
> >> +		break;
> >> +	default:
> >> +		pr_debug("%s: Option %d out of range (%d, %d)\n",
> >> +			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
> >> +		ret = -EINVAL;
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(eeh_pe_set_option);
> >> +
> >> +/**
> >> + * eeh_pe_get_addr - Retrieve the PE address or sharing mode
> >> + * @pdev: PCI device
> >> + * @option: option
> >> + *
> >> + * Retrieve the PE address or sharing mode.
> >> + */
> >> +int eeh_pe_get_addr(struct pci_dev *pdev, int option)
> >> +{
> >> +	struct pci_bus *bus;
> >> +	struct eeh_dev *edev;
> >> +	struct eeh_pe *pe;
> >> +	int ret = 0;
> >> +
> >> +	/* Device existing ? */
> >> +	ret = eeh_dev_check(pdev, &edev, &pe);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	switch (option) {
> >> +	case EEH_OPT_GET_PE_ADDR:
> >> +		bus = eeh_pe_bus_get(pe);
> >> +		if (!bus) {
> >> +			ret = -ENODEV;
> >> +			break;
> >> +		}
> >> +
> >> +		/* PE address has format "00BBSS00" */
> >> +		ret = bus->number << 16;
> >> +		break;
> >> +	case EEH_OPT_GET_PE_MODE:
> >> +		/* Wa always have shared PE */
> >> +		ret = EEH_PE_MODE_SHARED;
> >> +		break;
> >> +	default:
> >> +		pr_debug("%s: option %d out of range (0, 1)\n",
> >> +			__func__, option);
> >> +		ret = -EINVAL;
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(eeh_pe_get_addr);
> >> +
> >> +/**
> >> + * eeh_pe_get_state - Retrieve PE's state
> >> + * @pdev: PCI device
> >> + *
> >> + * Retrieve the PE's state, which includes 3 aspects: enabled
> >> + * DMA, enabled IO and asserted reset.
> >> + */
> >> +int eeh_pe_get_state(struct pci_dev *pdev)
> >> +{
> >> +	struct eeh_dev *edev;
> >> +	struct eeh_pe *pe;
> >> +	int result, ret = 0;
> >> +
> >> +	/* Device existing ? */
> >> +	ret = eeh_dev_check(pdev, &edev, &pe);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	if (!eeh_ops || !eeh_ops->get_state)
> >> +		return -ENOENT;
> >> +
> >> +	result = eeh_ops->get_state(pe, NULL);
> >> +	if (!(result & EEH_STATE_RESET_ACTIVE) &&
> >> +	     (result & EEH_STATE_DMA_ENABLED) &&
> >> +	     (result & EEH_STATE_MMIO_ENABLED))
> >> +		ret = EEH_PE_STATE_NORMAL;
> >> +	else if (result & EEH_STATE_RESET_ACTIVE)
> >> +		ret = EEH_PE_STATE_RESET;
> >> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> >> +		 !(result & EEH_STATE_DMA_ENABLED) &&
> >> +		 !(result & EEH_STATE_MMIO_ENABLED))
> >> +		ret = EEH_PE_STATE_STOPPED_IO_DMA;
> >> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> >> +		 (result & EEH_STATE_DMA_ENABLED) &&
> >> +		 !(result & EEH_STATE_MMIO_ENABLED))
> >> +		ret = EEH_PE_STATE_STOPPED_DMA;
> >> +	else
> >> +		ret = EEH_PE_STATE_UNAVAIL;
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(eeh_pe_get_state);
> >> +
> >> +/**
> >> + * eeh_pe_reset - Issue PE reset according to specified type
> >> + * @pdev: PCI device
> >> + * @option: reset type
> >> + *
> >> + * The routine is called to reset the specified PE with the
> >> + * indicated type, either fundamental reset or hot reset.
> >> + * PE reset is the most important part for error recovery.
> >> + */
> >> +int eeh_pe_reset(struct pci_dev *pdev, int option)
> >> +{
> >> +	struct eeh_dev *edev;
> >> +	struct eeh_pe *pe;
> >> +	int ret = 0;
> >> +
> >> +	/* Device existing ? */
> >> +	ret = eeh_dev_check(pdev, &edev, &pe);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
> >> +		return -ENOENT;
> >> +
> >> +	switch (option) {
> >> +	case EEH_RESET_DEACTIVATE:
> >> +		ret = eeh_ops->reset(pe, option);
> >> +		if (ret)
> >> +			break;
> >> +
> >> +		/*
> >> +		 * The PE is still in frozen state and we need clear
> >> +		 * that. It's good to clear frozen state after deassert
> >> +		 * to avoid messy IO access during reset, which might
> >> +		 * cause recursive frozen PE.
> >> +		 */
> >> +		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
> >> +		if (!ret)
> >> +			ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
> >> +		if (!ret)
> >> +			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
> >> +		break;
> >> +	case EEH_RESET_HOT:
> >> +	case EEH_RESET_FUNDAMENTAL:
> >> +		ret = eeh_ops->reset(pe, option);
> >> +		break;
> >> +	default:
> >> +		pr_debug("%s: Unsupported option %d\n",
> >> +			__func__, option);
> >> +		ret = -EINVAL;
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(eeh_pe_reset);
> >> +
> >> +/**
> >> + * eeh_pe_configure - Configure PCI bridges after PE reset
> >> + * @pdev: PCI device
> >> + *
> >> + * The routine is called to restore the PCI config space for
> >> + * those PCI devices, especially PCI bridges affected by PE
> >> + * reset issued previously.
> >> + */
> >> +int eeh_pe_configure(struct pci_dev *pdev)
> >> +{
> >> +	struct eeh_dev *edev;
> >> +	struct eeh_pe *pe;
> >> +	int ret = 0;
> >> +
> >> +	/* Device existing ? */
> >> +	ret = eeh_dev_check(pdev, &edev, &pe);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	/* Restore config space for the affected devices */
> >> +	eeh_pe_restore_bars(pe);
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(eeh_pe_configure);
> >> +
> >>  static int proc_eeh_show(struct seq_file *m, void *v)
> >>  {
> >>  	if (!eeh_enabled()) {
> >> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> >> index 7ba0424..301ac18 100644
> >> --- a/drivers/vfio/pci/vfio_pci.c
> >> +++ b/drivers/vfio/pci/vfio_pci.c
> >> @@ -25,6 +25,9 @@
> >>  #include <linux/types.h>
> >>  #include <linux/uaccess.h>
> >>  #include <linux/vfio.h>
> >> +#ifdef CONFIG_EEH
> >> +#include <asm/eeh.h>
> >> +#endif
> >
> >Can we make a vfio_pci_eeh file that properly stubs out anything called
> >from common code?  I don't want to see all these inline ifdefs.
> >
> 
> well. do you want see drivers/vfio/vfio-pci/vfio_pci_eeh.c and export
> following functins for vfio_pci.c to use?
> 
> vfio_pci_eeh_open()
> vfio_pci_eeh_release()
> vfio_pci_eeh_ioctl()

Yes

> >>  
> >>  #include "vfio_pci_private.h"
> >>  
> >> @@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
> >>  	pci_restore_state(pdev);
> >>  }
> >>  
> >> +static void vfio_eeh_pci_release(struct pci_dev *pdev)
> >> +{
> >> +#ifdef CONFIG_EEH
> >> +	eeh_dev_release(pdev);
> >> +#endif
> >> +}
> >> +
> >>  static void vfio_pci_release(void *device_data)
> >>  {
> >>  	struct vfio_pci_device *vdev = device_data;
> >>  
> >> -	if (atomic_dec_and_test(&vdev->refcnt))
> >> +	if (atomic_dec_and_test(&vdev->refcnt)) {
> >> +		vfio_eeh_pci_release(vdev->pdev);
> >>  		vfio_pci_disable(vdev);
> >> +	}
> >>  
> >>  	module_put(THIS_MODULE);
> >>  }
> >>  
> >> +static int vfio_eeh_pci_open(struct pci_dev *pdev)
> >> +{
> >> +	int ret = 0;
> >> +
> >> +#ifdef CONFIG_EEH
> >> +	ret = eeh_dev_open(pdev);
> >> +#endif
> >> +	return ret;
> >> +}
> >> +
> >>  static int vfio_pci_open(void *device_data)
> >>  {
> >>  	struct vfio_pci_device *vdev = device_data;
> >> +	int ret;
> >>  
> >>  	if (!try_module_get(THIS_MODULE))
> >>  		return -ENODEV;
> >>  
> >>  	if (atomic_inc_return(&vdev->refcnt) == 1) {
> >> -		int ret = vfio_pci_enable(vdev);
> >> -		if (ret) {
> >> -			module_put(THIS_MODULE);
> >> -			return ret;
> >> -		}
> >> +		ret = vfio_pci_enable(vdev);
> >> +		if (ret)
> >> +			goto error;
> >> +
> >> +		ret = vfio_eeh_pci_open(vdev->pdev);
> >> +		if (ret)
> >> +			goto error;
> >>  	}
> >>  
> >>  	return 0;
> >> +error:
> >> +	module_put(THIS_MODULE);
> >> +	return ret;
> >>  }
> >>  
> >>  static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> >> @@ -321,6 +349,91 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
> >>  	return walk.ret;
> >>  }
> >>  
> >> +static int vfio_eeh_pci_ioctl(struct pci_dev *pdev,
> >> +			      unsigned int cmd,
> >> +			      unsigned long arg)
> >> +{
> >> +	unsigned long minsz;
> >> +	int ret = 0;
> >> +
> >> +#ifdef CONFIG_EEH
> >> +	switch (cmd) {
> >> +	case VFIO_EEH_PE_SET_OPTION: {
> >> +		struct vfio_eeh_pe_set_option option;
> >> +
> >> +		minsz = offsetofend(struct vfio_eeh_pe_set_option, option);
> >> +		if (copy_from_user(&option, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +		if (option.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		ret = eeh_pe_set_option(pdev, option.option);
> >> +		break;
> >> +	}
> >> +	case VFIO_EEH_PE_GET_ADDR: {
> >> +		struct vfio_eeh_pe_get_addr addr;
> >> +
> >> +		minsz = offsetofend(struct vfio_eeh_pe_get_addr, info);
> >> +		if (copy_from_user(&addr, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +		if (addr.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		ret = eeh_pe_get_addr(pdev, addr.option);
> >> +		if (ret >= 0) {
> >> +			addr.info = ret;
> >> +			if (copy_to_user((void __user *)arg, &addr, minsz))
> >> +				return -EFAULT;
> >> +			ret = 0;
> >> +		}
> >> +
> >> +		break;
> >> +	}
> >> +	case VFIO_EEH_PE_GET_STATE: {
> >> +		struct vfio_eeh_pe_get_state state;
> >> +
> >> +		minsz = offsetofend(struct vfio_eeh_pe_get_state, state);
> >> +		if (copy_from_user(&state, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +		if (state.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		ret = eeh_pe_get_state(pdev);
> >> +		if (ret >= 0) {
> >> +			state.state = ret;
> >> +			if (copy_to_user((void __user *)arg, &state, minsz))
> >> +				return -EFAULT;
> >> +			ret = 0;
> >> +		}
> >> +		break;
> >> +	}
> >> +	case VFIO_EEH_PE_RESET: {
> >> +		struct vfio_eeh_pe_reset reset;
> >> +
> >> +		minsz = offsetofend(struct vfio_eeh_pe_reset, option);
> >> +		if (copy_from_user(&reset, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +		if (reset.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		ret = eeh_pe_reset(pdev, reset.option);
> >> +		break;
> >> +	}
> >> +	case VFIO_EEH_PE_CONFIGURE:
> >> +		ret = eeh_pe_configure(pdev);
> >> +		break;
> >> +	default:
> >> +		ret = -EINVAL;
> >> +		pr_debug("%s: Cannot handle command %d\n",
> >> +			__func__, cmd);
> >> +	}
> >> +#else
> >> +	ret = -ENOENT;
> >> +#endif
> >> +
> >> +	return ret;
> >> +}
> >> +
> >>  static long vfio_pci_ioctl(void *device_data,
> >>  			   unsigned int cmd, unsigned long arg)
> >>  {
> >> @@ -682,6 +795,12 @@ hot_reset_release:
> >>  
> >>  		kfree(groups);
> >>  		return ret;
> >> +	} else if (cmd == VFIO_EEH_PE_SET_OPTION ||
> >> +		   cmd == VFIO_EEH_PE_GET_ADDR ||
> >> +		   cmd == VFIO_EEH_PE_GET_STATE ||
> >> +		   cmd == VFIO_EEH_PE_RESET ||
> >> +		   cmd == VFIO_EEH_PE_CONFIGURE) {
> >> +			return vfio_eeh_pci_ioctl(vdev->pdev, cmd, arg);
> >>  	}
> >>  
> >>  	return -ENOTTY;
> >> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >> index cb9023d..ef55682 100644
> >> --- a/include/uapi/linux/vfio.h
> >> +++ b/include/uapi/linux/vfio.h
> >> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
> >>  
> >>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
> >>  
> >> +/*
> >> + * EEH functionality can be enabled or disabled on one specific device.
> >> + * Also, the DMA or IO frozen state can be removed from the frozen PE
> >> + * if required.
> >> + */
> >> +struct vfio_eeh_pe_set_option {
> >> +	__u32 argsz;
> >> +	__u32 option;
> >> +};
> >> +
> >> +#define VFIO_EEH_PE_SET_OPTION		_IO(VFIO_TYPE, VFIO_BASE + 21)
> >
> >What proposed ioctls are making you jump to 21?
> >
> >argsz is probably not as useful without a flags field.  Otherwise the
> >caller can't indicate what the extra space is.
> >
> 
> The QEMU patches are based on Alexey's additional feature ("ddw"), which
> consumed several ioctl commands.
> 
> ok. So you also prefer to remove "argsz"?

No, I prefer to stay consistent with the rest of the VFIO API and use
argsz + flags.

> >> +
> >> +/*
> >> + * Each EEH PE should have unique address to be identified. The command
> >> + * helps to retrieve the address and the sharing mode of the PE.
> >> + */
> >> +struct vfio_eeh_pe_get_addr {
> >> +	__u32 argsz;
> >> +	__u32 option;
> >> +	__u32 info;
> >> +};
> >> +
> >> +#define VFIO_EEH_PE_GET_ADDR		_IO(VFIO_TYPE, VFIO_BASE + 22)
> >> +
> >> +/*
> >> + * EEH PE might have been frozen because of PCI errors. Also, it might
> >> + * be experiencing reset for error revoery. The following command helps
> >> + * to get the state.
> >> + */
> >> +struct vfio_eeh_pe_get_state {
> >> +	__u32 argsz;
> >> +	__u32 state;
> >> +};
> >> +
> >> +#define VFIO_EEH_PE_GET_STATE		_IO(VFIO_TYPE, VFIO_BASE + 23)
> >> +
> >> +/*
> >> + * Reset is the major step to recover problematic PE. The following
> >> + * command helps on that.
> >> + */
> >> +struct vfio_eeh_pe_reset {
> >> +	__u32 argsz;
> >> +	__u32 option;
> >> +};
> >> +
> >> +#define VFIO_EEH_PE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 24)
> >> +
> >> +/*
> >> + * One of the steps for recovery after PE reset is to configure the
> >> + * PCI bridges affected by the PE reset.
> >> + */
> >> +#define VFIO_EEH_PE_CONFIGURE		_IO(VFIO_TYPE, VFIO_BASE + 25)
> >
> >What can the user do differently by making these separate ioctls?
> >
> 
> hrm, I didn't understood as well. Alex.G could have the explaination.

As agraf noted, I'm asking why reset and configure are separate when
they seem to be used together.  Thanks,

Alex

> >> +
> >>  /* ***************************************************************** */
> >>  
> >>  #endif /* _UAPIVFIO_H */
> 
> Thanks,
> Gavin
>
Alex Williamson May 23, 2014, 2:36 p.m. UTC | #21
On Fri, 2014-05-23 at 15:00 +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
> > >There's no notification, the user needs to observe the return value an
> > >poll?  Should we be enabling an eventfd to notify the user of the state
> > >change?
> > >
> > 
> > Yes. The user needs to monitor the return value. we should have one notification,
> > but it's for later as we discussed :-)
> 
>  ../..
> 
> > >How does the guest learn about the error?  Does it need to?
> > 
> > When guest detects 0xFF's from reading PCI config space or IO, it's going
> > check the device (PE) state. If the device (PE) has been put into frozen
> > state, the recovery will be started.
> 
> Quick recap for Alex W (we discussed that with Alex G).
> 
> While a notification looks like a worthwhile addition in the long run, it
> is not sufficient and not used today and I prefer that we keep that as something
> to add later for those two main reasons:
> 
>  - First, the kernel itself isn't always notified. For example, if we implement
> on top of an RTAS backend (PR KVM under pHyp) or if we are on top of PowerNV but
> the error is a PHB "fence" (the entire PCI Host bridge gets fenced out in hardware
> due to an internal error), then we get no notification. Only polling of the
> hardware or firmware will tell us. Since we don't want to have a polling timer
> in the kernel, that means that the userspace client of VFIO (or alternatively
> the KVM guest) is the one that polls.
> 
>  - Second, this is how our primary user expects it: The primary (and only initial)
> user of this will be qemu/KVM for PAPR guests and they don't have a notification
> mechanism. Instead they query the EEH state after detecting an all 1's return from
> MMIO or config space. This is how PAPR specifies it so we are just implementing the
> spec here :-)
> 
> Because of these, I think we shouldn't worry too much about notification at
> this stage.

Ok, I was asking more about an error log that indicates what error
occurred to freeze the hardware so that the user can make a more
educated guess whether recovery is an option.  Given that you have cases
where there may be no notification and your guest/user already handles
this, the plan to start with polling makes sense.  Thanks,

Alex
Alex Williamson May 23, 2014, 2:49 p.m. UTC | #22
On Fri, 2014-05-23 at 14:30 +0200, Alexander Graf wrote:
> On 23.05.14 13:58, Gavin Shan wrote:
> > On Fri, May 23, 2014 at 08:52:23AM +0200, Alexander Graf wrote:
> >>
> >>> Am 23.05.2014 um 05:23 schrieb Alex Williamson <alex.williamson@redhat.com>:
> >>>
> >>>> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
> >>>>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
> >>>>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
> >>>>>> On 22.05.14 10:23, Gavin Shan wrote:
> >>>> .../...
> >>>>
> >>>>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >>>>>>> index cb9023d..ef55682 100644
> >>>>>>> --- a/include/uapi/linux/vfio.h
> >>>>>>> +++ b/include/uapi/linux/vfio.h
> >>>>>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
> >>>>>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO    _IO(VFIO_TYPE, VFIO_BASE + 12)
> >>>>>>> +/*
> >>>>>>> + * EEH functionality can be enabled or disabled on one specific device.
> >>>>>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
> >>>>>>> + * if required.
> >>>>>>> + */
> >>>>>>> +struct vfio_eeh_pe_set_option {
> >>>>>>> +    __u32 argsz;
> >>>>>> What is this argsz thing? Is this your way of maintaining backwards
> >>>>>> compatibility when we introduce new fields? A new field will change
> >>>>>> the ioctl number, so I don't think that makes a lot of sense :).
> >>>>>>
> >>>>>> Just make the ioctl have a u32 as incoming argument. No fancy
> >>>>>> structs, no complicated code.
> >>>>>>
> >>>>>> The same applies for a number of structs below.
> >>>>> ok. Will do in next revision.
> >>>> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
> >>>> always has "argsz". I guess it was used as checker by Alex.W. Do you really
> >>>> want remove "argsz" ?
> >>>
> >>> IIRC, this was actually a suggestion incorporated from David Gibson, but
> >>> using _IO with an argsz and flags field we can maintain compatibility
> >>> without bumping the ioctl number.  It really only makes sense if we have
> >>> a flags field so we can identify what additional information is being
> >>> provided.  Flags can be used as a bitmap of trailing structures or as
> >>> revision if we want a set of trailing structures that may change over
> >>> time.  Unless you can come up with a good argument against it that would
> >>> prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
> >>> prefer to keep it.  As I noted in a previous comment, the one ioctl we
> >>> have for reset that doesn't take any options is likely going to be the
> >>> first ioctl that we need to entirely replace.  If we don't keep argsz,
> >>> it seems like we probably need a flags field and reserved structures.
> >>>
> >>>>>>> +    __u32 option;
> >>>>>>> +};
> >>>>>>> +
> >>>>>>> +#define VFIO_EEH_PE_SET_OPTION        _IO(VFIO_TYPE, VFIO_BASE + 21)
> >>>>>>> +
> >>>>>>> +/*
> >>>>>>> + * Each EEH PE should have unique address to be identified. The command
> >>>>>>> + * helps to retrieve the address and the sharing mode of the PE.
> >>>>>>> + */
> >>>>>>> +struct vfio_eeh_pe_get_addr {
> >>>>>>> +    __u32 argsz;
> >>>>>>> +    __u32 option;
> >>>>>>> +    __u32 info;
> >>>>>> Any particular reason you need the info field? Can't the return value
> >>>>>> of the ioctl hold this? Then you only have a single u32 argument left
> >>>>>> to the ioctl again.
> >>>>> ok. Will do in next revision.
> >>>> If we eventually remove "argsz" and let ioctl() return value to hold
> >>>> information (or negative number for errors), we don't need any data
> >>>> struct because the 3rd parameter of ioctl() would be used as input
> >>>> and I only need one input parameter. Do you want see this ?
> >>>>
> >>>> Hopefully, Alex.W saw this and hasn't objections :)
> >>> I'm not sure why we're pushing for the minimal data set to pass to an
> >>> ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,
> >>>
> >> The ioctl number includes sizeof(payload). So if a new parameter gets added, that would be a different ioctl number.
> >>
> >> If you want to maintain backwards compatibility ioctl number wise in the kernel, you'll have to have a "flags" field to indicate whether new data is available and a "pad" field, prefarably in a union, that ensures the size of the struct doesn't change.
> >>
> >> I'm not sure it's really necessary here to have identical ioctl numbers if we add parameters, since we can always just define a new ioctl with a bigger payload that can then become the default handler and a shim backwards compatible handler with the old number.
> >>
> >> But if you think it is important, let's do it for real, not just halfway.
> >>
> > So I need add additional field "flags" ?
> 
> This is Alex' call on how he prefers the interface to look like. The 
> struct size that you pass into an ioctl is part of the ioctl number,

Only when using _IOR/W/RW, the size is not part of the ioctl number for
_IO, which is why VFIO uses _IO for interfaces with argsz + flags.

>  so 
> putting that into an argsz field only makes sense when you do ugly 
> things like
> 
> struct foo *x;
> uint32_t *y;
> 
> x = malloc(sizeof(*x) + sizeof(*y));
> y = (void*)&x[1];
> 
> ioctl(..., x);
> 
> But I'd prefer not to have such nasty code really. It breaks assumptions 
> on *anything* that wraps ioctls, such as strace or qemu's linux-user 
> emulation.

If there's a reason to use something other than _IO+argsz+flags, we can
discuss it, but this is not it.

> >   Also, I need keep the return value from
> > ioctl() less or equal to 0 ? :-)
> 
> Usually return values are
> 
>    < 0 means error
>    == 0 means success
>    > 0 means return payload

Agree, >0 only makes sense if the return value of the ioctl can be
expressed as a positive int, ex. VFIO_EEH_ERROR_COUNT (if such a thing
was needed).  Thanks,

Alex
Gavin Shan May 24, 2014, 1:37 a.m. UTC | #23
On Fri, May 23, 2014 at 08:49:19AM -0600, Alex Williamson wrote:
>On Fri, 2014-05-23 at 14:30 +0200, Alexander Graf wrote:
>> On 23.05.14 13:58, Gavin Shan wrote:
>> > On Fri, May 23, 2014 at 08:52:23AM +0200, Alexander Graf wrote:
>> >>> Am 23.05.2014 um 05:23 schrieb Alex Williamson <alex.williamson@redhat.com>:
>> >>>> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
>> >>>>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
>> >>>>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
>> >>>>>> On 22.05.14 10:23, Gavin Shan wrote:

.../...

>
>If there's a reason to use something other than _IO+argsz+flags, we can
>discuss it, but this is not it.
>

If I just need a command, need I define a struct for the command, or I
should pass "NULL" for the 3rd parameter of ioctl() ?

struct foo {
	uint32_t argsz;
	uint32_t flags;
};

>> >   Also, I need keep the return value from
>> > ioctl() less or equal to 0 ? :-)
>> 
>> Usually return values are
>> 
>>    < 0 means error
>>    == 0 means success
>>    > 0 means return payload
>
>Agree, >0 only makes sense if the return value of the ioctl can be
>expressed as a positive int, ex. VFIO_EEH_ERROR_COUNT (if such a thing
>was needed).  Thanks,
>

What if I have to return "0" ? For "0", I need have a field in the
data struct to carry it. For "> 0", to carry it with return value
from ioctl(). Why not make it unified to have additional field in
the data struct to carry the return values. In this case, ioctl()
just returns:

<  0: error
== 0: success, check return value from the data struct.

Thanks,
Gavin
Gavin Shan May 24, 2014, 1:46 a.m. UTC | #24
On Fri, May 23, 2014 at 02:49:13PM +0200, Alexander Graf wrote:

.../...

>
>VFIO_EEH_RECOVER?
>

Then it should contain VFIO_EEH_PE_RESET, which is part of the
recovery. I don't expect the recovery to be done with one ioctl
command :-)

I personally prefer to keep VFIO_EEH_PE_CONFIUGRE, which means
the PE need configurtion/reconfiguration before error log retrieval
or after PE reset as defined in PAPR spec.

>So what if user space accesses config space while the device is
>broken? What if it accesses an mmap'ed BAR while the device is in
>broken state and BARs haven't been recovered yet?
>

If the "broken" means device can't funtion properly, any config,
IO, DMA traffic would cause "frozen" device (PE).

If the "broken" means "frozen" here. When the device is put into
frozen state, the config or IO access are dropped from hardware or
software level. So those requests can't reach to hardware under the
situation.

Thanks,
Gavin
Gavin Shan May 24, 2014, 2:06 a.m. UTC | #25
On Fri, May 23, 2014 at 08:29:59AM -0600, Alex Williamson wrote:
>On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
>> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
>> >On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:

.../...

>No, sorry, I mean how does the user get information about the error?
>The interface we have here is:
>a) find that something bad has happened
>b) kick it into working again
>c) continue
>
>How does the user figure out what happened and if it makes sense to
>attempt to recover?  Where does the user learn that their disk is on
>fire?
>

When 0xFF's returned from config or IO read, user should check the
device (PE)'s state with ioctl command VFIO_EEH_PE_GET_STATE. If the
device (PE) has been put into "frozen" state, It's confirmed the device
("disk" you mentioned) is on fire. User should kick off recovery, which
includes:

- User stops any operatins (config, IO, DMA) on the device because any
  PCI traffic to "frozen" device will be dropped from software or hardware
  level. Also, we don't expect DMA traffic during recovery. Otherwise,
  we will bump into recursive errors and the recovery should fail.
- VFIO_EEH_PE_SET_OPTION to enable I/O path ("DMA" path is still under frozen
  state). EEH_VFIO_PE_CONFIGURE to reconfigure affected PCI bridges and then
  do error log retrieval.
- VFIO_EEH_PE_RESET to reset the affected device (PE). EEH_VFIO_PE_CONFIUGRE
  to restore BARs.
- User resumes the device to start PCI traffic and device is brought to
  funtional state.

.../...

>
>No, I prefer to stay consistent with the rest of the VFIO API and use
>argsz + flags.
>

Here's the recap for previous reply: I have several cases for ioctl().

- ioctl(fd, cmd, NULL):   I needn't any input info.
- ioctl(fd, cmd, &data):  I need input info

For all the cases, should I simply have a data struct to include "argsz+flags"?

For return value from ioctl(), can we simply to have additional field in the
above data struct to carry it? "0" is the information I have to return for
some of the cases.


.../...

>As agraf noted, I'm asking why reset and configure are separate when
>they seem to be used together.
>

Ok. It's the recap: they're 2 separate steps of error recovery as
defined in PAPR spec. Also, they correspond to 2 separate RTAS calls.
So I don't think we can put them together.

Thanks,
Gavin
Alex Williamson May 27, 2014, 5:39 p.m. UTC | #26
On Sat, 2014-05-24 at 12:06 +1000, Gavin Shan wrote:
> On Fri, May 23, 2014 at 08:29:59AM -0600, Alex Williamson wrote:
> >On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
> >> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
> >> >On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
> 
> .../...
> 
> >No, sorry, I mean how does the user get information about the error?
> >The interface we have here is:
> >a) find that something bad has happened
> >b) kick it into working again
> >c) continue
> >
> >How does the user figure out what happened and if it makes sense to
> >attempt to recover?  Where does the user learn that their disk is on
> >fire?
> >
> 
> When 0xFF's returned from config or IO read, user should check the
> device (PE)'s state with ioctl command VFIO_EEH_PE_GET_STATE. If the
> device (PE) has been put into "frozen" state, It's confirmed the device
> ("disk" you mentioned) is on fire.

No, this only confirms that something bad happened, not _what_ bad thing
happened.

>  User should kick off recovery, which
> includes:

And here you're just describing the kick operation again...

> 
> - User stops any operatins (config, IO, DMA) on the device because any
>   PCI traffic to "frozen" device will be dropped from software or hardware
>   level. Also, we don't expect DMA traffic during recovery. Otherwise,
>   we will bump into recursive errors and the recovery should fail.
> - VFIO_EEH_PE_SET_OPTION to enable I/O path ("DMA" path is still under frozen
>   state). EEH_VFIO_PE_CONFIGURE to reconfigure affected PCI bridges and then
>   do error log retrieval.

These logs, where do they go?  How does the user get access?  That's
what I'm trying to ask about.

> - VFIO_EEH_PE_RESET to reset the affected device (PE). EEH_VFIO_PE_CONFIUGRE
>   to restore BARs.
> - User resumes the device to start PCI traffic and device is brought to
>   funtional state.
> 
> .../...
> 
> >
> >No, I prefer to stay consistent with the rest of the VFIO API and use
> >argsz + flags.
> >
> 
> Here's the recap for previous reply: I have several cases for ioctl().
> 
> - ioctl(fd, cmd, NULL):   I needn't any input info.
> - ioctl(fd, cmd, &data):  I need input info
> 
> For all the cases, should I simply have a data struct to include "argsz+flags"?

Anything that requires data should have argsz+flags, if it doesn't
require data, it doesn't need them, but think long an hard about whether
there's any possibility that we'll need parameters in the future.

> For return value from ioctl(), can we simply to have additional field in the
> above data struct to carry it? "0" is the information I have to return for
> some of the cases.

If for instance your ioctl is returning something like "number of
errors", then it's perfectly fine to use that as the ioctl return.  <0
is error, >= zero is a success with value.
diff mbox

Patch

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..dd13db6 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,10 @@  faster, the map/unmap handling has been implemented in real mode which provides
 an excellent performance which has limitations such as inability to do
 locked pages accounting in real time.
 
-So 3 additional ioctls have been added:
+4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
+which works on the basis of additional ioctl commands.
+
+So 8 additional ioctls have been added:
 
 	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
 		of the DMA window on the PCI bus.
@@ -316,6 +319,20 @@  So 3 additional ioctls have been added:
 
 	VFIO_IOMMU_DISABLE - disables the container.
 
+	VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the
+		specified device. Also, it can be used to remove IO or DMA
+		stopped state on the frozen PE.
+
+	VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
+		PE or query PE sharing mode.
+
+	VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
+
+	VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
+		error recovering.
+
+	VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
+		one of the major steps for error recoverying.
 
 The code flow from the example above should be slightly changed:
 
@@ -346,6 +363,75 @@  The code flow from the example above should be slightly changed:
 	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
 	.....
 
+Based on the initial example we have, the following piece of code could be
+reference for EEH setup and error handling:
+
+	struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
+	struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
+	struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
+	struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
+	struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
+
+	....
+
+	/* Get a file descriptor for the device */
+	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
+
+	/* Enable the EEH functionality on the device */
+	option.option = EEH_OPT_ENABLE;
+	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
+
+	/* Retrieve PE address and create and maintain PE by yourself */
+	addr.option = EEH_OPT_GET_PE_ADDR;
+	ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
+
+	/* Assure EEH is supported on the PE and make PE functional */
+	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
+
+	/* Save device's state. pci_save_state() would be good enough
+	 * as an example.
+	 */
+
+	/* Test and setup the device */
+	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
+
+	....
+
+	/* When 0xFF's returned from reading PCI config space or IO BARs
+	 * of the PCI device. Check the PE state to see if that has been
+	 * frozen.
+	 */
+	ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
+
+	/* Waiting for pending PCI transactions to be completed and don't
+	 * produce any more PCI traffic from/to the affected PE until
+	 * recovery is finished.
+	 */
+
+	/* Enable IO for the affected PE and collect logs. Usually, the
+	 * standard part of PCI config space, AER registers are dumped
+	 * as logs for further analysis.
+	 */
+	option.option = EEH_OPT_THAW_MMIO;
+	ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
+
+	/* Issue PE reset */
+	reset.option = EEH_RESET_HOT;
+	ioctl(device, VFIO_EEH_PE_RESET, &reset);
+
+	/* Configure the PCI bridges for the affected PE */
+	ioctl(device, VFIO_EEH_PE_CONFIGURE, NULL);
+
+	/* Restored state we saved at initialization time. pci_restore_state()
+	 * is good enough as an example.
+	 */
+
+	/* Hopefully, error is recovered successfully. Now, you can resume to
+	 * start PCI traffic to/from the affected PE.
+	 */
+
+	....
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 34a2d83..dd5f1cf 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -191,6 +191,11 @@  enum {
 #define EEH_OPT_ENABLE		1	/* EEH enable	*/
 #define EEH_OPT_THAW_MMIO	2	/* MMIO enable	*/
 #define EEH_OPT_THAW_DMA	3	/* DMA enable	*/
+#define EEH_OPT_GET_PE_ADDR	0	/* Get PE addr	*/
+#define EEH_OPT_GET_PE_MODE	1	/* Get PE mode	*/
+#define EEH_PE_MODE_NONE	0	/* Invalid mode	*/
+#define EEH_PE_MODE_NOT_SHARED	1	/* Not shared	*/
+#define EEH_PE_MODE_SHARED	2	/* Shared mode	*/
 #define EEH_STATE_UNAVAILABLE	(1 << 0)	/* State unavailable	*/
 #define EEH_STATE_NOT_SUPPORT	(1 << 1)	/* EEH not supported	*/
 #define EEH_STATE_RESET_ACTIVE	(1 << 2)	/* Active reset		*/
@@ -198,6 +203,11 @@  enum {
 #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA		*/
 #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled		*/
 #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled		*/
+#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
+#define EEH_PE_STATE_RESET		1		/* PE reset	*/
+#define EEH_PE_STATE_STOPPED_IO_DMA	2		/* Stopped	*/
+#define EEH_PE_STATE_STOPPED_DMA	4		/* Stopped DMA	*/
+#define EEH_PE_STATE_UNAVAIL		5		/* Unavailable	*/
 #define EEH_RESET_DEACTIVATE	0	/* Deactivate the PE reset	*/
 #define EEH_RESET_HOT		1	/* Hot reset			*/
 #define EEH_RESET_FUNDAMENTAL	3	/* Fundamental reset		*/
@@ -305,6 +315,13 @@  void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
+int eeh_dev_open(struct pci_dev *pdev);
+void eeh_dev_release(struct pci_dev *pdev);
+int eeh_pe_set_option(struct pci_dev *pdev, int option);
+int eeh_pe_get_addr(struct pci_dev *pdev, int option);
+int eeh_pe_get_state(struct pci_dev *pdev);
+int eeh_pe_reset(struct pci_dev *pdev, int option);
+int eeh_pe_configure(struct pci_dev *pdev);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..b90a474 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -108,6 +108,9 @@  struct eeh_ops *eeh_ops = NULL;
 /* Lock to avoid races due to multiple reports of an error */
 DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
+/* Lock to protect passed flags */
+static DEFINE_MUTEX(eeh_dev_mutex);
+
 /* Buffer for reporting pci register dumps. Its here in BSS, and
  * not dynamically alloced, so that it ends up in RMO where RTAS
  * can access it.
@@ -1098,6 +1101,324 @@  void eeh_remove_device(struct pci_dev *dev)
 	edev->mode &= ~EEH_DEV_SYSFS;
 }
 
+/**
+ * eeh_dev_open - Mark EEH device and PE as passed through
+ * @pdev: PCI device
+ *
+ * Mark the indicated EEH device and PE as passed through.
+ * In the result, the EEH errors detected on the PE won't be
+ * reported. The owner of the device will be responsible for
+ * detection and recovery.
+ */
+int eeh_dev_open(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev;
+
+	mutex_lock(&eeh_dev_mutex);
+
+	/* No PCI device ? */
+	if (!pdev) {
+		mutex_unlock(&eeh_dev_mutex);
+		return -ENODEV;
+	}
+
+	/* No EEH device ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !edev->pe) {
+		mutex_unlock(&eeh_dev_mutex);
+		return -ENODEV;
+	}
+
+	eeh_dev_set_passed(edev, true);
+	eeh_pe_set_passed(edev->pe, true);
+	mutex_unlock(&eeh_dev_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_dev_open);
+
+/**
+ * eeh_dev_release - Reclaim the ownership of EEH device
+ * @pdev: PCI device
+ *
+ * Reclaim ownership of EEH device, potentially the corresponding
+ * PE. In the result, the EEH errors detected on the PE will be
+ * reported and handled as usual.
+ */
+void eeh_dev_release(struct pci_dev *pdev)
+{
+	bool release_pe = true;
+	struct eeh_pe *pe = NULL;
+	struct eeh_dev *tmp, *edev;
+
+	mutex_lock(&eeh_dev_mutex);
+
+	/* No PCI device ? */
+	if (!pdev) {
+		mutex_unlock(&eeh_dev_mutex);
+		return;
+	}
+
+	/* No EEH device ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !eeh_dev_passed(edev) ||
+	    !edev->pe || !eeh_pe_passed(pe)) {
+		mutex_unlock(&eeh_dev_mutex);
+		return;
+	}
+
+	/* Release device */
+	pe = edev->pe;
+	eeh_dev_set_passed(edev, false);
+
+	/* Release PE */
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (eeh_dev_passed(edev)) {
+			release_pe = false;
+			break;
+		}
+	}
+
+	if (release_pe)
+		eeh_pe_set_passed(pe, false);
+
+	mutex_unlock(&eeh_dev_mutex);
+}
+EXPORT_SYMBOL(eeh_dev_release);
+
+static int eeh_dev_check(struct pci_dev *pdev,
+			 struct eeh_dev **pedev,
+			 struct eeh_pe **ppe)
+{
+	struct eeh_dev *edev;
+
+	/* No device ? */
+	if (!pdev)
+		return -ENODEV;
+
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !eeh_dev_passed(edev) ||
+	    !edev->pe || !eeh_pe_passed(edev->pe))
+		return -ENODEV;
+
+	if (pedev)
+		*pedev = edev;
+	if (ppe)
+		*ppe = edev->pe;
+
+	return 0;
+}
+
+/**
+ * eeh_pe_set_option - Set options for the indicated PE
+ * @pdev: PCI device
+ * @option: requested option
+ *
+ * The routine is called to enable or disable EEH functionality
+ * on the indicated PE, to enable IO or DMA for the frozen PE.
+ */
+int eeh_pe_set_option(struct pci_dev *pdev, int option)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_dev_check(pdev, &edev, &pe);
+	if (ret)
+		return ret;
+
+	switch (option) {
+	case EEH_OPT_DISABLE:
+	case EEH_OPT_ENABLE:
+		break;
+	case EEH_OPT_THAW_MMIO:
+	case EEH_OPT_THAW_DMA:
+		if (!eeh_ops || !eeh_ops->set_option) {
+			ret = -ENOENT;
+			break;
+		}
+
+		ret = eeh_ops->set_option(pe, option);
+		break;
+	default:
+		pr_debug("%s: Option %d out of range (%d, %d)\n",
+			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_set_option);
+
+/**
+ * eeh_pe_get_addr - Retrieve the PE address or sharing mode
+ * @pdev: PCI device
+ * @option: option
+ *
+ * Retrieve the PE address or sharing mode.
+ */
+int eeh_pe_get_addr(struct pci_dev *pdev, int option)
+{
+	struct pci_bus *bus;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_dev_check(pdev, &edev, &pe);
+	if (ret)
+		return ret;
+
+	switch (option) {
+	case EEH_OPT_GET_PE_ADDR:
+		bus = eeh_pe_bus_get(pe);
+		if (!bus) {
+			ret = -ENODEV;
+			break;
+		}
+
+		/* PE address has format "00BBSS00" */
+		ret = bus->number << 16;
+		break;
+	case EEH_OPT_GET_PE_MODE:
+		/* Wa always have shared PE */
+		ret = EEH_PE_MODE_SHARED;
+		break;
+	default:
+		pr_debug("%s: option %d out of range (0, 1)\n",
+			__func__, option);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_get_addr);
+
+/**
+ * eeh_pe_get_state - Retrieve PE's state
+ * @pdev: PCI device
+ *
+ * Retrieve the PE's state, which includes 3 aspects: enabled
+ * DMA, enabled IO and asserted reset.
+ */
+int eeh_pe_get_state(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int result, ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_dev_check(pdev, &edev, &pe);
+	if (ret)
+		return ret;
+
+	if (!eeh_ops || !eeh_ops->get_state)
+		return -ENOENT;
+
+	result = eeh_ops->get_state(pe, NULL);
+	if (!(result & EEH_STATE_RESET_ACTIVE) &&
+	     (result & EEH_STATE_DMA_ENABLED) &&
+	     (result & EEH_STATE_MMIO_ENABLED))
+		ret = EEH_PE_STATE_NORMAL;
+	else if (result & EEH_STATE_RESET_ACTIVE)
+		ret = EEH_PE_STATE_RESET;
+	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
+		 !(result & EEH_STATE_DMA_ENABLED) &&
+		 !(result & EEH_STATE_MMIO_ENABLED))
+		ret = EEH_PE_STATE_STOPPED_IO_DMA;
+	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
+		 (result & EEH_STATE_DMA_ENABLED) &&
+		 !(result & EEH_STATE_MMIO_ENABLED))
+		ret = EEH_PE_STATE_STOPPED_DMA;
+	else
+		ret = EEH_PE_STATE_UNAVAIL;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_get_state);
+
+/**
+ * eeh_pe_reset - Issue PE reset according to specified type
+ * @pdev: PCI device
+ * @option: reset type
+ *
+ * The routine is called to reset the specified PE with the
+ * indicated type, either fundamental reset or hot reset.
+ * PE reset is the most important part for error recovery.
+ */
+int eeh_pe_reset(struct pci_dev *pdev, int option)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_dev_check(pdev, &edev, &pe);
+	if (ret)
+		return ret;
+
+	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
+		return -ENOENT;
+
+	switch (option) {
+	case EEH_RESET_DEACTIVATE:
+		ret = eeh_ops->reset(pe, option);
+		if (ret)
+			break;
+
+		/*
+		 * The PE is still in frozen state and we need clear
+		 * that. It's good to clear frozen state after deassert
+		 * to avoid messy IO access during reset, which might
+		 * cause recursive frozen PE.
+		 */
+		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
+		if (!ret)
+			ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
+		if (!ret)
+			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+		break;
+	case EEH_RESET_HOT:
+	case EEH_RESET_FUNDAMENTAL:
+		ret = eeh_ops->reset(pe, option);
+		break;
+	default:
+		pr_debug("%s: Unsupported option %d\n",
+			__func__, option);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_reset);
+
+/**
+ * eeh_pe_configure - Configure PCI bridges after PE reset
+ * @pdev: PCI device
+ *
+ * The routine is called to restore the PCI config space for
+ * those PCI devices, especially PCI bridges affected by PE
+ * reset issued previously.
+ */
+int eeh_pe_configure(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_dev_check(pdev, &edev, &pe);
+	if (ret)
+		return ret;
+
+	/* Restore config space for the affected devices */
+	eeh_pe_restore_bars(pe);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_configure);
+
 static int proc_eeh_show(struct seq_file *m, void *v)
 {
 	if (!eeh_enabled()) {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7ba0424..301ac18 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -25,6 +25,9 @@ 
 #include <linux/types.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
+#ifdef CONFIG_EEH
+#include <asm/eeh.h>
+#endif
 
 #include "vfio_pci_private.h"
 
@@ -152,32 +155,57 @@  static void vfio_pci_disable(struct vfio_pci_device *vdev)
 	pci_restore_state(pdev);
 }
 
+static void vfio_eeh_pci_release(struct pci_dev *pdev)
+{
+#ifdef CONFIG_EEH
+	eeh_dev_release(pdev);
+#endif
+}
+
 static void vfio_pci_release(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
 
-	if (atomic_dec_and_test(&vdev->refcnt))
+	if (atomic_dec_and_test(&vdev->refcnt)) {
+		vfio_eeh_pci_release(vdev->pdev);
 		vfio_pci_disable(vdev);
+	}
 
 	module_put(THIS_MODULE);
 }
 
+static int vfio_eeh_pci_open(struct pci_dev *pdev)
+{
+	int ret = 0;
+
+#ifdef CONFIG_EEH
+	ret = eeh_dev_open(pdev);
+#endif
+	return ret;
+}
+
 static int vfio_pci_open(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
+	int ret;
 
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
 	if (atomic_inc_return(&vdev->refcnt) == 1) {
-		int ret = vfio_pci_enable(vdev);
-		if (ret) {
-			module_put(THIS_MODULE);
-			return ret;
-		}
+		ret = vfio_pci_enable(vdev);
+		if (ret)
+			goto error;
+
+		ret = vfio_eeh_pci_open(vdev->pdev);
+		if (ret)
+			goto error;
 	}
 
 	return 0;
+error:
+	module_put(THIS_MODULE);
+	return ret;
 }
 
 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
@@ -321,6 +349,91 @@  static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
 	return walk.ret;
 }
 
+static int vfio_eeh_pci_ioctl(struct pci_dev *pdev,
+			      unsigned int cmd,
+			      unsigned long arg)
+{
+	unsigned long minsz;
+	int ret = 0;
+
+#ifdef CONFIG_EEH
+	switch (cmd) {
+	case VFIO_EEH_PE_SET_OPTION: {
+		struct vfio_eeh_pe_set_option option;
+
+		minsz = offsetofend(struct vfio_eeh_pe_set_option, option);
+		if (copy_from_user(&option, (void __user *)arg, minsz))
+			return -EFAULT;
+		if (option.argsz < minsz)
+			return -EINVAL;
+
+		ret = eeh_pe_set_option(pdev, option.option);
+		break;
+	}
+	case VFIO_EEH_PE_GET_ADDR: {
+		struct vfio_eeh_pe_get_addr addr;
+
+		minsz = offsetofend(struct vfio_eeh_pe_get_addr, info);
+		if (copy_from_user(&addr, (void __user *)arg, minsz))
+			return -EFAULT;
+		if (addr.argsz < minsz)
+			return -EINVAL;
+
+		ret = eeh_pe_get_addr(pdev, addr.option);
+		if (ret >= 0) {
+			addr.info = ret;
+			if (copy_to_user((void __user *)arg, &addr, minsz))
+				return -EFAULT;
+			ret = 0;
+		}
+
+		break;
+	}
+	case VFIO_EEH_PE_GET_STATE: {
+		struct vfio_eeh_pe_get_state state;
+
+		minsz = offsetofend(struct vfio_eeh_pe_get_state, state);
+		if (copy_from_user(&state, (void __user *)arg, minsz))
+			return -EFAULT;
+		if (state.argsz < minsz)
+			return -EINVAL;
+
+		ret = eeh_pe_get_state(pdev);
+		if (ret >= 0) {
+			state.state = ret;
+			if (copy_to_user((void __user *)arg, &state, minsz))
+				return -EFAULT;
+			ret = 0;
+		}
+		break;
+	}
+	case VFIO_EEH_PE_RESET: {
+		struct vfio_eeh_pe_reset reset;
+
+		minsz = offsetofend(struct vfio_eeh_pe_reset, option);
+		if (copy_from_user(&reset, (void __user *)arg, minsz))
+			return -EFAULT;
+		if (reset.argsz < minsz)
+			return -EINVAL;
+
+		ret = eeh_pe_reset(pdev, reset.option);
+		break;
+	}
+	case VFIO_EEH_PE_CONFIGURE:
+		ret = eeh_pe_configure(pdev);
+		break;
+	default:
+		ret = -EINVAL;
+		pr_debug("%s: Cannot handle command %d\n",
+			__func__, cmd);
+	}
+#else
+	ret = -ENOENT;
+#endif
+
+	return ret;
+}
+
 static long vfio_pci_ioctl(void *device_data,
 			   unsigned int cmd, unsigned long arg)
 {
@@ -682,6 +795,12 @@  hot_reset_release:
 
 		kfree(groups);
 		return ret;
+	} else if (cmd == VFIO_EEH_PE_SET_OPTION ||
+		   cmd == VFIO_EEH_PE_GET_ADDR ||
+		   cmd == VFIO_EEH_PE_GET_STATE ||
+		   cmd == VFIO_EEH_PE_RESET ||
+		   cmd == VFIO_EEH_PE_CONFIGURE) {
+			return vfio_eeh_pci_ioctl(vdev->pdev, cmd, arg);
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index cb9023d..ef55682 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -455,6 +455,59 @@  struct vfio_iommu_spapr_tce_info {
 
 #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
 
+/*
+ * EEH functionality can be enabled or disabled on one specific device.
+ * Also, the DMA or IO frozen state can be removed from the frozen PE
+ * if required.
+ */
+struct vfio_eeh_pe_set_option {
+	__u32 argsz;
+	__u32 option;
+};
+
+#define VFIO_EEH_PE_SET_OPTION		_IO(VFIO_TYPE, VFIO_BASE + 21)
+
+/*
+ * Each EEH PE should have unique address to be identified. The command
+ * helps to retrieve the address and the sharing mode of the PE.
+ */
+struct vfio_eeh_pe_get_addr {
+	__u32 argsz;
+	__u32 option;
+	__u32 info;
+};
+
+#define VFIO_EEH_PE_GET_ADDR		_IO(VFIO_TYPE, VFIO_BASE + 22)
+
+/*
+ * EEH PE might have been frozen because of PCI errors. Also, it might
+ * be experiencing reset for error revoery. The following command helps
+ * to get the state.
+ */
+struct vfio_eeh_pe_get_state {
+	__u32 argsz;
+	__u32 state;
+};
+
+#define VFIO_EEH_PE_GET_STATE		_IO(VFIO_TYPE, VFIO_BASE + 23)
+
+/*
+ * Reset is the major step to recover problematic PE. The following
+ * command helps on that.
+ */
+struct vfio_eeh_pe_reset {
+	__u32 argsz;
+	__u32 option;
+};
+
+#define VFIO_EEH_PE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 24)
+
+/*
+ * One of the steps for recovery after PE reset is to configure the
+ * PCI bridges affected by the PE reset.
+ */
+#define VFIO_EEH_PE_CONFIGURE		_IO(VFIO_TYPE, VFIO_BASE + 25)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */