diff mbox

[v5,3/4] drivers/vfio: EEH support for VFIO PCI device

Message ID 1400648623-9127-4-git-send-email-gwshan@linux.vnet.ibm.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Gavin Shan May 21, 2014, 5:03 a.m. UTC
The patch adds new IOCTL command VFIO_EEH_OP to VFIO PCI device
to support EEH functionality for PCI devices, which have been
passed from host to guest via VFIO.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 Documentation/vfio.txt         |   6 +-
 arch/powerpc/include/asm/eeh.h |  10 ++
 arch/powerpc/kernel/eeh.c      | 323 +++++++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci.c    |  99 ++++++++++++-
 include/uapi/linux/vfio.h      |  43 ++++++
 5 files changed, 474 insertions(+), 7 deletions(-)

Comments

Alexander Graf May 21, 2014, 1:07 p.m. UTC | #1
On 21.05.14 07:03, Gavin Shan wrote:
> The patch adds new IOCTL command VFIO_EEH_OP to VFIO PCI device
> to support EEH functionality for PCI devices, which have been
> passed from host to guest via VFIO.
>
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> ---
>   Documentation/vfio.txt         |   6 +-
>   arch/powerpc/include/asm/eeh.h |  10 ++
>   arch/powerpc/kernel/eeh.c      | 323 +++++++++++++++++++++++++++++++++++++++++
>   drivers/vfio/pci/vfio_pci.c    |  99 ++++++++++++-
>   include/uapi/linux/vfio.h      |  43 ++++++
>   5 files changed, 474 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..bb17ec7 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
>   an excellent performance which has limitations such as inability to do
>   locked pages accounting in real time.
>   
> -So 3 additional ioctls have been added:
> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
> +which works on the basis of additional ioctl command VFIO_EEH_OP.
> +
> +So 4 additional ioctls have been added:
>   
>   	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>   		of the DMA window on the PCI bus.
> @@ -316,6 +319,7 @@ So 3 additional ioctls have been added:
>   
>   	VFIO_IOMMU_DISABLE - disables the container.
>   
> +	VFIO_EEH_OP - EEH dependent operations

Please document exactly what the ioctl does. In an ideal world, a VFIO 
user will just look at the documentation and be able to write a program 
against the API with it.

>   
>   The code flow from the example above should be slightly changed:
>   
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 34a2d83..93922ef 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -305,6 +305,16 @@ void eeh_add_device_late(struct pci_dev *);
>   void eeh_add_device_tree_late(struct pci_bus *);
>   void eeh_add_sysfs_files(struct pci_bus *);
>   void eeh_remove_device(struct pci_dev *);
> +#ifdef CONFIG_VFIO_PCI_EEH
> +int eeh_vfio_open(struct pci_dev *pdev);
> +void eeh_vfio_release(struct pci_dev *pdev);
> +int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval);
> +int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
> +			 int *retval, int *info);
> +int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state);
> +int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval);
> +int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval);
> +#endif
>   
>   /**
>    * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 9c6b899..2aaf90e 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1098,6 +1098,329 @@ void eeh_remove_device(struct pci_dev *dev)
>   	edev->mode &= ~EEH_DEV_SYSFS;
>   }
>   
> +#ifdef CONFIG_VFIO_PCI_EEH
> +int eeh_vfio_open(struct pci_dev *pdev)

Why vfio? Also that config option will not be set if vfio is compiled as 
a module.

> +{
> +	struct eeh_dev *edev;
> +
> +	/* No PCI device ? */
> +	if (!pdev)
> +		return -ENODEV;
> +
> +	/* No EEH device ? */
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !edev->pe)
> +		return -ENODEV;
> +
> +	eeh_dev_set_passed(edev, true);
> +	eeh_pe_set_passed(edev->pe, true);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_open);
> +
> +void eeh_vfio_release(struct pci_dev *pdev)
> +{
> +	bool release_pe = true;
> +	struct eeh_pe *pe = NULL;
> +	struct eeh_dev *tmp, *edev;
> +
> +	/* No PCI device ? */
> +	if (!pdev)
> +		return;
> +
> +	/* No EEH device ? */
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !eeh_dev_passed(edev) ||
> +	    !edev->pe || !eeh_pe_passed(pe))
> +		return;
> +
> +	/* Release device */
> +	pe = edev->pe;
> +	eeh_dev_set_passed(edev, false);
> +
> +	/* Release PE */
> +	eeh_pe_for_each_dev(pe, edev, tmp) {
> +		if (eeh_dev_passed(edev)) {
> +			release_pe = false;
> +			break;
> +		}
> +	}
> +
> +	if (release_pe)
> +		eeh_pe_set_passed(pe, false);
> +}
> +EXPORT_SYMBOL(eeh_vfio_release);
> +
> +static int eeh_vfio_check_dev(struct pci_dev *pdev,
> +			      struct eeh_dev **pedev,
> +			      struct eeh_pe **ppe)
> +{
> +	struct eeh_dev *edev;
> +
> +	/* No device ? */
> +	if (!pdev)
> +		return -ENODEV;
> +
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !eeh_dev_passed(edev) ||
> +	    !edev->pe || !eeh_pe_passed(edev->pe))
> +		return -ENODEV;
> +
> +	if (pedev)
> +		*pedev = edev;
> +	if (ppe)
> +		*ppe = edev->pe;
> +
> +	return 0;
> +}
> +
> +int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		pr_debug("%s: Cannot find device %s\n",
> +			__func__, pdev ? pci_name(pdev) : "NULL");
> +		*retval = -7;

What are these? Please use proper kernel internal return values for 
errors. I don't want to see anything even remotely tied to RTAS in any 
of these patches.

> +		goto out;
> +	}
> +
> +	/* Invalid option ? */
> +	if (option < EEH_OPT_DISABLE ||
> +	    option > EEH_OPT_THAW_DMA) {

This is quite confusing to read because it's not obvious what is in 
between these. Just make this a switch() statement that lists the 
allowed options. Gcc will be smart enough to optimize that into a bounds 
check.

> +		pr_debug("%s: Option %d out of range (%d, %d)\n",
> +			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
> +		*retval = -3;
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (option == EEH_OPT_DISABLE ||
> +	    option == EEH_OPT_ENABLE) {
> +		*retval = 0;
> +	} else {
> +		if (!eeh_ops || !eeh_ops->set_option) {
> +			*retval = -7;
> +			ret = -ENOENT;
> +			goto out;
> +		}
> +
> +		ret = eeh_ops->set_option(pe, option);
> +		if (ret) {
> +			pr_debug("%s: Failure %d from backend\n",
> +				__func__, ret);
> +			*retval = -1;
> +			goto out;
> +		}
> +
> +		*retval = 0;
> +	}
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_set_pe_option);
> +
> +int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
> +			 int *retval, int *info)
> +{
> +	struct pci_bus *bus;
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	/* Invalid option ? */
> +	if (option != 0 && option != 1) {

0? 1? What? Don't these have names? And again, please use a switch() for 
this function.

> +		pr_debug("%s: option %d out of range (0, 1)\n",
> +			__func__, option);
> +		*retval = -3;
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Fill result according to option. We don't differentiate
> +	 * PCI bus and device dependent PE here. So all PEs are
> +	 * built in "shared" mode. Also, the PE address has the format
> +	 * of "00BBSS00".
> +	 */
> +	if (option == 0) {
> +		bus = eeh_pe_bus_get(pe);
> +		if (!bus) {
> +			*retval = -3;
> +			ret = -ENODEV;
> +			goto out;
> +		}
> +
> +		*retval = 0;
> +		*info = bus->number << 16;

How about positive numbers for the number and negative ones for errors?

> +	} else {
> +		*retval = 0;
> +		*info = 1;
> +	}
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_addr);
> +
> +int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int result, ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	if (!eeh_ops || !eeh_ops->get_state) {
> +		pr_debug("%s: Unsupported request\n",
> +			__func__);
> +		ret = -ENOENT;
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	result = eeh_ops->get_state(pe, NULL);
> +	if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +	     (result & EEH_STATE_DMA_ENABLED) &&
> +	     (result & EEH_STATE_MMIO_ENABLED))
> +		*state = 0;
> +	else if (result & EEH_STATE_RESET_ACTIVE)
> +		*state = 1;
> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +		 !(result & EEH_STATE_DMA_ENABLED) &&
> +		 !(result & EEH_STATE_MMIO_ENABLED))
> +		*state = 2;
> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +		 (result & EEH_STATE_DMA_ENABLED) &&
> +		 !(result & EEH_STATE_MMIO_ENABLED))
> +		*state = 4;
> +	else
> +		*state = 5;

What are these numbers?

> +
> +	*retval = 0;
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_state);
> +
> +int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	/* Invalid option ? */
> +	if (option != EEH_RESET_DEACTIVATE &&
> +	    option != EEH_RESET_HOT &&
> +	    option != EEH_RESET_FUNDAMENTAL) {
> +		pr_debug("%s: Unsupported option %d\n",
> +			__func__, option);
> +		ret = -EINVAL;
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset) {
> +		pr_debug("%s: Unsupported request\n",
> +			__func__);
> +		ret = -ENOENT;
> +		*retval = -7;
> +		goto out;
> +	}
> +
> +	ret = eeh_ops->reset(pe, option);
> +	if (ret) {
> +		pr_debug("%s: Failure %d from backend\n",
> +			 __func__, ret);
> +		*retval = -1;
> +		goto out;
> +	}
> +
> +	/*
> +	 * The PE is still in frozen state and we need clear that.
> +	 * It's good to clear frozen state after deassert to avoid
> +	 * messy IO access during reset, which might cause recrusive

recursive

> +	 * frozen PE.
> +	 */
> +	if (option == EEH_RESET_DEACTIVATE) {
> +		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
> +		if (ret) {
> +			pr_debug("%s: Cannot enable IO for PHB#%d-PE#%d (%d)\n",
> +				__func__, pe->phb->global_number, pe->addr, ret);
> +			*retval = -1;
> +			goto out;
> +		}
> +
> +		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
> +		if (ret) {
> +			pr_debug("%s: Cannot enable DMA for PHB#%d-PE#%d (%d)\n",
> +				__func__, pe->phb->global_number, pe->addr, ret);
> +			*retval = -1;
> +			goto out;
> +		}
> +
> +		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
> +	}
> +
> +	*retval = 0;
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_reset_pe);
> +
> +int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	/*
> +	 * The access to PCI config space on VFIO device has some
> +	 * limitations. Part of PCI config space, including BAR
> +	 * registers are not readable and writable. So the guest
> +	 * should have stale values for those registers and we have
> +	 * to restore them in host side.

I don't understand this comment. When is "configure_pe" called in the 
first place? Please provide proper function descriptions for each of 
these exported functions that tell someone who may want to use them what 
they do.

Also, don't mention VFIO or guests in any function inside this file.

> +	 */
> +	eeh_pe_restore_bars(pe);
> +	*retval = 0;
> +
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_configure_pe);
> +
> +#endif /* CONFIG_VFIO_PCI_EEH */
> +
>   static int proc_eeh_show(struct seq_file *m, void *v)
>   {
>   	if (!eeh_enabled()) {
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 7ba0424..05c3dde 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -25,6 +25,9 @@
>   #include <linux/types.h>
>   #include <linux/uaccess.h>
>   #include <linux/vfio.h>
> +#ifdef CONFIG_VFIO_PCI_EEH
> +#include <asm/eeh.h>
> +#endif
>   
>   #include "vfio_pci_private.h"
>   
> @@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>   	pci_restore_state(pdev);
>   }
>   
> +static void vfio_eeh_pci_release(struct pci_dev *pdev)
> +{
> +#ifdef CONFIG_VFIO_PCI_EEH
> +	eeh_vfio_release(pdev);
> +#endif
> +}
> +
>   static void vfio_pci_release(void *device_data)
>   {
>   	struct vfio_pci_device *vdev = device_data;
>   
> -	if (atomic_dec_and_test(&vdev->refcnt))
> +	if (atomic_dec_and_test(&vdev->refcnt)) {
> +		vfio_eeh_pci_release(vdev->pdev);
>   		vfio_pci_disable(vdev);
> +	}
>   
>   	module_put(THIS_MODULE);
>   }
>   
> +static int vfio_eeh_pci_open(struct pci_dev *pdev)
> +{
> +	int ret = 0;
> +
> +#ifdef CONFIG_VFIO_PCI_EEH
> +	ret = eeh_vfio_open(pdev);
> +#endif
> +	return ret;
> +}
> +
>   static int vfio_pci_open(void *device_data)
>   {
>   	struct vfio_pci_device *vdev = device_data;
> +	int ret;
>   
>   	if (!try_module_get(THIS_MODULE))
>   		return -ENODEV;
>   
>   	if (atomic_inc_return(&vdev->refcnt) == 1) {
> -		int ret = vfio_pci_enable(vdev);
> -		if (ret) {
> -			module_put(THIS_MODULE);
> -			return ret;
> -		}
> +		ret = vfio_pci_enable(vdev);
> +		if (ret)
> +			goto error;
> +
> +		ret = vfio_eeh_pci_open(vdev->pdev);
> +		if (ret)
> +			goto error;
>   	}
>   
>   	return 0;
> +error:
> +	module_put(THIS_MODULE);
> +	return ret;
>   }
>   
>   static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> @@ -321,6 +349,51 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>   	return walk.ret;
>   }
>   
> +static int vfio_eeh_pci_ioctl(struct pci_dev *pdev, struct vfio_eeh_op *info)

I still don't like the idea of that multiplexing ioctl. I don't see any 
benefit in it whatsoever. Just create 5 individual ioctls with their own 
simple interfaces.

Also, this interface has nothing to do with RTAS. So don't sneak in RTAS 
error numbers anywhere ;). It's QEMU's task to convert from kernel error 
codes to RTAS error codes.


Alex

> +{
> +	int ret = 0;
> +
> +#ifdef CONFIG_VFIO_PCI_EEH
> +	switch (info->op) {
> +	case VFIO_EEH_OP_SET_OPTION:
> +		ret = eeh_vfio_set_pe_option(pdev,
> +					     info->option.option,
> +					     &info->option.ret);
> +		break;
> +	case VFIO_EEH_OP_GET_ADDR:
> +		ret = eeh_vfio_get_pe_addr(pdev,
> +					   info->addr.option,
> +					   &info->addr.ret,
> +					   &info->addr.info);
> +		break;
> +	case VFIO_EEH_OP_GET_STATE:
> +		ret = eeh_vfio_get_pe_state(pdev,
> +					    &info->state.ret,
> +					    &info->state.reset_state);
> +		info->state.cfg_cap = 1;
> +		info->state.pe_unavail_info = 1000;
> +		info->state.pe_recovery_info = 0;
> +		break;
> +	case VFIO_EEH_OP_PE_RESET:
> +		ret = eeh_vfio_reset_pe(pdev,
> +					info->reset.option,
> +					&info->reset.ret);
> +		break;
> +	case VFIO_EEH_OP_PE_CONFIG:
> +		ret = eeh_vfio_configure_pe(pdev,
> +					    &info->config.ret);
> +	default:
> +		ret = -EINVAL;
> +		pr_debug("%s: Cannot handle op#%d\n",
> +			__func__, info->op);
> +	}
> +#else
> +	ret = -ENOENT;
> +#endif
> +
> +	return ret;
> +}
> +
>   static long vfio_pci_ioctl(void *device_data,
>   			   unsigned int cmd, unsigned long arg)
>   {
> @@ -682,6 +755,20 @@ hot_reset_release:
>   
>   		kfree(groups);
>   		return ret;
> +	} else if (cmd == VFIO_EEH_OP) {
> +		struct vfio_eeh_op info;
> +		int ret = 0;
> +
> +		minsz = sizeof(info);
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = vfio_eeh_pci_ioctl(vdev->pdev, &info);
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			ret = -EFAULT;
> +		return ret;
>   	}
>   
>   	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index cb9023d..518961d 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -455,6 +455,49 @@ struct vfio_iommu_spapr_tce_info {
>   
>   #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>   
> +/*
> + * The VFIO operation struct provides way to support EEH functionality
> + * for PCI device that is passed from host to guest via VFIO.
> + */
> +#define VFIO_EEH_OP_SET_OPTION	0
> +#define VFIO_EEH_OP_GET_ADDR	1
> +#define VFIO_EEH_OP_GET_STATE	2
> +#define VFIO_EEH_OP_PE_RESET	3
> +#define VFIO_EEH_OP_PE_CONFIG	4
> +
> +struct vfio_eeh_op {
> +	__u32 argsz;
> +	__u32 op;
> +
> +	union {
> +		struct vfio_eeh_set_option {
> +			__u32 option;
> +			__s32 ret;
> +		} option;
> +		struct vfio_eeh_pe_addr {
> +			__u32 option;
> +			__s32 ret;
> +			__u32 info;
> +		} addr;
> +		struct vfio_eeh_pe_state {
> +			__s32 ret;
> +			__u32 reset_state;
> +			__u32 cfg_cap;
> +			__u32 pe_unavail_info;
> +			__u32 pe_recovery_info;
> +		} state;
> +		struct vfio_eeh_reset {
> +			__u32 option;
> +			__s32 ret;
> +		} reset;
> +		struct vfio_eeh_config {
> +			__s32 ret;
> +		} config;
> +	};
> +};
> +
> +#define VFIO_EEH_OP	_IO(VFIO_TYPE, VFIO_BASE + 21)
> +
>   /* ***************************************************************** */
>   
>   #endif /* _UAPIVFIO_H */
Benjamin Herrenschmidt May 21, 2014, 9:56 p.m. UTC | #2
On Wed, 2014-05-21 at 15:07 +0200, Alexander Graf wrote:

> > +#ifdef CONFIG_VFIO_PCI_EEH
> > +int eeh_vfio_open(struct pci_dev *pdev)
> 
> Why vfio? Also that config option will not be set if vfio is compiled as 
> a module.
> 
> > +{
> > +	struct eeh_dev *edev;
> > +
> > +	/* No PCI device ? */
> > +	if (!pdev)
> > +		return -ENODEV;
> > +
> > +	/* No EEH device ? */
> > +	edev = pci_dev_to_eeh_dev(pdev);
> > +	if (!edev || !edev->pe)
> > +		return -ENODEV;
> > +
> > +	eeh_dev_set_passed(edev, true);
> > +	eeh_pe_set_passed(edev->pe, true);
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(eeh_vfio_open);

Additionally, shouldn't we have some locking here ? (and in release too)

I don't like relying on the caller locking (if it does it at all).

> > +	/* Device existing ? */
> > +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> > +	if (ret) {
> > +		pr_debug("%s: Cannot find device %s\n",
> > +			__func__, pdev ? pci_name(pdev) : "NULL");
> > +		*retval = -7;
> 
> What are these? Please use proper kernel internal return values for 
> errors. I don't want to see anything even remotely tied to RTAS in any 
> of these patches.

Hint: -ENODEV

Cheers,
Ben.
Gavin Shan May 21, 2014, 11:48 p.m. UTC | #3
On Wed, May 21, 2014 at 03:07:26PM +0200, Alexander Graf wrote:
>
>On 21.05.14 07:03, Gavin Shan wrote:
>>The patch adds new IOCTL command VFIO_EEH_OP to VFIO PCI device
>>to support EEH functionality for PCI devices, which have been
>>passed from host to guest via VFIO.
>>
>>Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>---
>>  Documentation/vfio.txt         |   6 +-
>>  arch/powerpc/include/asm/eeh.h |  10 ++
>>  arch/powerpc/kernel/eeh.c      | 323 +++++++++++++++++++++++++++++++++++++++++
>>  drivers/vfio/pci/vfio_pci.c    |  99 ++++++++++++-
>>  include/uapi/linux/vfio.h      |  43 ++++++
>>  5 files changed, 474 insertions(+), 7 deletions(-)
>>
>>diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>>index b9ca023..bb17ec7 100644
>>--- a/Documentation/vfio.txt
>>+++ b/Documentation/vfio.txt
>>@@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
>>  an excellent performance which has limitations such as inability to do
>>  locked pages accounting in real time.
>>-So 3 additional ioctls have been added:
>>+4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
>>+which works on the basis of additional ioctl command VFIO_EEH_OP.
>>+
>>+So 4 additional ioctls have been added:
>>  	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>>  		of the DMA window on the PCI bus.
>>@@ -316,6 +319,7 @@ So 3 additional ioctls have been added:
>>  	VFIO_IOMMU_DISABLE - disables the container.
>>+	VFIO_EEH_OP - EEH dependent operations
>
>Please document exactly what the ioctl does. In an ideal world, a
>VFIO user will just look at the documentation and be able to write a
>program against the API with it.
>

Ok. I'll amend it. Also, I'll split it to 5 ioctl commands in next revision.

>>  The code flow from the example above should be slightly changed:
>>diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>>index 34a2d83..93922ef 100644
>>--- a/arch/powerpc/include/asm/eeh.h
>>+++ b/arch/powerpc/include/asm/eeh.h
>>@@ -305,6 +305,16 @@ void eeh_add_device_late(struct pci_dev *);
>>  void eeh_add_device_tree_late(struct pci_bus *);
>>  void eeh_add_sysfs_files(struct pci_bus *);
>>  void eeh_remove_device(struct pci_dev *);
>>+#ifdef CONFIG_VFIO_PCI_EEH
>>+int eeh_vfio_open(struct pci_dev *pdev);
>>+void eeh_vfio_release(struct pci_dev *pdev);
>>+int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval);
>>+int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
>>+			 int *retval, int *info);
>>+int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state);
>>+int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval);
>>+int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval);
>>+#endif
>>  /**
>>   * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
>>diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
>>index 9c6b899..2aaf90e 100644
>>--- a/arch/powerpc/kernel/eeh.c
>>+++ b/arch/powerpc/kernel/eeh.c
>>@@ -1098,6 +1098,329 @@ void eeh_remove_device(struct pci_dev *dev)
>>  	edev->mode &= ~EEH_DEV_SYSFS;
>>  }
>>+#ifdef CONFIG_VFIO_PCI_EEH
>>+int eeh_vfio_open(struct pci_dev *pdev)
>
>Why vfio? Also that config option will not be set if vfio is compiled
>as a module.
>

The interface is expected to be used by VFIO-PCI. I'll change the function
names to following ones in next revision. No "VFIO" will be seen :-)

eeh_dev_open();
eeh_dev_release();
static eeh_dev_check();
eeh_pe_set_option();
eeh_pe_get_addr();
eeh_pe_get_state();
eeh_pe_reset();
eeh_pe_configure(); 

Yeah, "#ifdef CONFIG_VFIO_PCI_EEH" can be removed safely in next revision.

>>+{
>>+	struct eeh_dev *edev;
>>+
>>+	/* No PCI device ? */
>>+	if (!pdev)
>>+		return -ENODEV;
>>+
>>+	/* No EEH device ? */
>>+	edev = pci_dev_to_eeh_dev(pdev);
>>+	if (!edev || !edev->pe)
>>+		return -ENODEV;
>>+
>>+	eeh_dev_set_passed(edev, true);
>>+	eeh_pe_set_passed(edev->pe, true);
>>+
>>+	return 0;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_vfio_open);
>>+
>>+void eeh_vfio_release(struct pci_dev *pdev)
>>+{
>>+	bool release_pe = true;
>>+	struct eeh_pe *pe = NULL;
>>+	struct eeh_dev *tmp, *edev;
>>+
>>+	/* No PCI device ? */
>>+	if (!pdev)
>>+		return;
>>+
>>+	/* No EEH device ? */
>>+	edev = pci_dev_to_eeh_dev(pdev);
>>+	if (!edev || !eeh_dev_passed(edev) ||
>>+	    !edev->pe || !eeh_pe_passed(pe))
>>+		return;
>>+
>>+	/* Release device */
>>+	pe = edev->pe;
>>+	eeh_dev_set_passed(edev, false);
>>+
>>+	/* Release PE */
>>+	eeh_pe_for_each_dev(pe, edev, tmp) {
>>+		if (eeh_dev_passed(edev)) {
>>+			release_pe = false;
>>+			break;
>>+		}
>>+	}
>>+
>>+	if (release_pe)
>>+		eeh_pe_set_passed(pe, false);
>>+}
>>+EXPORT_SYMBOL(eeh_vfio_release);
>>+
>>+static int eeh_vfio_check_dev(struct pci_dev *pdev,
>>+			      struct eeh_dev **pedev,
>>+			      struct eeh_pe **ppe)
>>+{
>>+	struct eeh_dev *edev;
>>+
>>+	/* No device ? */
>>+	if (!pdev)
>>+		return -ENODEV;
>>+
>>+	edev = pci_dev_to_eeh_dev(pdev);
>>+	if (!edev || !eeh_dev_passed(edev) ||
>>+	    !edev->pe || !eeh_pe_passed(edev->pe))
>>+		return -ENODEV;
>>+
>>+	if (pedev)
>>+		*pedev = edev;
>>+	if (ppe)
>>+		*ppe = edev->pe;
>>+
>>+	return 0;
>>+}
>>+
>>+int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval)
>>+{
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
>>+	if (ret) {
>>+		pr_debug("%s: Cannot find device %s\n",
>>+			__func__, pdev ? pci_name(pdev) : "NULL");
>>+		*retval = -7;
>
>What are these? Please use proper kernel internal return values for
>errors. I don't want to see anything even remotely tied to RTAS in
>any of these patches.
>

It's the return value for RTAS call "ibm,set-eeh-option". Yeah, it
makes sense to return kernel internal values for errors and will
amend in next revision.

>>+		goto out;
>>+	}
>>+
>>+	/* Invalid option ? */
>>+	if (option < EEH_OPT_DISABLE ||
>>+	    option > EEH_OPT_THAW_DMA) {
>
>This is quite confusing to read because it's not obvious what is in
>between these. Just make this a switch() statement that lists the
>allowed options. Gcc will be smart enough to optimize that into a
>bounds check.
>

Ok. Will use "switch()" in next revision.

>>+		pr_debug("%s: Option %d out of range (%d, %d)\n",
>>+			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
>>+		*retval = -3;
>>+		ret = -EINVAL;
>>+		goto out;
>>+	}
>>+
>>+	if (option == EEH_OPT_DISABLE ||
>>+	    option == EEH_OPT_ENABLE) {
>>+		*retval = 0;
>>+	} else {
>>+		if (!eeh_ops || !eeh_ops->set_option) {
>>+			*retval = -7;
>>+			ret = -ENOENT;
>>+			goto out;
>>+		}
>>+
>>+		ret = eeh_ops->set_option(pe, option);
>>+		if (ret) {
>>+			pr_debug("%s: Failure %d from backend\n",
>>+				__func__, ret);
>>+			*retval = -1;
>>+			goto out;
>>+		}
>>+
>>+		*retval = 0;
>>+	}
>>+out:
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_vfio_set_pe_option);
>>+
>>+int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
>>+			 int *retval, int *info)
>>+{
>>+	struct pci_bus *bus;
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
>>+	if (ret) {
>>+		*retval = -3;
>>+		goto out;
>>+	}
>>+
>>+	/* Invalid option ? */
>>+	if (option != 0 && option != 1) {
>
>0? 1? What? Don't these have names? And again, please use a switch()
>for this function.
>

Will have names in next revision and use "switch()", thanks.

>>+		pr_debug("%s: option %d out of range (0, 1)\n",
>>+			__func__, option);
>>+		*retval = -3;
>>+		ret = -EINVAL;
>>+		goto out;
>>+	}
>>+
>>+	/*
>>+	 * Fill result according to option. We don't differentiate
>>+	 * PCI bus and device dependent PE here. So all PEs are
>>+	 * built in "shared" mode. Also, the PE address has the format
>>+	 * of "00BBSS00".
>>+	 */
>>+	if (option == 0) {
>>+		bus = eeh_pe_bus_get(pe);
>>+		if (!bus) {
>>+			*retval = -3;
>>+			ret = -ENODEV;
>>+			goto out;
>>+		}
>>+
>>+		*retval = 0;
>>+		*info = bus->number << 16;
>
>How about positive numbers for the number and negative ones for errors?
>

We needn't carry error numbers by "info" because "retval" or "ret" already
had that information :-)

>>+	} else {
>>+		*retval = 0;
>>+		*info = 1;
>>+	}
>>+out:
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_addr);
>>+
>>+int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state)
>>+{
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int result, ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
>>+	if (ret) {
>>+		*retval = -3;
>>+		goto out;
>>+	}
>>+
>>+	if (!eeh_ops || !eeh_ops->get_state) {
>>+		pr_debug("%s: Unsupported request\n",
>>+			__func__);
>>+		ret = -ENOENT;
>>+		*retval = -3;
>>+		goto out;
>>+	}
>>+
>>+	result = eeh_ops->get_state(pe, NULL);
>>+	if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>+	     (result & EEH_STATE_DMA_ENABLED) &&
>>+	     (result & EEH_STATE_MMIO_ENABLED))
>>+		*state = 0;
>>+	else if (result & EEH_STATE_RESET_ACTIVE)
>>+		*state = 1;
>>+	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>+		 !(result & EEH_STATE_DMA_ENABLED) &&
>>+		 !(result & EEH_STATE_MMIO_ENABLED))
>>+		*state = 2;
>>+	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
>>+		 (result & EEH_STATE_DMA_ENABLED) &&
>>+		 !(result & EEH_STATE_MMIO_ENABLED))
>>+		*state = 4;
>>+	else
>>+		*state = 5;
>
>What are these numbers?
>

Will have names in next revision :)

>>+
>>+	*retval = 0;
>>+out:
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_state);
>>+
>>+int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval)
>>+{
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
>>+	if (ret) {
>>+		*retval = -3;
>>+		goto out;
>>+	}
>>+
>>+	/* Invalid option ? */
>>+	if (option != EEH_RESET_DEACTIVATE &&
>>+	    option != EEH_RESET_HOT &&
>>+	    option != EEH_RESET_FUNDAMENTAL) {
>>+		pr_debug("%s: Unsupported option %d\n",
>>+			__func__, option);
>>+		ret = -EINVAL;
>>+		*retval = -3;
>>+		goto out;
>>+	}
>>+
>>+	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset) {
>>+		pr_debug("%s: Unsupported request\n",
>>+			__func__);
>>+		ret = -ENOENT;
>>+		*retval = -7;
>>+		goto out;
>>+	}
>>+
>>+	ret = eeh_ops->reset(pe, option);
>>+	if (ret) {
>>+		pr_debug("%s: Failure %d from backend\n",
>>+			 __func__, ret);
>>+		*retval = -1;
>>+		goto out;
>>+	}
>>+
>>+	/*
>>+	 * The PE is still in frozen state and we need clear that.
>>+	 * It's good to clear frozen state after deassert to avoid
>>+	 * messy IO access during reset, which might cause recrusive
>
>recursive
>

Thanks.

>>+	 * frozen PE.
>>+	 */
>>+	if (option == EEH_RESET_DEACTIVATE) {
>>+		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
>>+		if (ret) {
>>+			pr_debug("%s: Cannot enable IO for PHB#%d-PE#%d (%d)\n",
>>+				__func__, pe->phb->global_number, pe->addr, ret);
>>+			*retval = -1;
>>+			goto out;
>>+		}
>>+
>>+		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
>>+		if (ret) {
>>+			pr_debug("%s: Cannot enable DMA for PHB#%d-PE#%d (%d)\n",
>>+				__func__, pe->phb->global_number, pe->addr, ret);
>>+			*retval = -1;
>>+			goto out;
>>+		}
>>+
>>+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
>>+	}
>>+
>>+	*retval = 0;
>>+out:
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_vfio_reset_pe);
>>+
>>+int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval)
>>+{
>>+	struct eeh_dev *edev;
>>+	struct eeh_pe *pe;
>>+	int ret = 0;
>>+
>>+	/* Device existing ? */
>>+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
>>+	if (ret) {
>>+		*retval = -3;
>>+		goto out;
>>+	}
>>+
>>+	/*
>>+	 * The access to PCI config space on VFIO device has some
>>+	 * limitations. Part of PCI config space, including BAR
>>+	 * registers are not readable and writable. So the guest
>>+	 * should have stale values for those registers and we have
>>+	 * to restore them in host side.
>
>I don't understand this comment. When is "configure_pe" called in the
>first place? Please provide proper function descriptions for each of
>these exported functions that tell someone who may want to use them
>what they do.
>

Yeah, I'll add the description here (also in Documentation/vfio.txt).

>Also, don't mention VFIO or guests in any function inside this file.
>

Ok. I'll avoid mentioning "VFIO" and "guest".

>>+	 */
>>+	eeh_pe_restore_bars(pe);
>>+	*retval = 0;
>>+
>>+out:
>>+	return ret;
>>+}
>>+EXPORT_SYMBOL_GPL(eeh_vfio_configure_pe);
>>+
>>+#endif /* CONFIG_VFIO_PCI_EEH */
>>+
>>  static int proc_eeh_show(struct seq_file *m, void *v)
>>  {
>>  	if (!eeh_enabled()) {
>>diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>>index 7ba0424..05c3dde 100644
>>--- a/drivers/vfio/pci/vfio_pci.c
>>+++ b/drivers/vfio/pci/vfio_pci.c
>>@@ -25,6 +25,9 @@
>>  #include <linux/types.h>
>>  #include <linux/uaccess.h>
>>  #include <linux/vfio.h>
>>+#ifdef CONFIG_VFIO_PCI_EEH
>>+#include <asm/eeh.h>
>>+#endif
>>  #include "vfio_pci_private.h"
>>@@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>>  	pci_restore_state(pdev);
>>  }
>>+static void vfio_eeh_pci_release(struct pci_dev *pdev)
>>+{
>>+#ifdef CONFIG_VFIO_PCI_EEH
>>+	eeh_vfio_release(pdev);
>>+#endif
>>+}
>>+
>>  static void vfio_pci_release(void *device_data)
>>  {
>>  	struct vfio_pci_device *vdev = device_data;
>>-	if (atomic_dec_and_test(&vdev->refcnt))
>>+	if (atomic_dec_and_test(&vdev->refcnt)) {
>>+		vfio_eeh_pci_release(vdev->pdev);
>>  		vfio_pci_disable(vdev);
>>+	}
>>  	module_put(THIS_MODULE);
>>  }
>>+static int vfio_eeh_pci_open(struct pci_dev *pdev)
>>+{
>>+	int ret = 0;
>>+
>>+#ifdef CONFIG_VFIO_PCI_EEH
>>+	ret = eeh_vfio_open(pdev);
>>+#endif
>>+	return ret;
>>+}
>>+
>>  static int vfio_pci_open(void *device_data)
>>  {
>>  	struct vfio_pci_device *vdev = device_data;
>>+	int ret;
>>  	if (!try_module_get(THIS_MODULE))
>>  		return -ENODEV;
>>  	if (atomic_inc_return(&vdev->refcnt) == 1) {
>>-		int ret = vfio_pci_enable(vdev);
>>-		if (ret) {
>>-			module_put(THIS_MODULE);
>>-			return ret;
>>-		}
>>+		ret = vfio_pci_enable(vdev);
>>+		if (ret)
>>+			goto error;
>>+
>>+		ret = vfio_eeh_pci_open(vdev->pdev);
>>+		if (ret)
>>+			goto error;
>>  	}
>>  	return 0;
>>+error:
>>+	module_put(THIS_MODULE);
>>+	return ret;
>>  }
>>  static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
>>@@ -321,6 +349,51 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>>  	return walk.ret;
>>  }
>>+static int vfio_eeh_pci_ioctl(struct pci_dev *pdev, struct vfio_eeh_op *info)
>
>I still don't like the idea of that multiplexing ioctl. I don't see
>any benefit in it whatsoever. Just create 5 individual ioctls with
>their own simple interfaces.
>

Ok. Will split in next revision. Thanks.

>Also, this interface has nothing to do with RTAS. So don't sneak in
>RTAS error numbers anywhere ;). It's QEMU's task to convert from
>kernel error codes to RTAS error codes.
>

Ok. Will do in next revision. Thanks for your comments and time, Alex :-)

Thanks,
Gavin

>>+{
>>+	int ret = 0;
>>+
>>+#ifdef CONFIG_VFIO_PCI_EEH
>>+	switch (info->op) {
>>+	case VFIO_EEH_OP_SET_OPTION:
>>+		ret = eeh_vfio_set_pe_option(pdev,
>>+					     info->option.option,
>>+					     &info->option.ret);
>>+		break;
>>+	case VFIO_EEH_OP_GET_ADDR:
>>+		ret = eeh_vfio_get_pe_addr(pdev,
>>+					   info->addr.option,
>>+					   &info->addr.ret,
>>+					   &info->addr.info);
>>+		break;
>>+	case VFIO_EEH_OP_GET_STATE:
>>+		ret = eeh_vfio_get_pe_state(pdev,
>>+					    &info->state.ret,
>>+					    &info->state.reset_state);
>>+		info->state.cfg_cap = 1;
>>+		info->state.pe_unavail_info = 1000;
>>+		info->state.pe_recovery_info = 0;
>>+		break;
>>+	case VFIO_EEH_OP_PE_RESET:
>>+		ret = eeh_vfio_reset_pe(pdev,
>>+					info->reset.option,
>>+					&info->reset.ret);
>>+		break;
>>+	case VFIO_EEH_OP_PE_CONFIG:
>>+		ret = eeh_vfio_configure_pe(pdev,
>>+					    &info->config.ret);
>>+	default:
>>+		ret = -EINVAL;
>>+		pr_debug("%s: Cannot handle op#%d\n",
>>+			__func__, info->op);
>>+	}
>>+#else
>>+	ret = -ENOENT;
>>+#endif
>>+
>>+	return ret;
>>+}
>>+
>>  static long vfio_pci_ioctl(void *device_data,
>>  			   unsigned int cmd, unsigned long arg)
>>  {
>>@@ -682,6 +755,20 @@ hot_reset_release:
>>  		kfree(groups);
>>  		return ret;
>>+	} else if (cmd == VFIO_EEH_OP) {
>>+		struct vfio_eeh_op info;
>>+		int ret = 0;
>>+
>>+		minsz = sizeof(info);
>>+		if (copy_from_user(&info, (void __user *)arg, minsz))
>>+			return -EFAULT;
>>+		if (info.argsz < minsz)
>>+			return -EINVAL;
>>+
>>+		ret = vfio_eeh_pci_ioctl(vdev->pdev, &info);
>>+		if (copy_to_user((void __user *)arg, &info, minsz))
>>+			ret = -EFAULT;
>>+		return ret;
>>  	}
>>  	return -ENOTTY;
>>diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>index cb9023d..518961d 100644
>>--- a/include/uapi/linux/vfio.h
>>+++ b/include/uapi/linux/vfio.h
>>@@ -455,6 +455,49 @@ struct vfio_iommu_spapr_tce_info {
>>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>>+/*
>>+ * The VFIO operation struct provides way to support EEH functionality
>>+ * for PCI device that is passed from host to guest via VFIO.
>>+ */
>>+#define VFIO_EEH_OP_SET_OPTION	0
>>+#define VFIO_EEH_OP_GET_ADDR	1
>>+#define VFIO_EEH_OP_GET_STATE	2
>>+#define VFIO_EEH_OP_PE_RESET	3
>>+#define VFIO_EEH_OP_PE_CONFIG	4
>>+
>>+struct vfio_eeh_op {
>>+	__u32 argsz;
>>+	__u32 op;
>>+
>>+	union {
>>+		struct vfio_eeh_set_option {
>>+			__u32 option;
>>+			__s32 ret;
>>+		} option;
>>+		struct vfio_eeh_pe_addr {
>>+			__u32 option;
>>+			__s32 ret;
>>+			__u32 info;
>>+		} addr;
>>+		struct vfio_eeh_pe_state {
>>+			__s32 ret;
>>+			__u32 reset_state;
>>+			__u32 cfg_cap;
>>+			__u32 pe_unavail_info;
>>+			__u32 pe_recovery_info;
>>+		} state;
>>+		struct vfio_eeh_reset {
>>+			__u32 option;
>>+			__s32 ret;
>>+		} reset;
>>+		struct vfio_eeh_config {
>>+			__s32 ret;
>>+		} config;
>>+	};
>>+};
>>+
>>+#define VFIO_EEH_OP	_IO(VFIO_TYPE, VFIO_BASE + 21)
>>+
>>  /* ***************************************************************** */
>>  #endif /* _UAPIVFIO_H */
>
>--
>To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Gavin Shan May 22, 2014, 8:11 a.m. UTC | #4
On Thu, May 22, 2014 at 07:56:31AM +1000, Benjamin Herrenschmidt wrote:
>On Wed, 2014-05-21 at 15:07 +0200, Alexander Graf wrote:
>
>> > +#ifdef CONFIG_VFIO_PCI_EEH
>> > +int eeh_vfio_open(struct pci_dev *pdev)
>> 
>> Why vfio? Also that config option will not be set if vfio is compiled as 
>> a module.
>> 
>> > +{
>> > +	struct eeh_dev *edev;
>> > +
>> > +	/* No PCI device ? */
>> > +	if (!pdev)
>> > +		return -ENODEV;
>> > +
>> > +	/* No EEH device ? */
>> > +	edev = pci_dev_to_eeh_dev(pdev);
>> > +	if (!edev || !edev->pe)
>> > +		return -ENODEV;
>> > +
>> > +	eeh_dev_set_passed(edev, true);
>> > +	eeh_pe_set_passed(edev->pe, true);
>> > +
>> > +	return 0;
>> > +}
>> > +EXPORT_SYMBOL_GPL(eeh_vfio_open);
>
>Additionally, shouldn't we have some locking here ? (and in release too)
>
>I don't like relying on the caller locking (if it does it at all).
>

Ok. I'll add one mutex for open() and release() in next revision.
Thanks for the comment.

>> > +	/* Device existing ? */
>> > +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
>> > +	if (ret) {
>> > +		pr_debug("%s: Cannot find device %s\n",
>> > +			__func__, pdev ? pci_name(pdev) : "NULL");
>> > +		*retval = -7;
>> 
>> What are these? Please use proper kernel internal return values for 
>> errors. I don't want to see anything even remotely tied to RTAS in any 
>> of these patches.
>
>Hint: -ENODEV
>

In next revision, Those exported functions will have return value as:

>= 0:	carrried information to caller.
<  0:   error number.

Thanks,
Gavin
diff mbox

Patch

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..bb17ec7 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,10 @@  faster, the map/unmap handling has been implemented in real mode which provides
 an excellent performance which has limitations such as inability to do
 locked pages accounting in real time.
 
-So 3 additional ioctls have been added:
+4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
+which works on the basis of additional ioctl command VFIO_EEH_OP.
+
+So 4 additional ioctls have been added:
 
 	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
 		of the DMA window on the PCI bus.
@@ -316,6 +319,7 @@  So 3 additional ioctls have been added:
 
 	VFIO_IOMMU_DISABLE - disables the container.
 
+	VFIO_EEH_OP - EEH dependent operations
 
 The code flow from the example above should be slightly changed:
 
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 34a2d83..93922ef 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -305,6 +305,16 @@  void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
+#ifdef CONFIG_VFIO_PCI_EEH
+int eeh_vfio_open(struct pci_dev *pdev);
+void eeh_vfio_release(struct pci_dev *pdev);
+int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval);
+int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
+			 int *retval, int *info);
+int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state);
+int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval);
+int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval);
+#endif
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..2aaf90e 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1098,6 +1098,329 @@  void eeh_remove_device(struct pci_dev *dev)
 	edev->mode &= ~EEH_DEV_SYSFS;
 }
 
+#ifdef CONFIG_VFIO_PCI_EEH
+int eeh_vfio_open(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev;
+
+	/* No PCI device ? */
+	if (!pdev)
+		return -ENODEV;
+
+	/* No EEH device ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !edev->pe)
+		return -ENODEV;
+
+	eeh_dev_set_passed(edev, true);
+	eeh_pe_set_passed(edev->pe, true);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_open);
+
+void eeh_vfio_release(struct pci_dev *pdev)
+{
+	bool release_pe = true;
+	struct eeh_pe *pe = NULL;
+	struct eeh_dev *tmp, *edev;
+
+	/* No PCI device ? */
+	if (!pdev)
+		return;
+
+	/* No EEH device ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !eeh_dev_passed(edev) ||
+	    !edev->pe || !eeh_pe_passed(pe))
+		return;
+
+	/* Release device */
+	pe = edev->pe;
+	eeh_dev_set_passed(edev, false);
+
+	/* Release PE */
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (eeh_dev_passed(edev)) {
+			release_pe = false;
+			break;
+		}
+	}
+
+	if (release_pe)
+		eeh_pe_set_passed(pe, false);
+}
+EXPORT_SYMBOL(eeh_vfio_release);
+
+static int eeh_vfio_check_dev(struct pci_dev *pdev,
+			      struct eeh_dev **pedev,
+			      struct eeh_pe **ppe)
+{
+	struct eeh_dev *edev;
+
+	/* No device ? */
+	if (!pdev)
+		return -ENODEV;
+
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !eeh_dev_passed(edev) ||
+	    !edev->pe || !eeh_pe_passed(edev->pe))
+		return -ENODEV;
+
+	if (pedev)
+		*pedev = edev;
+	if (ppe)
+		*ppe = edev->pe;
+
+	return 0;
+}
+
+int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (ret) {
+		pr_debug("%s: Cannot find device %s\n",
+			__func__, pdev ? pci_name(pdev) : "NULL");
+		*retval = -7;
+		goto out;
+	}
+
+	/* Invalid option ? */
+	if (option < EEH_OPT_DISABLE ||
+	    option > EEH_OPT_THAW_DMA) {
+		pr_debug("%s: Option %d out of range (%d, %d)\n",
+			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+		*retval = -3;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (option == EEH_OPT_DISABLE ||
+	    option == EEH_OPT_ENABLE) {
+		*retval = 0;
+	} else {
+		if (!eeh_ops || !eeh_ops->set_option) {
+			*retval = -7;
+			ret = -ENOENT;
+			goto out;
+		}
+
+		ret = eeh_ops->set_option(pe, option);
+		if (ret) {
+			pr_debug("%s: Failure %d from backend\n",
+				__func__, ret);
+			*retval = -1;
+			goto out;
+		}
+
+		*retval = 0;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_set_pe_option);
+
+int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
+			 int *retval, int *info)
+{
+	struct pci_bus *bus;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (ret) {
+		*retval = -3;
+		goto out;
+	}
+
+	/* Invalid option ? */
+	if (option != 0 && option != 1) {
+		pr_debug("%s: option %d out of range (0, 1)\n",
+			__func__, option);
+		*retval = -3;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Fill result according to option. We don't differentiate
+	 * PCI bus and device dependent PE here. So all PEs are
+	 * built in "shared" mode. Also, the PE address has the format
+	 * of "00BBSS00".
+	 */
+	if (option == 0) {
+		bus = eeh_pe_bus_get(pe);
+		if (!bus) {
+			*retval = -3;
+			ret = -ENODEV;
+			goto out;
+		}
+
+		*retval = 0;
+		*info = bus->number << 16;
+	} else {
+		*retval = 0;
+		*info = 1;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_addr);
+
+int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int result, ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (ret) {
+		*retval = -3;
+		goto out;
+	}
+
+	if (!eeh_ops || !eeh_ops->get_state) {
+		pr_debug("%s: Unsupported request\n",
+			__func__);
+		ret = -ENOENT;
+		*retval = -3;
+		goto out;
+	}
+
+	result = eeh_ops->get_state(pe, NULL);
+	if (!(result & EEH_STATE_RESET_ACTIVE) &&
+	     (result & EEH_STATE_DMA_ENABLED) &&
+	     (result & EEH_STATE_MMIO_ENABLED))
+		*state = 0;
+	else if (result & EEH_STATE_RESET_ACTIVE)
+		*state = 1;
+	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
+		 !(result & EEH_STATE_DMA_ENABLED) &&
+		 !(result & EEH_STATE_MMIO_ENABLED))
+		*state = 2;
+	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
+		 (result & EEH_STATE_DMA_ENABLED) &&
+		 !(result & EEH_STATE_MMIO_ENABLED))
+		*state = 4;
+	else
+		*state = 5;
+
+	*retval = 0;
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_state);
+
+int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (ret) {
+		*retval = -3;
+		goto out;
+	}
+
+	/* Invalid option ? */
+	if (option != EEH_RESET_DEACTIVATE &&
+	    option != EEH_RESET_HOT &&
+	    option != EEH_RESET_FUNDAMENTAL) {
+		pr_debug("%s: Unsupported option %d\n",
+			__func__, option);
+		ret = -EINVAL;
+		*retval = -3;
+		goto out;
+	}
+
+	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset) {
+		pr_debug("%s: Unsupported request\n",
+			__func__);
+		ret = -ENOENT;
+		*retval = -7;
+		goto out;
+	}
+
+	ret = eeh_ops->reset(pe, option);
+	if (ret) {
+		pr_debug("%s: Failure %d from backend\n",
+			 __func__, ret);
+		*retval = -1;
+		goto out;
+	}
+
+	/*
+	 * The PE is still in frozen state and we need clear that.
+	 * It's good to clear frozen state after deassert to avoid
+	 * messy IO access during reset, which might cause recrusive
+	 * frozen PE.
+	 */
+	if (option == EEH_RESET_DEACTIVATE) {
+		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
+		if (ret) {
+			pr_debug("%s: Cannot enable IO for PHB#%d-PE#%d (%d)\n",
+				__func__, pe->phb->global_number, pe->addr, ret);
+			*retval = -1;
+			goto out;
+		}
+
+		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
+		if (ret) {
+			pr_debug("%s: Cannot enable DMA for PHB#%d-PE#%d (%d)\n",
+				__func__, pe->phb->global_number, pe->addr, ret);
+			*retval = -1;
+			goto out;
+		}
+
+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+	}
+
+	*retval = 0;
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_reset_pe);
+
+int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (ret) {
+		*retval = -3;
+		goto out;
+	}
+
+	/*
+	 * The access to PCI config space on VFIO device has some
+	 * limitations. Part of PCI config space, including BAR
+	 * registers are not readable and writable. So the guest
+	 * should have stale values for those registers and we have
+	 * to restore them in host side.
+	 */
+	eeh_pe_restore_bars(pe);
+	*retval = 0;
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_configure_pe);
+
+#endif /* CONFIG_VFIO_PCI_EEH */
+
 static int proc_eeh_show(struct seq_file *m, void *v)
 {
 	if (!eeh_enabled()) {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7ba0424..05c3dde 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -25,6 +25,9 @@ 
 #include <linux/types.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
+#ifdef CONFIG_VFIO_PCI_EEH
+#include <asm/eeh.h>
+#endif
 
 #include "vfio_pci_private.h"
 
@@ -152,32 +155,57 @@  static void vfio_pci_disable(struct vfio_pci_device *vdev)
 	pci_restore_state(pdev);
 }
 
+static void vfio_eeh_pci_release(struct pci_dev *pdev)
+{
+#ifdef CONFIG_VFIO_PCI_EEH
+	eeh_vfio_release(pdev);
+#endif
+}
+
 static void vfio_pci_release(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
 
-	if (atomic_dec_and_test(&vdev->refcnt))
+	if (atomic_dec_and_test(&vdev->refcnt)) {
+		vfio_eeh_pci_release(vdev->pdev);
 		vfio_pci_disable(vdev);
+	}
 
 	module_put(THIS_MODULE);
 }
 
+static int vfio_eeh_pci_open(struct pci_dev *pdev)
+{
+	int ret = 0;
+
+#ifdef CONFIG_VFIO_PCI_EEH
+	ret = eeh_vfio_open(pdev);
+#endif
+	return ret;
+}
+
 static int vfio_pci_open(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
+	int ret;
 
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
 	if (atomic_inc_return(&vdev->refcnt) == 1) {
-		int ret = vfio_pci_enable(vdev);
-		if (ret) {
-			module_put(THIS_MODULE);
-			return ret;
-		}
+		ret = vfio_pci_enable(vdev);
+		if (ret)
+			goto error;
+
+		ret = vfio_eeh_pci_open(vdev->pdev);
+		if (ret)
+			goto error;
 	}
 
 	return 0;
+error:
+	module_put(THIS_MODULE);
+	return ret;
 }
 
 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
@@ -321,6 +349,51 @@  static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
 	return walk.ret;
 }
 
+static int vfio_eeh_pci_ioctl(struct pci_dev *pdev, struct vfio_eeh_op *info)
+{
+	int ret = 0;
+
+#ifdef CONFIG_VFIO_PCI_EEH
+	switch (info->op) {
+	case VFIO_EEH_OP_SET_OPTION:
+		ret = eeh_vfio_set_pe_option(pdev,
+					     info->option.option,
+					     &info->option.ret);
+		break;
+	case VFIO_EEH_OP_GET_ADDR:
+		ret = eeh_vfio_get_pe_addr(pdev,
+					   info->addr.option,
+					   &info->addr.ret,
+					   &info->addr.info);
+		break;
+	case VFIO_EEH_OP_GET_STATE:
+		ret = eeh_vfio_get_pe_state(pdev,
+					    &info->state.ret,
+					    &info->state.reset_state);
+		info->state.cfg_cap = 1;
+		info->state.pe_unavail_info = 1000;
+		info->state.pe_recovery_info = 0;
+		break;
+	case VFIO_EEH_OP_PE_RESET:
+		ret = eeh_vfio_reset_pe(pdev,
+					info->reset.option,
+					&info->reset.ret);
+		break;
+	case VFIO_EEH_OP_PE_CONFIG:
+		ret = eeh_vfio_configure_pe(pdev,
+					    &info->config.ret);
+	default:
+		ret = -EINVAL;
+		pr_debug("%s: Cannot handle op#%d\n",
+			__func__, info->op);
+	}
+#else
+	ret = -ENOENT;
+#endif
+
+	return ret;
+}
+
 static long vfio_pci_ioctl(void *device_data,
 			   unsigned int cmd, unsigned long arg)
 {
@@ -682,6 +755,20 @@  hot_reset_release:
 
 		kfree(groups);
 		return ret;
+	} else if (cmd == VFIO_EEH_OP) {
+		struct vfio_eeh_op info;
+		int ret = 0;
+
+		minsz = sizeof(info);
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = vfio_eeh_pci_ioctl(vdev->pdev, &info);
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			ret = -EFAULT;
+		return ret;
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index cb9023d..518961d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -455,6 +455,49 @@  struct vfio_iommu_spapr_tce_info {
 
 #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
 
+/*
+ * The VFIO operation struct provides way to support EEH functionality
+ * for PCI device that is passed from host to guest via VFIO.
+ */
+#define VFIO_EEH_OP_SET_OPTION	0
+#define VFIO_EEH_OP_GET_ADDR	1
+#define VFIO_EEH_OP_GET_STATE	2
+#define VFIO_EEH_OP_PE_RESET	3
+#define VFIO_EEH_OP_PE_CONFIG	4
+
+struct vfio_eeh_op {
+	__u32 argsz;
+	__u32 op;
+
+	union {
+		struct vfio_eeh_set_option {
+			__u32 option;
+			__s32 ret;
+		} option;
+		struct vfio_eeh_pe_addr {
+			__u32 option;
+			__s32 ret;
+			__u32 info;
+		} addr;
+		struct vfio_eeh_pe_state {
+			__s32 ret;
+			__u32 reset_state;
+			__u32 cfg_cap;
+			__u32 pe_unavail_info;
+			__u32 pe_recovery_info;
+		} state;
+		struct vfio_eeh_reset {
+			__u32 option;
+			__s32 ret;
+		} reset;
+		struct vfio_eeh_config {
+			__s32 ret;
+		} config;
+	};
+};
+
+#define VFIO_EEH_OP	_IO(VFIO_TYPE, VFIO_BASE + 21)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */