diff mbox series

[04/12] pci-p2p: Clear ACS P2P flags for all client devices

Message ID 20180104190137.7654-5-logang@deltatee.com
State Changes Requested
Headers show
Series Copy Offload in NVMe Fabrics with P2P PCI Memory | expand

Commit Message

Logan Gunthorpe Jan. 4, 2018, 7:01 p.m. UTC
When the ACS P2P flags are set in the downstream port of the switch,
any P2P TLPs will be sent back to the root complex. The whole point of
the P2P work is to have TLPs avoid the RC seeing it may not support
P2P transactions or, at best, it will perform poorly. So we clear these
flags for all devices involved in transactions with the p2pmem.

A count of the number of requests to disable the flags is maintained.
When the count transitions from 1 to 0, the old flags are restored.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/pci/p2p.c   | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/pci.h |   2 +
 2 files changed, 143 insertions(+), 3 deletions(-)

Comments

Bjorn Helgaas Jan. 4, 2018, 9:57 p.m. UTC | #1
[+cc Alex]

On Thu, Jan 04, 2018 at 12:01:29PM -0700, Logan Gunthorpe wrote:
> When the ACS P2P flags are set in the downstream port of the switch,
> any P2P TLPs will be sent back to the root complex. The whole point of
> the P2P work is to have TLPs avoid the RC seeing it may not support
> P2P transactions or, at best, it will perform poorly. So we clear these

"It will perform poorly" seems like an empirical statement about the
hardware you've seen, not something that's driven by the spec.  So I'm
not sure it adds information that will be useful long-term.

> flags for all devices involved in transactions with the p2pmem.

I'm not sure how this coordinates with other uses of ACS, e.g., VFIO.
I think that should be addressed here somehow.

> A count of the number of requests to disable the flags is maintained.
> When the count transitions from 1 to 0, the old flags are restored.
> 
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> ---
>  drivers/pci/p2p.c   | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  include/linux/pci.h |   2 +
>  2 files changed, 143 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/pci/p2p.c b/drivers/pci/p2p.c
> index 87cec87b02e3..617adaa905d2 100644
> --- a/drivers/pci/p2p.c
> +++ b/drivers/pci/p2p.c
> @@ -227,6 +227,89 @@ static struct pci_dev *find_parent_pci_dev(struct device *dev)
>  }
>  
>  /*
> + * The ACS flags for P2P Request Redirect and P2P Completion Redirect need
> + * to be disabled in the downstream port of each device in order for
> + * the TLPs to not be forwarded up to the RC.
> + */
> +#define PCI_P2P_ACS_FLAGS (PCI_ACS_RR | PCI_ACS_CR)
> +
> +static int pci_p2pmem_disable_acs(struct pci_dev *pdev)
> +{
> +	int pos;
> +	u16 ctrl;
> +	struct pci_dev *downstream;
> +
> +	downstream = pci_dev_get(pci_upstream_bridge(pdev));
> +	if (!downstream) {
> +		dev_err(&pdev->dev, "could not find downstream port\n");
> +		return -ENODEV;
> +	}
> +
> +	device_lock(&downstream->dev);
> +	if (downstream->p2p_acs_requests++)
> +		goto unlock_and_return;
> +
> +	pos = pci_find_ext_capability(downstream, PCI_EXT_CAP_ID_ACS);
> +	if (!pos)
> +		goto unlock_and_return;
> +
> +	pci_read_config_word(downstream, pos + PCI_ACS_CTRL, &ctrl);
> +
> +	downstream->p2p_old_acs_flags = ctrl & PCI_P2P_ACS_FLAGS;
> +
> +	if (downstream->p2p_old_acs_flags)
> +		dev_info(&pdev->dev, "disabling p2p acs flags: %x\n", ctrl);
> +
> +	ctrl &= ~PCI_P2P_ACS_FLAGS;
> +
> +	pci_write_config_word(downstream, pos + PCI_ACS_CTRL, ctrl);
> +
> +unlock_and_return:
> +	device_unlock(&downstream->dev);
> +	pci_dev_put(downstream);
> +	return 0;
> +}
> +
> +static int pci_p2pmem_reset_acs(struct pci_dev *pdev)
> +{
> +	int pos;
> +	u16 ctrl;
> +	struct pci_dev *downstream;
> +
> +	downstream = pci_dev_get(pci_upstream_bridge(pdev));
> +	if (!downstream)
> +		return -ENODEV;
> +
> +	device_lock(&downstream->dev);
> +
> +	/* Only actually reset the flags on a 1->0 transition */
> +	if (!downstream->p2p_acs_requests)
> +		goto unlock_and_return;
> +
> +	if (--downstream->p2p_acs_requests)
> +		goto unlock_and_return;
> +
> +	pos = pci_find_ext_capability(downstream, PCI_EXT_CAP_ID_ACS);
> +	if (!pos)
> +		goto unlock_and_return;
> +
> +	pci_read_config_word(downstream, pos + PCI_ACS_CTRL, &ctrl);
> +
> +	ctrl &= ~PCI_P2P_ACS_FLAGS;
> +	ctrl |= downstream->p2p_old_acs_flags;
> +
> +	if (downstream->p2p_old_acs_flags)
> +		dev_info(&pdev->dev, "resetting p2p acs flags: %x\n", ctrl);
> +
> +	pci_write_config_word(downstream, pos + PCI_ACS_CTRL, ctrl);
> +
> +unlock_and_return:
> +	device_unlock(&downstream->dev);
> +	pci_dev_put(downstream);
> +	return 0;
> +}
> +
> +/*
>   * If a device is behind a switch, we try to find the upstream bridge
>   * port of the switch. This requires two calls to pci_upstream_bridge:
>   * one for the upstream port on the switch, one on the upstream port
> @@ -340,6 +423,10 @@ int pci_p2pmem_add_client(struct list_head *head, struct device *dev)
>  			ret = -EXDEV;
>  			goto put_client;
>  		}
> +
> +		ret = pci_p2pmem_disable_acs(client);
> +		if (!ret)
> +			goto put_client;
>  	}
>  
>  	new_item = kzalloc(sizeof(*new_item), GFP_KERNEL);
> @@ -363,6 +450,9 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_add_client);
>  
>  static void pci_p2pmem_client_free(struct pci_p2pmem_client *item)
>  {
> +	if (item->p2pmem)
> +		pci_p2pmem_reset_acs(item->client);
> +
>  	list_del(&item->list);
>  	pci_dev_put(item->client);
>  	pci_dev_put(item->p2pmem);
> @@ -383,6 +473,7 @@ void pci_p2pmem_remove_client(struct list_head *head, struct device *dev)
>  {
>  	struct pci_p2pmem_client *pos, *tmp;
>  	struct pci_dev *pdev;
> +	struct pci_dev *p2pmem = NULL;
>  
>  	pdev = find_parent_pci_dev(dev);
>  	if (!pdev)
> @@ -392,9 +483,16 @@ void pci_p2pmem_remove_client(struct list_head *head, struct device *dev)
>  		if (pos->client != pdev)
>  			continue;
>  
> +		if (!p2pmem)
> +			p2pmem = pci_dev_get(pos->p2pmem);
> +
>  		pci_p2pmem_client_free(pos);
>  	}
>  
> +	if (p2pmem && list_empty(head))
> +		pci_p2pmem_reset_acs(p2pmem);
> +
> +	pci_dev_put(p2pmem);
>  	pci_dev_put(pdev);
>  }
>  EXPORT_SYMBOL_GPL(pci_p2pmem_remove_client);
> @@ -412,6 +510,10 @@ void pci_p2pmem_client_list_free(struct list_head *head)
>  {
>  	struct pci_p2pmem_client *pos, *tmp;
>  
> +	pos = list_first_entry_or_null(head, struct pci_p2pmem_client, list);
> +	if (pos && pos->p2pmem)
> +		pci_p2pmem_reset_acs(pos->p2pmem);
> +
>  	list_for_each_entry_safe(pos, tmp, head, list)
>  		pci_p2pmem_client_free(pos);
>  }
> @@ -440,6 +542,40 @@ static bool upstream_bridges_match_list(struct pci_dev *pdev,
>  	return ret;
>  }
>  
> +static int bind_clients(struct pci_dev *p2pmem, struct list_head *clients)
> +{
> +	int ret;
> +	struct pci_p2pmem_client *pos, *unwind_pos;
> +
> +	ret = pci_p2pmem_disable_acs(p2pmem);
> +	if (ret)
> +		return ret;
> +
> +	list_for_each_entry(pos, clients, list) {
> +		ret = pci_p2pmem_disable_acs(pos->client);
> +		if (ret)
> +			goto unwind;
> +
> +		pos->p2pmem = pci_dev_get(p2pmem);
> +	}
> +
> +	return 0;
> +
> +unwind:
> +	list_for_each_entry(unwind_pos, clients, list) {
> +		if (pos == unwind_pos)
> +			break;
> +
> +		pci_p2pmem_reset_acs(unwind_pos->client);
> +		pci_dev_put(unwind_pos->p2pmem);
> +		unwind_pos->p2pmem = NULL;
> +	}
> +
> +	pci_p2pmem_reset_acs(p2pmem);
> +
> +	return ret;
> +}
> +
>  /**
>   * pci_p2pmem_find - find a p2p mem device compatible with the specified device
>   * @dev: list of device to check (NULL-terminated)
> @@ -450,11 +586,13 @@ static bool upstream_bridges_match_list(struct pci_dev *pdev,
>   *
>   * Returns a pointer to the PCI device with a reference taken (use pci_dev_put
>   * to return the reference) or NULL if no compatible device is found.
> + *
> + * The P2P ACS flags on all applicable PCI devices will be cleared and
> + * reset when the client is removed from the list.
>   */
>  struct pci_dev *pci_p2pmem_find(struct list_head *clients)
>  {
>  	struct pci_dev *pdev = NULL;
> -	struct pci_p2pmem_client *pos;
>  
>  	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) {
>  		if (!pdev->p2p || !pdev->p2p->published)
> @@ -463,8 +601,8 @@ struct pci_dev *pci_p2pmem_find(struct list_head *clients)
>  		if (!upstream_bridges_match_list(pdev, clients))
>  			continue;
>  
> -		list_for_each_entry(pos, clients, list)
> -			pos->p2pmem = pdev;
> +		if (bind_clients(pdev, clients))
> +			continue;
>  
>  		return pdev;
>  	}
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 047aea679e87..cdd4d3c3e562 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -435,6 +435,8 @@ struct pci_dev {
>  #endif
>  #ifdef CONFIG_PCI_P2P
>  	struct pci_p2p *p2p;
> +	unsigned int p2p_acs_requests;
> +	u16 p2p_old_acs_flags;
>  #endif
>  	phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */
>  	size_t romlen; /* Length of ROM if it's not from the BAR */
> -- 
> 2.11.0
>
Alex Williamson Jan. 4, 2018, 10:35 p.m. UTC | #2
On Thu, 4 Jan 2018 15:57:21 -0600
Bjorn Helgaas <helgaas@kernel.org> wrote:

> [+cc Alex]
> 
> On Thu, Jan 04, 2018 at 12:01:29PM -0700, Logan Gunthorpe wrote:
> > When the ACS P2P flags are set in the downstream port of the switch,
> > any P2P TLPs will be sent back to the root complex. The whole point of
> > the P2P work is to have TLPs avoid the RC seeing it may not support
> > P2P transactions or, at best, it will perform poorly. So we clear these  
> 
> "It will perform poorly" seems like an empirical statement about the
> hardware you've seen, not something that's driven by the spec.  So I'm
> not sure it adds information that will be useful long-term.
> 
> > flags for all devices involved in transactions with the p2pmem.  
> 
> I'm not sure how this coordinates with other uses of ACS, e.g., VFIO.
> I think that should be addressed here somehow.

Yep, flipping these ACS bits invalidates any IOMMU groups that depend
on the isolation of that downstream port and I suspect also any peers
within the same PCI slot of that port and their downstream devices.  The
entire sub-hierarchy grouping needs to be re-evaluated.  This
potentially affects running devices that depend on that isolation, so
I'm not sure how that happens dynamically.  A boot option might be
easier.  Thanks,

Alex

> > A count of the number of requests to disable the flags is maintained.
> > When the count transitions from 1 to 0, the old flags are restored.
> > 
> > Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> > ---
> >  drivers/pci/p2p.c   | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++--
> >  include/linux/pci.h |   2 +
> >  2 files changed, 143 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/pci/p2p.c b/drivers/pci/p2p.c
> > index 87cec87b02e3..617adaa905d2 100644
> > --- a/drivers/pci/p2p.c
> > +++ b/drivers/pci/p2p.c
> > @@ -227,6 +227,89 @@ static struct pci_dev *find_parent_pci_dev(struct device *dev)
> >  }
> >  
> >  /*
> > + * The ACS flags for P2P Request Redirect and P2P Completion Redirect need
> > + * to be disabled in the downstream port of each device in order for
> > + * the TLPs to not be forwarded up to the RC.
> > + */
> > +#define PCI_P2P_ACS_FLAGS (PCI_ACS_RR | PCI_ACS_CR)
> > +
> > +static int pci_p2pmem_disable_acs(struct pci_dev *pdev)
> > +{
> > +	int pos;
> > +	u16 ctrl;
> > +	struct pci_dev *downstream;
> > +
> > +	downstream = pci_dev_get(pci_upstream_bridge(pdev));
> > +	if (!downstream) {
> > +		dev_err(&pdev->dev, "could not find downstream port\n");
> > +		return -ENODEV;
> > +	}
> > +
> > +	device_lock(&downstream->dev);
> > +	if (downstream->p2p_acs_requests++)
> > +		goto unlock_and_return;
> > +
> > +	pos = pci_find_ext_capability(downstream, PCI_EXT_CAP_ID_ACS);
> > +	if (!pos)
> > +		goto unlock_and_return;
> > +
> > +	pci_read_config_word(downstream, pos + PCI_ACS_CTRL, &ctrl);
> > +
> > +	downstream->p2p_old_acs_flags = ctrl & PCI_P2P_ACS_FLAGS;
> > +
> > +	if (downstream->p2p_old_acs_flags)
> > +		dev_info(&pdev->dev, "disabling p2p acs flags: %x\n", ctrl);
> > +
> > +	ctrl &= ~PCI_P2P_ACS_FLAGS;
> > +
> > +	pci_write_config_word(downstream, pos + PCI_ACS_CTRL, ctrl);
> > +
> > +unlock_and_return:
> > +	device_unlock(&downstream->dev);
> > +	pci_dev_put(downstream);
> > +	return 0;
> > +}
> > +
> > +static int pci_p2pmem_reset_acs(struct pci_dev *pdev)
> > +{
> > +	int pos;
> > +	u16 ctrl;
> > +	struct pci_dev *downstream;
> > +
> > +	downstream = pci_dev_get(pci_upstream_bridge(pdev));
> > +	if (!downstream)
> > +		return -ENODEV;
> > +
> > +	device_lock(&downstream->dev);
> > +
> > +	/* Only actually reset the flags on a 1->0 transition */
> > +	if (!downstream->p2p_acs_requests)
> > +		goto unlock_and_return;
> > +
> > +	if (--downstream->p2p_acs_requests)
> > +		goto unlock_and_return;
> > +
> > +	pos = pci_find_ext_capability(downstream, PCI_EXT_CAP_ID_ACS);
> > +	if (!pos)
> > +		goto unlock_and_return;
> > +
> > +	pci_read_config_word(downstream, pos + PCI_ACS_CTRL, &ctrl);
> > +
> > +	ctrl &= ~PCI_P2P_ACS_FLAGS;
> > +	ctrl |= downstream->p2p_old_acs_flags;
> > +
> > +	if (downstream->p2p_old_acs_flags)
> > +		dev_info(&pdev->dev, "resetting p2p acs flags: %x\n", ctrl);
> > +
> > +	pci_write_config_word(downstream, pos + PCI_ACS_CTRL, ctrl);
> > +
> > +unlock_and_return:
> > +	device_unlock(&downstream->dev);
> > +	pci_dev_put(downstream);
> > +	return 0;
> > +}
> > +
> > +/*
> >   * If a device is behind a switch, we try to find the upstream bridge
> >   * port of the switch. This requires two calls to pci_upstream_bridge:
> >   * one for the upstream port on the switch, one on the upstream port
> > @@ -340,6 +423,10 @@ int pci_p2pmem_add_client(struct list_head *head, struct device *dev)
> >  			ret = -EXDEV;
> >  			goto put_client;
> >  		}
> > +
> > +		ret = pci_p2pmem_disable_acs(client);
> > +		if (!ret)
> > +			goto put_client;
> >  	}
> >  
> >  	new_item = kzalloc(sizeof(*new_item), GFP_KERNEL);
> > @@ -363,6 +450,9 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_add_client);
> >  
> >  static void pci_p2pmem_client_free(struct pci_p2pmem_client *item)
> >  {
> > +	if (item->p2pmem)
> > +		pci_p2pmem_reset_acs(item->client);
> > +
> >  	list_del(&item->list);
> >  	pci_dev_put(item->client);
> >  	pci_dev_put(item->p2pmem);
> > @@ -383,6 +473,7 @@ void pci_p2pmem_remove_client(struct list_head *head, struct device *dev)
> >  {
> >  	struct pci_p2pmem_client *pos, *tmp;
> >  	struct pci_dev *pdev;
> > +	struct pci_dev *p2pmem = NULL;
> >  
> >  	pdev = find_parent_pci_dev(dev);
> >  	if (!pdev)
> > @@ -392,9 +483,16 @@ void pci_p2pmem_remove_client(struct list_head *head, struct device *dev)
> >  		if (pos->client != pdev)
> >  			continue;
> >  
> > +		if (!p2pmem)
> > +			p2pmem = pci_dev_get(pos->p2pmem);
> > +
> >  		pci_p2pmem_client_free(pos);
> >  	}
> >  
> > +	if (p2pmem && list_empty(head))
> > +		pci_p2pmem_reset_acs(p2pmem);
> > +
> > +	pci_dev_put(p2pmem);
> >  	pci_dev_put(pdev);
> >  }
> >  EXPORT_SYMBOL_GPL(pci_p2pmem_remove_client);
> > @@ -412,6 +510,10 @@ void pci_p2pmem_client_list_free(struct list_head *head)
> >  {
> >  	struct pci_p2pmem_client *pos, *tmp;
> >  
> > +	pos = list_first_entry_or_null(head, struct pci_p2pmem_client, list);
> > +	if (pos && pos->p2pmem)
> > +		pci_p2pmem_reset_acs(pos->p2pmem);
> > +
> >  	list_for_each_entry_safe(pos, tmp, head, list)
> >  		pci_p2pmem_client_free(pos);
> >  }
> > @@ -440,6 +542,40 @@ static bool upstream_bridges_match_list(struct pci_dev *pdev,
> >  	return ret;
> >  }
> >  
> > +static int bind_clients(struct pci_dev *p2pmem, struct list_head *clients)
> > +{
> > +	int ret;
> > +	struct pci_p2pmem_client *pos, *unwind_pos;
> > +
> > +	ret = pci_p2pmem_disable_acs(p2pmem);
> > +	if (ret)
> > +		return ret;
> > +
> > +	list_for_each_entry(pos, clients, list) {
> > +		ret = pci_p2pmem_disable_acs(pos->client);
> > +		if (ret)
> > +			goto unwind;
> > +
> > +		pos->p2pmem = pci_dev_get(p2pmem);
> > +	}
> > +
> > +	return 0;
> > +
> > +unwind:
> > +	list_for_each_entry(unwind_pos, clients, list) {
> > +		if (pos == unwind_pos)
> > +			break;
> > +
> > +		pci_p2pmem_reset_acs(unwind_pos->client);
> > +		pci_dev_put(unwind_pos->p2pmem);
> > +		unwind_pos->p2pmem = NULL;
> > +	}
> > +
> > +	pci_p2pmem_reset_acs(p2pmem);
> > +
> > +	return ret;
> > +}
> > +
> >  /**
> >   * pci_p2pmem_find - find a p2p mem device compatible with the specified device
> >   * @dev: list of device to check (NULL-terminated)
> > @@ -450,11 +586,13 @@ static bool upstream_bridges_match_list(struct pci_dev *pdev,
> >   *
> >   * Returns a pointer to the PCI device with a reference taken (use pci_dev_put
> >   * to return the reference) or NULL if no compatible device is found.
> > + *
> > + * The P2P ACS flags on all applicable PCI devices will be cleared and
> > + * reset when the client is removed from the list.
> >   */
> >  struct pci_dev *pci_p2pmem_find(struct list_head *clients)
> >  {
> >  	struct pci_dev *pdev = NULL;
> > -	struct pci_p2pmem_client *pos;
> >  
> >  	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) {
> >  		if (!pdev->p2p || !pdev->p2p->published)
> > @@ -463,8 +601,8 @@ struct pci_dev *pci_p2pmem_find(struct list_head *clients)
> >  		if (!upstream_bridges_match_list(pdev, clients))
> >  			continue;
> >  
> > -		list_for_each_entry(pos, clients, list)
> > -			pos->p2pmem = pdev;
> > +		if (bind_clients(pdev, clients))
> > +			continue;
> >  
> >  		return pdev;
> >  	}
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index 047aea679e87..cdd4d3c3e562 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -435,6 +435,8 @@ struct pci_dev {
> >  #endif
> >  #ifdef CONFIG_PCI_P2P
> >  	struct pci_p2p *p2p;
> > +	unsigned int p2p_acs_requests;
> > +	u16 p2p_old_acs_flags;
> >  #endif
> >  	phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */
> >  	size_t romlen; /* Length of ROM if it's not from the BAR */
> > -- 
> > 2.11.0
> >
Logan Gunthorpe Jan. 5, 2018, midnight UTC | #3
On 04/01/18 03:35 PM, Alex Williamson wrote:
> Yep, flipping these ACS bits invalidates any IOMMU groups that depend
> on the isolation of that downstream port and I suspect also any peers
> within the same PCI slot of that port and their downstream devices.  The
> entire sub-hierarchy grouping needs to be re-evaluated.  This
> potentially affects running devices that depend on that isolation, so
> I'm not sure how that happens dynamically.  A boot option might be
> easier.  Thanks,

I don't see how this is the case in current kernel code. It appears to 
only enable ACS globally if the IOMMU requests it.

I also don't see how turning off ACS isolation for a specific device is 
going to hurt anything. The IOMMU should still be able to keep going on 
unaware that anything has changed. The only worry is that a security 
hole may now be created if a user was relying on the isolation between 
two devices that are in different VMs or something. However, if a user 
was relying on this, they probably shouldn't have turned on P2P in the 
first place.

We started with a fairly unintelligent choice to simply disable ACS on 
any kernel that had CONFIG_PCI_P2P set. However, this did not seem like 
a good idea going forward. Instead, we now selectively disable the ACS 
bit only on the downstream ports that are involved in P2P transactions. 
This seems like the safest choice and still allows people to (carefully) 
use P2P adjacent to other devices that need to be isolated.

I don't think anyone wants another boot option that must be set in order 
to use this functionality (and only some hardware would require this). 
That's just a huge pain for users.

Logan
Logan Gunthorpe Jan. 5, 2018, 1:09 a.m. UTC | #4
On 04/01/18 05:00 PM, Logan Gunthorpe wrote:
> 
> 
> On 04/01/18 03:35 PM, Alex Williamson wrote:
>> Yep, flipping these ACS bits invalidates any IOMMU groups that depend
>> on the isolation of that downstream port and I suspect also any peers
>> within the same PCI slot of that port and their downstream devices.  The
>> entire sub-hierarchy grouping needs to be re-evaluated.  This
>> potentially affects running devices that depend on that isolation, so
>> I'm not sure how that happens dynamically.  A boot option might be
>> easier.  Thanks,
> 

Also, considering this further: if you can point me to a way to tell if
a device is in an IOMMU group it should be pretty easy to disallow a
device that has been assigned to a group from being used in P2P
transactions.

Logan
Alex Williamson Jan. 5, 2018, 3:33 a.m. UTC | #5
On Thu, 4 Jan 2018 17:00:47 -0700
Logan Gunthorpe <logang@deltatee.com> wrote:

> On 04/01/18 03:35 PM, Alex Williamson wrote:
> > Yep, flipping these ACS bits invalidates any IOMMU groups that depend
> > on the isolation of that downstream port and I suspect also any peers
> > within the same PCI slot of that port and their downstream devices.  The
> > entire sub-hierarchy grouping needs to be re-evaluated.  This
> > potentially affects running devices that depend on that isolation, so
> > I'm not sure how that happens dynamically.  A boot option might be
> > easier.  Thanks,  
> 
> I don't see how this is the case in current kernel code. It appears to 
> only enable ACS globally if the IOMMU requests it.

IOMMU groups don't exist unless the IOMMU is enabled and x86 and ARM
both request ACS be enabled if an IOMMU is present, so I'm not sure
what you're getting at here.  Also, in reply to your other email, if
the IOMMU is enabled, every device handled by the IOMMU is a member of
an IOMMU group, see struct device.iommu_group.  There's an
iommu_group_get() accessor to get a reference to it.
 
> I also don't see how turning off ACS isolation for a specific device is 
> going to hurt anything. The IOMMU should still be able to keep going on 
> unaware that anything has changed. The only worry is that a security 
> hole may now be created if a user was relying on the isolation between 
> two devices that are in different VMs or something. However, if a user 
> was relying on this, they probably shouldn't have turned on P2P in the 
> first place.

That's exactly what IOMMU groups represent, the smallest set of devices
which have DMA isolation from other devices.  By poking this hole, the
IOMMU group is invalid.  We cannot turn off ACS only for a specific
device, in order to enable p2p it needs to be disabled at every
downstream port between the devices where we want to enable p2p.
Depending on the topology, that could mean we're also enabling p2p for
unrelated devices.  Those unrelated devices might be in active use and
the p2p IOVAs now have a different destination which is no longer IOMMU
translated.
 
> We started with a fairly unintelligent choice to simply disable ACS on 
> any kernel that had CONFIG_PCI_P2P set. However, this did not seem like 
> a good idea going forward. Instead, we now selectively disable the ACS 
> bit only on the downstream ports that are involved in P2P transactions. 
> This seems like the safest choice and still allows people to (carefully) 
> use P2P adjacent to other devices that need to be isolated.

I don't see that the code is doing much checking that adjacent devices
are also affected by the p2p change and of course the IOMMU group is
entirely invalid once the p2p holes start getting poked.

> I don't think anyone wants another boot option that must be set in order 
> to use this functionality (and only some hardware would require this). 
> That's just a huge pain for users.

No, but nor do we need IOMMU groups that no longer represent what
they're intended to describe or runtime, unchecked routing changes
through the topology for devices that might already be using
conflicting IOVA ranges.  Maybe soft hotplugs are another possibility,
designate a sub-hierarchy to be removed and re-scanned with ACS
disabled.  Otherwise it seems like disabling and re-enabling ACS needs
to also handle merging and splitting groups dynamically.  Thanks,

Alex
Jerome Glisse Jan. 5, 2018, 6:47 a.m. UTC | #6
On Thu, Jan 04, 2018 at 08:33:00PM -0700, Alex Williamson wrote:
> On Thu, 4 Jan 2018 17:00:47 -0700
> Logan Gunthorpe <logang@deltatee.com> wrote:
> 
> > On 04/01/18 03:35 PM, Alex Williamson wrote:
> > > Yep, flipping these ACS bits invalidates any IOMMU groups that depend
> > > on the isolation of that downstream port and I suspect also any peers
> > > within the same PCI slot of that port and their downstream devices.  The
> > > entire sub-hierarchy grouping needs to be re-evaluated.  This
> > > potentially affects running devices that depend on that isolation, so
> > > I'm not sure how that happens dynamically.  A boot option might be
> > > easier.  Thanks,  
> > 
> > I don't see how this is the case in current kernel code. It appears to 
> > only enable ACS globally if the IOMMU requests it.
> 
> IOMMU groups don't exist unless the IOMMU is enabled and x86 and ARM
> both request ACS be enabled if an IOMMU is present, so I'm not sure
> what you're getting at here.  Also, in reply to your other email, if
> the IOMMU is enabled, every device handled by the IOMMU is a member of
> an IOMMU group, see struct device.iommu_group.  There's an
> iommu_group_get() accessor to get a reference to it.
>  
> > I also don't see how turning off ACS isolation for a specific device is 
> > going to hurt anything. The IOMMU should still be able to keep going on 
> > unaware that anything has changed. The only worry is that a security 
> > hole may now be created if a user was relying on the isolation between 
> > two devices that are in different VMs or something. However, if a user 
> > was relying on this, they probably shouldn't have turned on P2P in the 
> > first place.
> 
> That's exactly what IOMMU groups represent, the smallest set of devices
> which have DMA isolation from other devices.  By poking this hole, the
> IOMMU group is invalid.  We cannot turn off ACS only for a specific
> device, in order to enable p2p it needs to be disabled at every
> downstream port between the devices where we want to enable p2p.
> Depending on the topology, that could mean we're also enabling p2p for
> unrelated devices.  Those unrelated devices might be in active use and
> the p2p IOVAs now have a different destination which is no longer IOMMU
> translated.
>  
> > We started with a fairly unintelligent choice to simply disable ACS on 
> > any kernel that had CONFIG_PCI_P2P set. However, this did not seem like 
> > a good idea going forward. Instead, we now selectively disable the ACS 
> > bit only on the downstream ports that are involved in P2P transactions. 
> > This seems like the safest choice and still allows people to (carefully) 
> > use P2P adjacent to other devices that need to be isolated.
> 
> I don't see that the code is doing much checking that adjacent devices
> are also affected by the p2p change and of course the IOMMU group is
> entirely invalid once the p2p holes start getting poked.
> 
> > I don't think anyone wants another boot option that must be set in order 
> > to use this functionality (and only some hardware would require this). 
> > That's just a huge pain for users.
> 
> No, but nor do we need IOMMU groups that no longer represent what
> they're intended to describe or runtime, unchecked routing changes
> through the topology for devices that might already be using
> conflicting IOVA ranges.  Maybe soft hotplugs are another possibility,
> designate a sub-hierarchy to be removed and re-scanned with ACS
> disabled.  Otherwise it seems like disabling and re-enabling ACS needs
> to also handle merging and splitting groups dynamically.  Thanks,
> 

Dumb question, can we use a PCI bar address of one device into the
IOMMU page table of another address ie like we would DMA map a
regular system page ?

It would be much better in my view to follow down such path if that
is at all possible from hardware point of view (i am not sure where
to dig in the specification to answer my above question).

Cheers,
Jérôme
Alex Williamson Jan. 5, 2018, 3:41 p.m. UTC | #7
On Fri, 5 Jan 2018 01:47:01 -0500
Jerome Glisse <jglisse@redhat.com> wrote:

> On Thu, Jan 04, 2018 at 08:33:00PM -0700, Alex Williamson wrote:
> > On Thu, 4 Jan 2018 17:00:47 -0700
> > Logan Gunthorpe <logang@deltatee.com> wrote:
> >   
> > > On 04/01/18 03:35 PM, Alex Williamson wrote:  
> > > > Yep, flipping these ACS bits invalidates any IOMMU groups that depend
> > > > on the isolation of that downstream port and I suspect also any peers
> > > > within the same PCI slot of that port and their downstream devices.  The
> > > > entire sub-hierarchy grouping needs to be re-evaluated.  This
> > > > potentially affects running devices that depend on that isolation, so
> > > > I'm not sure how that happens dynamically.  A boot option might be
> > > > easier.  Thanks,    
> > > 
> > > I don't see how this is the case in current kernel code. It appears to 
> > > only enable ACS globally if the IOMMU requests it.  
> > 
> > IOMMU groups don't exist unless the IOMMU is enabled and x86 and ARM
> > both request ACS be enabled if an IOMMU is present, so I'm not sure
> > what you're getting at here.  Also, in reply to your other email, if
> > the IOMMU is enabled, every device handled by the IOMMU is a member of
> > an IOMMU group, see struct device.iommu_group.  There's an
> > iommu_group_get() accessor to get a reference to it.
> >    
> > > I also don't see how turning off ACS isolation for a specific device is 
> > > going to hurt anything. The IOMMU should still be able to keep going on 
> > > unaware that anything has changed. The only worry is that a security 
> > > hole may now be created if a user was relying on the isolation between 
> > > two devices that are in different VMs or something. However, if a user 
> > > was relying on this, they probably shouldn't have turned on P2P in the 
> > > first place.  
> > 
> > That's exactly what IOMMU groups represent, the smallest set of devices
> > which have DMA isolation from other devices.  By poking this hole, the
> > IOMMU group is invalid.  We cannot turn off ACS only for a specific
> > device, in order to enable p2p it needs to be disabled at every
> > downstream port between the devices where we want to enable p2p.
> > Depending on the topology, that could mean we're also enabling p2p for
> > unrelated devices.  Those unrelated devices might be in active use and
> > the p2p IOVAs now have a different destination which is no longer IOMMU
> > translated.
> >    
> > > We started with a fairly unintelligent choice to simply disable ACS on 
> > > any kernel that had CONFIG_PCI_P2P set. However, this did not seem like 
> > > a good idea going forward. Instead, we now selectively disable the ACS 
> > > bit only on the downstream ports that are involved in P2P transactions. 
> > > This seems like the safest choice and still allows people to (carefully) 
> > > use P2P adjacent to other devices that need to be isolated.  
> > 
> > I don't see that the code is doing much checking that adjacent devices
> > are also affected by the p2p change and of course the IOMMU group is
> > entirely invalid once the p2p holes start getting poked.
> >   
> > > I don't think anyone wants another boot option that must be set in order 
> > > to use this functionality (and only some hardware would require this). 
> > > That's just a huge pain for users.  
> > 
> > No, but nor do we need IOMMU groups that no longer represent what
> > they're intended to describe or runtime, unchecked routing changes
> > through the topology for devices that might already be using
> > conflicting IOVA ranges.  Maybe soft hotplugs are another possibility,
> > designate a sub-hierarchy to be removed and re-scanned with ACS
> > disabled.  Otherwise it seems like disabling and re-enabling ACS needs
> > to also handle merging and splitting groups dynamically.  Thanks,
> >   
> 
> Dumb question, can we use a PCI bar address of one device into the
> IOMMU page table of another address ie like we would DMA map a
> regular system page ?
> 
> It would be much better in my view to follow down such path if that
> is at all possible from hardware point of view (i am not sure where
> to dig in the specification to answer my above question).

Yes, we can bounce device MMIO through the IOMMU, or at least vfio does
enable these mappings to allow p2p between devices assigned to the same
VM and we've seen evidence that they work, but like p2p in general,
there are likely platform dependencies.  For Logan this doesn't really
solve the problem unless the device and downstream port support ATS and
ACS Direct Translated P2P is enable.  That would allow the device to do
p2p with addresses pre-translated by the IOMMU and cached by ATS on the
device without the transaction needing to traverse all the way to the
IOMMU on the root bus.  The security of ATS is pretty questionable in
general, but that would likely be a solution that wouldn't require
manipulating the groupings.  Thanks,

Alex
Logan Gunthorpe Jan. 5, 2018, 5:10 p.m. UTC | #8
On 04/01/18 08:33 PM, Alex Williamson wrote:
> That's exactly what IOMMU groups represent, the smallest set of devices
> which have DMA isolation from other devices.  By poking this hole, the
> IOMMU group is invalid.  We cannot turn off ACS only for a specific
> device, in order to enable p2p it needs to be disabled at every
> downstream port between the devices where we want to enable p2p.
> Depending on the topology, that could mean we're also enabling p2p for
> unrelated devices.  Those unrelated devices might be in active use and
> the p2p IOVAs now have a different destination which is no longer IOMMU
> translated.

Oh, so IOMMU groups are created based on the existing hierarchy at boot 
time and not based on the user's needs for isolation?

Logan
Alex Williamson Jan. 5, 2018, 5:18 p.m. UTC | #9
On Fri, 5 Jan 2018 10:10:51 -0700
Logan Gunthorpe <logang@deltatee.com> wrote:

> On 04/01/18 08:33 PM, Alex Williamson wrote:
> > That's exactly what IOMMU groups represent, the smallest set of devices
> > which have DMA isolation from other devices.  By poking this hole, the
> > IOMMU group is invalid.  We cannot turn off ACS only for a specific
> > device, in order to enable p2p it needs to be disabled at every
> > downstream port between the devices where we want to enable p2p.
> > Depending on the topology, that could mean we're also enabling p2p for
> > unrelated devices.  Those unrelated devices might be in active use and
> > the p2p IOVAs now have a different destination which is no longer IOMMU
> > translated.  
> 
> Oh, so IOMMU groups are created based on the existing hierarchy at boot 
> time and not based on the user's needs for isolation?

Yes, IOMMU groups expose the isolation of the system as devices are
discovered.  Nothing currently accounts for intentionally decreasing
the isolation between devices.  Thanks,

Alex
diff mbox series

Patch

diff --git a/drivers/pci/p2p.c b/drivers/pci/p2p.c
index 87cec87b02e3..617adaa905d2 100644
--- a/drivers/pci/p2p.c
+++ b/drivers/pci/p2p.c
@@ -227,6 +227,89 @@  static struct pci_dev *find_parent_pci_dev(struct device *dev)
 }
 
 /*
+ * The ACS flags for P2P Request Redirect and P2P Completion Redirect need
+ * to be disabled in the downstream port of each device in order for
+ * the TLPs to not be forwarded up to the RC.
+ */
+#define PCI_P2P_ACS_FLAGS (PCI_ACS_RR | PCI_ACS_CR)
+
+static int pci_p2pmem_disable_acs(struct pci_dev *pdev)
+{
+	int pos;
+	u16 ctrl;
+	struct pci_dev *downstream;
+
+	downstream = pci_dev_get(pci_upstream_bridge(pdev));
+	if (!downstream) {
+		dev_err(&pdev->dev, "could not find downstream port\n");
+		return -ENODEV;
+	}
+
+	device_lock(&downstream->dev);
+	if (downstream->p2p_acs_requests++)
+		goto unlock_and_return;
+
+	pos = pci_find_ext_capability(downstream, PCI_EXT_CAP_ID_ACS);
+	if (!pos)
+		goto unlock_and_return;
+
+	pci_read_config_word(downstream, pos + PCI_ACS_CTRL, &ctrl);
+
+	downstream->p2p_old_acs_flags = ctrl & PCI_P2P_ACS_FLAGS;
+
+	if (downstream->p2p_old_acs_flags)
+		dev_info(&pdev->dev, "disabling p2p acs flags: %x\n", ctrl);
+
+	ctrl &= ~PCI_P2P_ACS_FLAGS;
+
+	pci_write_config_word(downstream, pos + PCI_ACS_CTRL, ctrl);
+
+unlock_and_return:
+	device_unlock(&downstream->dev);
+	pci_dev_put(downstream);
+	return 0;
+}
+
+static int pci_p2pmem_reset_acs(struct pci_dev *pdev)
+{
+	int pos;
+	u16 ctrl;
+	struct pci_dev *downstream;
+
+	downstream = pci_dev_get(pci_upstream_bridge(pdev));
+	if (!downstream)
+		return -ENODEV;
+
+	device_lock(&downstream->dev);
+
+	/* Only actually reset the flags on a 1->0 transition */
+	if (!downstream->p2p_acs_requests)
+		goto unlock_and_return;
+
+	if (--downstream->p2p_acs_requests)
+		goto unlock_and_return;
+
+	pos = pci_find_ext_capability(downstream, PCI_EXT_CAP_ID_ACS);
+	if (!pos)
+		goto unlock_and_return;
+
+	pci_read_config_word(downstream, pos + PCI_ACS_CTRL, &ctrl);
+
+	ctrl &= ~PCI_P2P_ACS_FLAGS;
+	ctrl |= downstream->p2p_old_acs_flags;
+
+	if (downstream->p2p_old_acs_flags)
+		dev_info(&pdev->dev, "resetting p2p acs flags: %x\n", ctrl);
+
+	pci_write_config_word(downstream, pos + PCI_ACS_CTRL, ctrl);
+
+unlock_and_return:
+	device_unlock(&downstream->dev);
+	pci_dev_put(downstream);
+	return 0;
+}
+
+/*
  * If a device is behind a switch, we try to find the upstream bridge
  * port of the switch. This requires two calls to pci_upstream_bridge:
  * one for the upstream port on the switch, one on the upstream port
@@ -340,6 +423,10 @@  int pci_p2pmem_add_client(struct list_head *head, struct device *dev)
 			ret = -EXDEV;
 			goto put_client;
 		}
+
+		ret = pci_p2pmem_disable_acs(client);
+		if (!ret)
+			goto put_client;
 	}
 
 	new_item = kzalloc(sizeof(*new_item), GFP_KERNEL);
@@ -363,6 +450,9 @@  EXPORT_SYMBOL_GPL(pci_p2pmem_add_client);
 
 static void pci_p2pmem_client_free(struct pci_p2pmem_client *item)
 {
+	if (item->p2pmem)
+		pci_p2pmem_reset_acs(item->client);
+
 	list_del(&item->list);
 	pci_dev_put(item->client);
 	pci_dev_put(item->p2pmem);
@@ -383,6 +473,7 @@  void pci_p2pmem_remove_client(struct list_head *head, struct device *dev)
 {
 	struct pci_p2pmem_client *pos, *tmp;
 	struct pci_dev *pdev;
+	struct pci_dev *p2pmem = NULL;
 
 	pdev = find_parent_pci_dev(dev);
 	if (!pdev)
@@ -392,9 +483,16 @@  void pci_p2pmem_remove_client(struct list_head *head, struct device *dev)
 		if (pos->client != pdev)
 			continue;
 
+		if (!p2pmem)
+			p2pmem = pci_dev_get(pos->p2pmem);
+
 		pci_p2pmem_client_free(pos);
 	}
 
+	if (p2pmem && list_empty(head))
+		pci_p2pmem_reset_acs(p2pmem);
+
+	pci_dev_put(p2pmem);
 	pci_dev_put(pdev);
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_remove_client);
@@ -412,6 +510,10 @@  void pci_p2pmem_client_list_free(struct list_head *head)
 {
 	struct pci_p2pmem_client *pos, *tmp;
 
+	pos = list_first_entry_or_null(head, struct pci_p2pmem_client, list);
+	if (pos && pos->p2pmem)
+		pci_p2pmem_reset_acs(pos->p2pmem);
+
 	list_for_each_entry_safe(pos, tmp, head, list)
 		pci_p2pmem_client_free(pos);
 }
@@ -440,6 +542,40 @@  static bool upstream_bridges_match_list(struct pci_dev *pdev,
 	return ret;
 }
 
+static int bind_clients(struct pci_dev *p2pmem, struct list_head *clients)
+{
+	int ret;
+	struct pci_p2pmem_client *pos, *unwind_pos;
+
+	ret = pci_p2pmem_disable_acs(p2pmem);
+	if (ret)
+		return ret;
+
+	list_for_each_entry(pos, clients, list) {
+		ret = pci_p2pmem_disable_acs(pos->client);
+		if (ret)
+			goto unwind;
+
+		pos->p2pmem = pci_dev_get(p2pmem);
+	}
+
+	return 0;
+
+unwind:
+	list_for_each_entry(unwind_pos, clients, list) {
+		if (pos == unwind_pos)
+			break;
+
+		pci_p2pmem_reset_acs(unwind_pos->client);
+		pci_dev_put(unwind_pos->p2pmem);
+		unwind_pos->p2pmem = NULL;
+	}
+
+	pci_p2pmem_reset_acs(p2pmem);
+
+	return ret;
+}
+
 /**
  * pci_p2pmem_find - find a p2p mem device compatible with the specified device
  * @dev: list of device to check (NULL-terminated)
@@ -450,11 +586,13 @@  static bool upstream_bridges_match_list(struct pci_dev *pdev,
  *
  * Returns a pointer to the PCI device with a reference taken (use pci_dev_put
  * to return the reference) or NULL if no compatible device is found.
+ *
+ * The P2P ACS flags on all applicable PCI devices will be cleared and
+ * reset when the client is removed from the list.
  */
 struct pci_dev *pci_p2pmem_find(struct list_head *clients)
 {
 	struct pci_dev *pdev = NULL;
-	struct pci_p2pmem_client *pos;
 
 	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) {
 		if (!pdev->p2p || !pdev->p2p->published)
@@ -463,8 +601,8 @@  struct pci_dev *pci_p2pmem_find(struct list_head *clients)
 		if (!upstream_bridges_match_list(pdev, clients))
 			continue;
 
-		list_for_each_entry(pos, clients, list)
-			pos->p2pmem = pdev;
+		if (bind_clients(pdev, clients))
+			continue;
 
 		return pdev;
 	}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 047aea679e87..cdd4d3c3e562 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -435,6 +435,8 @@  struct pci_dev {
 #endif
 #ifdef CONFIG_PCI_P2P
 	struct pci_p2p *p2p;
+	unsigned int p2p_acs_requests;
+	u16 p2p_old_acs_flags;
 #endif
 	phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */
 	size_t romlen; /* Length of ROM if it's not from the BAR */