diff mbox

[v12,10/21] PCI: Consider additional PF's IOV BAR alignment in sizing and assigning

Message ID 20150224083406.32124.65957.stgit@bhelgaas-glaptop2.roam.corp.google.com
State Superseded
Headers show

Commit Message

Bjorn Helgaas Feb. 24, 2015, 8:34 a.m. UTC
From: Wei Yang <weiyang@linux.vnet.ibm.com>

When sizing and assigning resources, we divide the resources into two
lists: the requested list and the additional list.  We don't consider the
alignment of additional VF(n) BAR space.

This is reasonable because the alignment required for the VF(n) BAR space
is the size of an individual VF BAR, not the size of the space for *all*
VFs.  But some platforms, e.g., PowerNV, require additional alignment.

Consider the additional IOV BAR alignment when sizing and assigning
resources.  When there is not enough system MMIO space, the PF's IOV BAR
alignment will not contribute to the bridge.  When there is enough system
MMIO space, the additional alignment will contribute to the bridge.

Also, take advantage of pci_dev_resource::min_align to store this
additional alignment.

[bhelgaas: changelog, printk cast]
Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/setup-bus.c |   83 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 70 insertions(+), 13 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Bjorn Helgaas Feb. 24, 2015, 8:41 a.m. UTC | #1
On Tue, Feb 24, 2015 at 02:34:06AM -0600, Bjorn Helgaas wrote:
> From: Wei Yang <weiyang@linux.vnet.ibm.com>
> 
> When sizing and assigning resources, we divide the resources into two
> lists: the requested list and the additional list.  We don't consider the
> alignment of additional VF(n) BAR space.
> 
> This is reasonable because the alignment required for the VF(n) BAR space
> is the size of an individual VF BAR, not the size of the space for *all*
> VFs.  But some platforms, e.g., PowerNV, require additional alignment.
> 
> Consider the additional IOV BAR alignment when sizing and assigning
> resources.  When there is not enough system MMIO space, the PF's IOV BAR
> alignment will not contribute to the bridge.  When there is enough system
> MMIO space, the additional alignment will contribute to the bridge.

I don't understand the "when there is not enough system MMIO space" part.
How do we tell if there's enough MMIO space?

> Also, take advantage of pci_dev_resource::min_align to store this
> additional alignment.

This comment doesn't seem to make sense; this patch doesn't save anything
in min_align.

Another question below...

> [bhelgaas: changelog, printk cast]
> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> ---
>  drivers/pci/setup-bus.c |   83 ++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 70 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> index e3e17f3c0f0f..affbceae560f 100644
> --- a/drivers/pci/setup-bus.c
> +++ b/drivers/pci/setup-bus.c
> @@ -99,8 +99,8 @@ static void remove_from_list(struct list_head *head,
>  	}
>  }
>  
> -static resource_size_t get_res_add_size(struct list_head *head,
> -					struct resource *res)
> +static struct pci_dev_resource *res_to_dev_res(struct list_head *head,
> +					       struct resource *res)
>  {
>  	struct pci_dev_resource *dev_res;
>  
> @@ -109,17 +109,37 @@ static resource_size_t get_res_add_size(struct list_head *head,
>  			int idx = res - &dev_res->dev->resource[0];
>  
>  			dev_printk(KERN_DEBUG, &dev_res->dev->dev,
> -				 "res[%d]=%pR get_res_add_size add_size %llx\n",
> +				 "res[%d]=%pR res_to_dev_res add_size %llx min_align %llx\n",
>  				 idx, dev_res->res,
> -				 (unsigned long long)dev_res->add_size);
> +				 (unsigned long long)dev_res->add_size,
> +				 (unsigned long long)dev_res->min_align);
>  
> -			return dev_res->add_size;
> +			return dev_res;
>  		}
>  	}
>  
> -	return 0;
> +	return NULL;
> +}
> +
> +static resource_size_t get_res_add_size(struct list_head *head,
> +					struct resource *res)
> +{
> +	struct pci_dev_resource *dev_res;
> +
> +	dev_res = res_to_dev_res(head, res);
> +	return dev_res ? dev_res->add_size : 0;
> +}
> +
> +static resource_size_t get_res_add_align(struct list_head *head,
> +					 struct resource *res)
> +{
> +	struct pci_dev_resource *dev_res;
> +
> +	dev_res = res_to_dev_res(head, res);
> +	return dev_res ? dev_res->min_align : 0;
>  }
>  
> +
>  /* Sort resources by alignment */
>  static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
>  {
> @@ -368,8 +388,9 @@ static void __assign_resources_sorted(struct list_head *head,
>  	LIST_HEAD(save_head);
>  	LIST_HEAD(local_fail_head);
>  	struct pci_dev_resource *save_res;
> -	struct pci_dev_resource *dev_res, *tmp_res;
> +	struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
>  	unsigned long fail_type;
> +	resource_size_t add_align, align;
>  
>  	/* Check if optional add_size is there */
>  	if (!realloc_head || list_empty(realloc_head))
> @@ -384,10 +405,38 @@ static void __assign_resources_sorted(struct list_head *head,
>  	}
>  
>  	/* Update res in head list with add_size in realloc_head list */
> -	list_for_each_entry(dev_res, head, list)
> +	list_for_each_entry_safe(dev_res, tmp_res, head, list) {
>  		dev_res->res->end += get_res_add_size(realloc_head,
>  							dev_res->res);
>  
> +		/*
> +		 * There are two kinds of additional resources in the list:
> +		 * 1. bridge resource  -- IORESOURCE_STARTALIGN
> +		 * 2. SR-IOV resource   -- IORESOURCE_SIZEALIGN
> +		 * Here just fix the additional alignment for bridge
> +		 */
> +		if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
> +			continue;
> +
> +		add_align = get_res_add_align(realloc_head, dev_res->res);
> +
> +		/* Reorder the list by their alignment */

Why do we need to reorder the list by alignment?

> +		if (add_align > dev_res->res->start) {
> +			dev_res->res->start = add_align;
> +			dev_res->res->end = add_align +
> +				            resource_size(dev_res->res);
> +
> +			list_for_each_entry(dev_res2, head, list) {
> +				align = pci_resource_alignment(dev_res2->dev,
> +							       dev_res2->res);
> +				if (add_align > align)
> +					list_move_tail(&dev_res->list,
> +						       &dev_res2->list);
> +			}
> +               }
> +
> +	}
> +
>  	/* Try updated head list with add_size added */
>  	assign_requested_resources_sorted(head, &local_fail_head);
>  
> @@ -962,6 +1011,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  	struct resource *b_res = find_free_bus_resource(bus,
>  					mask | IORESOURCE_PREFETCH, type);
>  	resource_size_t children_add_size = 0;
> +	resource_size_t children_add_align = 0;
> +	resource_size_t add_align = 0;
>  
>  	if (!b_res)
>  		return -ENOSPC;
> @@ -986,6 +1037,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  			/* put SRIOV requested res to the optional list */
>  			if (realloc_head && i >= PCI_IOV_RESOURCES &&
>  					i <= PCI_IOV_RESOURCE_END) {
> +				add_align = max(pci_resource_alignment(dev, r), add_align);
>  				r->end = r->start - 1;
>  				add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
>  				children_add_size += r_size;
> @@ -1016,19 +1068,23 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  			if (order > max_order)
>  				max_order = order;
>  
> -			if (realloc_head)
> +			if (realloc_head) {
>  				children_add_size += get_res_add_size(realloc_head, r);
> +				children_add_align = get_res_add_align(realloc_head, r);
> +				add_align = max(add_align, children_add_align);
> +			}
>  		}
>  	}
>  
>  	min_align = calculate_mem_align(aligns, max_order);
>  	min_align = max(min_align, window_alignment(bus, b_res->flags));
>  	size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
> +	add_align = max(min_align, add_align);
>  	if (children_add_size > add_size)
>  		add_size = children_add_size;
>  	size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
>  		calculate_memsize(size, min_size, add_size,
> -				resource_size(b_res), min_align);
> +				resource_size(b_res), add_align);
>  	if (!size0 && !size1) {
>  		if (b_res->start || b_res->end)
>  			dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
> @@ -1040,10 +1096,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>  	b_res->end = size0 + min_align - 1;
>  	b_res->flags |= IORESOURCE_STARTALIGN;
>  	if (size1 > size0 && realloc_head) {
> -		add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
> -		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
> +		add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align);
> +		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx add_align %llx\n",
>  			   b_res, &bus->busn_res,
> -			   (unsigned long long)size1-size0);
> +			   (unsigned long long) (size1 - size0),
> +			   (unsigned long long) add_align);
>  	}
>  	return 0;
>  }
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Yang March 2, 2015, 7:32 a.m. UTC | #2
On Tue, Feb 24, 2015 at 02:41:52AM -0600, Bjorn Helgaas wrote:
>On Tue, Feb 24, 2015 at 02:34:06AM -0600, Bjorn Helgaas wrote:
>> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>> 
>> When sizing and assigning resources, we divide the resources into two
>> lists: the requested list and the additional list.  We don't consider the
>> alignment of additional VF(n) BAR space.
>> 
>> This is reasonable because the alignment required for the VF(n) BAR space
>> is the size of an individual VF BAR, not the size of the space for *all*
>> VFs.  But some platforms, e.g., PowerNV, require additional alignment.
>> 
>> Consider the additional IOV BAR alignment when sizing and assigning
>> resources.  When there is not enough system MMIO space, the PF's IOV BAR
>> alignment will not contribute to the bridge.  When there is enough system
>> MMIO space, the additional alignment will contribute to the bridge.
>
>I don't understand the ""when there is not enough system MMIO space" part.
>How do we tell if there's enough MMIO space?
>

__assign_resources_sorted() takes two resource lists, one for requested
resources (head) and one for additional resources (realloc_head). The
function first tries to combine them and assign them together. If that fails,
it means we don't have enough MMIO space.

>> Also, take advantage of pci_dev_resource::min_align to store this
>> additional alignment.
>
>This comment doesn't seem to make sense; this patch doesn't save anything
>in min_align.
>

At the end of this patch:

   add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align);

add_align is stored in pci_dev_resource::min_align by add_to_list() and
retrieved by get_res_add_align() in the code below. This field was not used
previously, so I took advantage of it to store the alignment of the
additional resources.

>Another question below...
>
>> [bhelgaas: changelog, printk cast]
>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>> ---
>>  drivers/pci/setup-bus.c |   83 ++++++++++++++++++++++++++++++++++++++++-------
>>  1 file changed, 70 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
>> index e3e17f3c0f0f..affbceae560f 100644
>> --- a/drivers/pci/setup-bus.c
>> +++ b/drivers/pci/setup-bus.c
>> @@ -99,8 +99,8 @@ static void remove_from_list(struct list_head *head,
>>  	}
>>  }
>>  
>> -static resource_size_t get_res_add_size(struct list_head *head,
>> -					struct resource *res)
>> +static struct pci_dev_resource *res_to_dev_res(struct list_head *head,
>> +					       struct resource *res)
>>  {
>>  	struct pci_dev_resource *dev_res;
>>  
>> @@ -109,17 +109,37 @@ static resource_size_t get_res_add_size(struct list_head *head,
>>  			int idx = res - &dev_res->dev->resource[0];
>>  
>>  			dev_printk(KERN_DEBUG, &dev_res->dev->dev,
>> -				 "res[%d]=%pR get_res_add_size add_size %llx\n",
>> +				 "res[%d]=%pR res_to_dev_res add_size %llx min_align %llx\n",
>>  				 idx, dev_res->res,
>> -				 (unsigned long long)dev_res->add_size);
>> +				 (unsigned long long)dev_res->add_size,
>> +				 (unsigned long long)dev_res->min_align);
>>  
>> -			return dev_res->add_size;
>> +			return dev_res;
>>  		}
>>  	}
>>  
>> -	return 0;
>> +	return NULL;
>> +}
>> +
>> +static resource_size_t get_res_add_size(struct list_head *head,
>> +					struct resource *res)
>> +{
>> +	struct pci_dev_resource *dev_res;
>> +
>> +	dev_res = res_to_dev_res(head, res);
>> +	return dev_res ? dev_res->add_size : 0;
>> +}
>> +
>> +static resource_size_t get_res_add_align(struct list_head *head,
>> +					 struct resource *res)
>> +{
>> +	struct pci_dev_resource *dev_res;
>> +
>> +	dev_res = res_to_dev_res(head, res);
>> +	return dev_res ? dev_res->min_align : 0;
>>  }
>>  
>> +
>>  /* Sort resources by alignment */
>>  static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
>>  {
>> @@ -368,8 +388,9 @@ static void __assign_resources_sorted(struct list_head *head,
>>  	LIST_HEAD(save_head);
>>  	LIST_HEAD(local_fail_head);
>>  	struct pci_dev_resource *save_res;
>> -	struct pci_dev_resource *dev_res, *tmp_res;
>> +	struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
>>  	unsigned long fail_type;
>> +	resource_size_t add_align, align;
>>  
>>  	/* Check if optional add_size is there */
>>  	if (!realloc_head || list_empty(realloc_head))
>> @@ -384,10 +405,38 @@ static void __assign_resources_sorted(struct list_head *head,
>>  	}
>>  
>>  	/* Update res in head list with add_size in realloc_head list */
>> -	list_for_each_entry(dev_res, head, list)
>> +	list_for_each_entry_safe(dev_res, tmp_res, head, list) {
>>  		dev_res->res->end += get_res_add_size(realloc_head,
>>  							dev_res->res);
>>  
>> +		/*
>> +		 * There are two kinds of additional resources in the list:
>> +		 * 1. bridge resource  -- IORESOURCE_STARTALIGN
>> +		 * 2. SR-IOV resource   -- IORESOURCE_SIZEALIGN
>> +		 * Here just fix the additional alignment for bridge
>> +		 */
>> +		if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
>> +			continue;
>> +
>> +		add_align = get_res_add_align(realloc_head, dev_res->res);
>> +
>> +		/* Reorder the list by their alignment */
>
>Why do we need to reorder the list by alignment?

The resource list "head" is sorted by alignment, but the alignment changes
once we take the additional resources into account.

Take the powernv platform as an example. The IOV BAR is expanded and needs to
be aligned to its total size instead of the individual VF BAR size. If we
don't reorder the list, the IOV BAR would be assigned after some other
resources, which may cause the real assignment to fail even if the total size
is enough.

>
>> +		if (add_align > dev_res->res->start) {
>> +			dev_res->res->start = add_align;
>> +			dev_res->res->end = add_align +
>> +				            resource_size(dev_res->res);
>> +
>> +			list_for_each_entry(dev_res2, head, list) {
>> +				align = pci_resource_alignment(dev_res2->dev,
>> +							       dev_res2->res);
>> +				if (add_align > align)
>> +					list_move_tail(&dev_res->list,
>> +						       &dev_res2->list);
>> +			}
>> +               }
>> +
>> +	}
>> +
>>  	/* Try updated head list with add_size added */
>>  	assign_requested_resources_sorted(head, &local_fail_head);
>>  
>> @@ -962,6 +1011,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>>  	struct resource *b_res = find_free_bus_resource(bus,
>>  					mask | IORESOURCE_PREFETCH, type);
>>  	resource_size_t children_add_size = 0;
>> +	resource_size_t children_add_align = 0;
>> +	resource_size_t add_align = 0;
>>  
>>  	if (!b_res)
>>  		return -ENOSPC;
>> @@ -986,6 +1037,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>>  			/* put SRIOV requested res to the optional list */
>>  			if (realloc_head && i >= PCI_IOV_RESOURCES &&
>>  					i <= PCI_IOV_RESOURCE_END) {
>> +				add_align = max(pci_resource_alignment(dev, r), add_align);
>>  				r->end = r->start - 1;
>>  				add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
>>  				children_add_size += r_size;
>> @@ -1016,19 +1068,23 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>>  			if (order > max_order)
>>  				max_order = order;
>>  
>> -			if (realloc_head)
>> +			if (realloc_head) {
>>  				children_add_size += get_res_add_size(realloc_head, r);
>> +				children_add_align = get_res_add_align(realloc_head, r);
>> +				add_align = max(add_align, children_add_align);
>> +			}
>>  		}
>>  	}
>>  
>>  	min_align = calculate_mem_align(aligns, max_order);
>>  	min_align = max(min_align, window_alignment(bus, b_res->flags));
>>  	size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
>> +	add_align = max(min_align, add_align);
>>  	if (children_add_size > add_size)
>>  		add_size = children_add_size;
>>  	size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
>>  		calculate_memsize(size, min_size, add_size,
>> -				resource_size(b_res), min_align);
>> +				resource_size(b_res), add_align);
>>  	if (!size0 && !size1) {
>>  		if (b_res->start || b_res->end)
>>  			dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
>> @@ -1040,10 +1096,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
>>  	b_res->end = size0 + min_align - 1;
>>  	b_res->flags |= IORESOURCE_STARTALIGN;
>>  	if (size1 > size0 && realloc_head) {
>> -		add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
>> -		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
>> +		add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align);
>> +		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx add_align %llx\n",
>>  			   b_res, &bus->busn_res,
>> -			   (unsigned long long)size1-size0);
>> +			   (unsigned long long) (size1 - size0),
>> +			   (unsigned long long) add_align);
>>  	}
>>  	return 0;
>>  }
>> 
>--
>To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas March 11, 2015, 2:36 a.m. UTC | #3
On Mon, Mar 02, 2015 at 03:32:47PM +0800, Wei Yang wrote:
> On Tue, Feb 24, 2015 at 02:41:52AM -0600, Bjorn Helgaas wrote:
> >On Tue, Feb 24, 2015 at 02:34:06AM -0600, Bjorn Helgaas wrote:
> >> From: Wei Yang <weiyang@linux.vnet.ibm.com>
> >> 
> >> When sizing and assigning resources, we divide the resources into two
> >> lists: the requested list and the additional list.  We don't consider the
> >> alignment of additional VF(n) BAR space.
> >> 
> >> This is reasonable because the alignment required for the VF(n) BAR space
> >> is the size of an individual VF BAR, not the size of the space for *all*
> >> VFs.  But some platforms, e.g., PowerNV, require additional alignment.
> >> 
> >> Consider the additional IOV BAR alignment when sizing and assigning
> >> resources.  When there is not enough system MMIO space, the PF's IOV BAR
> >> alignment will not contribute to the bridge.  When there is enough system
> >> MMIO space, the additional alignment will contribute to the bridge.
> >
> >I don't understand the ""when there is not enough system MMIO space" part.
> >How do we tell if there's enough MMIO space?
> >
> 
> In __assign_resources_sorted(), it has two resources list, one for requested
> (head) and one for additional (realloc_head). This function will first try to
> combine them and assign. If failed, this means we don't have enough MMIO
> space.

How about this text:

  This is because the alignment required for the VF(n) BAR space is the size
  of an individual VF BAR, not the size of the space for *all* VFs.  But we
  want additional alignment to support partitioning on PowerNV.

  Consider the additional IOV BAR alignment when sizing and assigning
  resources.  When there is not enough system MMIO space to accommodate both
  the requested list and the additional list, the PF's IOV BAR alignment will
  not contribute to the bridge.  When there is enough system MMIO space for
  both lists, the additional alignment will contribute to the bridge.

We're doing something specifically for PowerNV.  I would really like to be
able to read this patch and say "Oh, here's the hook where we get the
PowerNV behavior, and it's obvious that other platforms are unaffected."
But I don't see a pcibios or similar hook, so I don't know where that
PowerNV behavior is.

Is it something to do with get_res_add_align()?  That uses min_align, but I
don't know how that's connected ...  ah, I see, "add_align" is computed
from pci_resource_alignment(), which has this path:

  pci_resource_alignment
    pci_sriov_resource_alignment
      pcibios_iov_resource_alignment

and powerpc has a special pcibios_iov_resource_alignment() for PowerNV.

> >> Also, take advantage of pci_dev_resource::min_align to store this
> >> additional alignment.
> >
> >This comment doesn't seem to make sense; this patch doesn't save anything
> >in min_align.
> 
> At the end of this patch:
> 
>    add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align);
> 
> The add_align is stored in pci_dev_resource::min_align in add_to_list(). And
> retrieved by get_res_add_align() in below code. This field is not used
> previously, so I took advantage of this field to store the alignment of the
> additional resources.

Hmm.  pci_dev_resource::min_align *is* already used in
reassign_resources_sorted().  Maybe there's no overlap; I gave up the
analysis before I could convince myself.

The changelog needs to mention the add_to_list() connection.

> >> +		/*
> >> +		 * There are two kinds of additional resources in the list:
> >> +		 * 1. bridge resource  -- IORESOURCE_STARTALIGN
> >> +		 * 2. SR-IOV resource   -- IORESOURCE_SIZEALIGN
> >> +		 * Here just fix the additional alignment for bridge
> >> +		 */
> >> +		if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
> >> +			continue;
> >> +
> >> +		add_align = get_res_add_align(realloc_head, dev_res->res);
> >> +
> >> +		/* Reorder the list by their alignment */
> >
> >Why do we need to reorder the list by alignment?
> 
> Resource list "head" is sorted by the alignment, while the alignment would be
> changed after we considering the additional resource.
> 
> Take powernv platform as an example. The IOV BAR is expanded and need to be
> aligned with its total size instead of the individual VF BAR size. If we don't
> reorder it, the IOV BAR would be assigned after some other resources, which
> may cause the real assignment fail even the total size is enough.

This is worthy of a comment in the code.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Yang March 11, 2015, 9:17 a.m. UTC | #4
On Tue, Mar 10, 2015 at 09:36:58PM -0500, Bjorn Helgaas wrote:
>On Mon, Mar 02, 2015 at 03:32:47PM +0800, Wei Yang wrote:
>> On Tue, Feb 24, 2015 at 02:41:52AM -0600, Bjorn Helgaas wrote:
>> >On Tue, Feb 24, 2015 at 02:34:06AM -0600, Bjorn Helgaas wrote:
>> >> From: Wei Yang <weiyang@linux.vnet.ibm.com>
>> >> 
>> >> When sizing and assigning resources, we divide the resources into two
>> >> lists: the requested list and the additional list.  We don't consider the
>> >> alignment of additional VF(n) BAR space.
>> >> 
>> >> This is reasonable because the alignment required for the VF(n) BAR space
>> >> is the size of an individual VF BAR, not the size of the space for *all*
>> >> VFs.  But some platforms, e.g., PowerNV, require additional alignment.
>> >> 
>> >> Consider the additional IOV BAR alignment when sizing and assigning
>> >> resources.  When there is not enough system MMIO space, the PF's IOV BAR
>> >> alignment will not contribute to the bridge.  When there is enough system
>> >> MMIO space, the additional alignment will contribute to the bridge.
>> >
>> >I don't understand the ""when there is not enough system MMIO space" part.
>> >How do we tell if there's enough MMIO space?
>> >
>> 
>> In __assign_resources_sorted(), it has two resources list, one for requested
>> (head) and one for additional (realloc_head). This function will first try to
>> combine them and assign. If failed, this means we don't have enough MMIO
>> space.
>
>How about this text:
>
>  This is because the alignment required for the VF(n) BAR space is the size
>  of an individual VF BAR, not the size of the space for *all* VFs.  But we
>  want additional alignment to support partitioning on PowerNV.
>
>  Consider the additional IOV BAR alignment when sizing and assigning
>  resources.  When there is not enough system MMIO space to accomodate both
>  the requested list and the additional list, the PF's IOV BAR alignment will
>  not contribute to the bridge.  When there is enough system MMIO space for
>  both lists, the additional alignment will contribute to the bridge.
>
>We're doing something specifically for PowerNV.  I would really like to be
>able to read this patch and say "Oh, here's the hook where we get the
>PowerNV behavior, and it's obvious that other platforms are unaffected."
>But I don't see a pcibios or similar hook, so I don't know where that
>PowerNV behavior is.
>
>Is it something to do with get_res_add_align()?  That uses min_align, but I
>don't know how that's connected ...  ah, I see, "add_align" is computed
>from pci_resource_alignment(), which has this path:
>
>  pci_resource_alignment
>    pci_sriov_resource_alignment
>      pcibios_iov_resource_alignment
>
>and powerpc has a special pcibios_iov_resource_alignment() for PowerNV.
>

Thanks for the text. I have added it to the change log, along with a
description of how this gives the arch a chance to be involved.

>> >> Also, take advantage of pci_dev_resource::min_align to store this
>> >> additional alignment.
>> >
>> >This comment doesn't seem to make sense; this patch doesn't save anything
>> >in min_align.
>> 
>> At the end of this patch:
>> 
>>    add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align);
>> 
>> The add_align is stored in pci_dev_resource::min_align in add_to_list(). And
>> retrieved by get_res_add_align() in below code. This field is not used
>> previously, so I took advantage of this field to store the alignment of the
>> additional resources.
>
>Hmm.  pci_dev_resource::min_align *is* already used in
>reassign_resources_sorted().  Maybe there's no overlap; I gave up the
>analysis before I could convince myself.
>

Bjorn,

I know you may have some concerns about this, so let me try to explain how I
understand the code. If my understanding is not correct, please let me know.

In __assign_resources_sorted(), we pass two resource lists: one is the
required list and the other is the additional list. First, we try our best to
assign both of them by merging them together. If this fails, we assign the
required list first and then take care of the additional list.

There is one interesting thing in the first step. We merge these two lists
into the required list, and in this patch I fix up the alignment in the
required list (which is the "head" list in the code). Before doing so, we save
the original information in "save_head". When we fail to assign the merged
list, we restore the required list; this means we discard the alignment change
made in this patch and make sure we assign the required resources with just
the basic alignment.

The use of min_align in reassign_resources_sorted() happens in the second
part, which assigns the additional list individually. In the realloc_head
list, those resources still have the add_align that was calculated in
pbus_size_mem(), and we try to allocate them with this alignment, which is
exactly what we want.

BTW, on reading the code again, it looks like I missed changing one place in
reassign_resources_sorted(). When (!resource_size(res)) is true, we rely on
res->start to be the alignment. Since the alignment is no longer the start
address, we need to fix this part too.

I may be missing some background on the code; if my understanding is not
correct, I would be glad to hear from you.

>The changelog needs to mention the add_to_list() connection.
>

Added in the change log.

>> >> +		/*
>> >> +		 * There are two kinds of additional resources in the list:
>> >> +		 * 1. bridge resource  -- IORESOURCE_STARTALIGN
>> >> +		 * 2. SR-IOV resource   -- IORESOURCE_SIZEALIGN
>> >> +		 * Here just fix the additional alignment for bridge
>> >> +		 */
>> >> +		if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
>> >> +			continue;
>> >> +
>> >> +		add_align = get_res_add_align(realloc_head, dev_res->res);
>> >> +
>> >> +		/* Reorder the list by their alignment */
>> >
>> >Why do we need to reorder the list by alignment?
>> 
>> Resource list "head" is sorted by the alignment, while the alignment would be
>> changed after we considering the additional resource.
>> 
>> Take powernv platform as an example. The IOV BAR is expanded and need to be
>> aligned with its total size instead of the individual VF BAR size. If we don't
>> reorder it, the IOV BAR would be assigned after some other resources, which
>> may cause the real assignment fail even the total size is enough.
>
>This is worthy of a comment in the code.
>
>Bjorn
>--
>To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index e3e17f3c0f0f..affbceae560f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -99,8 +99,8 @@  static void remove_from_list(struct list_head *head,
 	}
 }
 
-static resource_size_t get_res_add_size(struct list_head *head,
-					struct resource *res)
+static struct pci_dev_resource *res_to_dev_res(struct list_head *head,
+					       struct resource *res)
 {
 	struct pci_dev_resource *dev_res;
 
@@ -109,17 +109,37 @@  static resource_size_t get_res_add_size(struct list_head *head,
 			int idx = res - &dev_res->dev->resource[0];
 
 			dev_printk(KERN_DEBUG, &dev_res->dev->dev,
-				 "res[%d]=%pR get_res_add_size add_size %llx\n",
+				 "res[%d]=%pR res_to_dev_res add_size %llx min_align %llx\n",
 				 idx, dev_res->res,
-				 (unsigned long long)dev_res->add_size);
+				 (unsigned long long)dev_res->add_size,
+				 (unsigned long long)dev_res->min_align);
 
-			return dev_res->add_size;
+			return dev_res;
 		}
 	}
 
-	return 0;
+	return NULL;
+}
+
+static resource_size_t get_res_add_size(struct list_head *head,
+					struct resource *res)
+{
+	struct pci_dev_resource *dev_res;
+
+	dev_res = res_to_dev_res(head, res);
+	return dev_res ? dev_res->add_size : 0;
+}
+
+static resource_size_t get_res_add_align(struct list_head *head,
+					 struct resource *res)
+{
+	struct pci_dev_resource *dev_res;
+
+	dev_res = res_to_dev_res(head, res);
+	return dev_res ? dev_res->min_align : 0;
 }
 
+
 /* Sort resources by alignment */
 static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
 {
@@ -368,8 +388,9 @@  static void __assign_resources_sorted(struct list_head *head,
 	LIST_HEAD(save_head);
 	LIST_HEAD(local_fail_head);
 	struct pci_dev_resource *save_res;
-	struct pci_dev_resource *dev_res, *tmp_res;
+	struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
 	unsigned long fail_type;
+	resource_size_t add_align, align;
 
 	/* Check if optional add_size is there */
 	if (!realloc_head || list_empty(realloc_head))
@@ -384,10 +405,38 @@  static void __assign_resources_sorted(struct list_head *head,
 	}
 
 	/* Update res in head list with add_size in realloc_head list */
-	list_for_each_entry(dev_res, head, list)
+	list_for_each_entry_safe(dev_res, tmp_res, head, list) {
 		dev_res->res->end += get_res_add_size(realloc_head,
 							dev_res->res);
 
+		/*
+		 * There are two kinds of additional resources in the list:
+		 * 1. bridge resource  -- IORESOURCE_STARTALIGN
+		 * 2. SR-IOV resource   -- IORESOURCE_SIZEALIGN
+		 * Here just fix the additional alignment for bridge
+		 */
+		if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+			continue;
+
+		add_align = get_res_add_align(realloc_head, dev_res->res);
+
+		/* Reorder the list by their alignment */
+		if (add_align > dev_res->res->start) {
+			dev_res->res->start = add_align;
+			dev_res->res->end = add_align +
+				            resource_size(dev_res->res);
+
+			list_for_each_entry(dev_res2, head, list) {
+				align = pci_resource_alignment(dev_res2->dev,
+							       dev_res2->res);
+				if (add_align > align)
+					list_move_tail(&dev_res->list,
+						       &dev_res2->list);
+			}
+               }
+
+	}
+
 	/* Try updated head list with add_size added */
 	assign_requested_resources_sorted(head, &local_fail_head);
 
@@ -962,6 +1011,8 @@  static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	struct resource *b_res = find_free_bus_resource(bus,
 					mask | IORESOURCE_PREFETCH, type);
 	resource_size_t children_add_size = 0;
+	resource_size_t children_add_align = 0;
+	resource_size_t add_align = 0;
 
 	if (!b_res)
 		return -ENOSPC;
@@ -986,6 +1037,7 @@  static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			/* put SRIOV requested res to the optional list */
 			if (realloc_head && i >= PCI_IOV_RESOURCES &&
 					i <= PCI_IOV_RESOURCE_END) {
+				add_align = max(pci_resource_alignment(dev, r), add_align);
 				r->end = r->start - 1;
 				add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
 				children_add_size += r_size;
@@ -1016,19 +1068,23 @@  static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 			if (order > max_order)
 				max_order = order;
 
-			if (realloc_head)
+			if (realloc_head) {
 				children_add_size += get_res_add_size(realloc_head, r);
+				children_add_align = get_res_add_align(realloc_head, r);
+				add_align = max(add_align, children_add_align);
+			}
 		}
 	}
 
 	min_align = calculate_mem_align(aligns, max_order);
 	min_align = max(min_align, window_alignment(bus, b_res->flags));
 	size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
+	add_align = max(min_align, add_align);
 	if (children_add_size > add_size)
 		add_size = children_add_size;
 	size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
 		calculate_memsize(size, min_size, add_size,
-				resource_size(b_res), min_align);
+				resource_size(b_res), add_align);
 	if (!size0 && !size1) {
 		if (b_res->start || b_res->end)
 			dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
@@ -1040,10 +1096,11 @@  static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	b_res->end = size0 + min_align - 1;
 	b_res->flags |= IORESOURCE_STARTALIGN;
 	if (size1 > size0 && realloc_head) {
-		add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
-		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
+		add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align);
+		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx add_align %llx\n",
 			   b_res, &bus->busn_res,
-			   (unsigned long long)size1-size0);
+			   (unsigned long long) (size1 - size0),
+			   (unsigned long long) add_align);
 	}
 	return 0;
 }