Message ID | 1439949704-8023-4-git-send-email-weiyang@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Wed, Aug 19, 2015 at 10:01:41AM +0800, Wei Yang wrote: >In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 >BARs in Single PE mode to cover the number of VFs required to be enabled. >By doing so, several VFs would be in one VF Group and leads to interference >between VFs in the same group. > >And in this patch, m64_wins is renamed to m64_map, which means index number >of the M64 BAR used to map the VF BAR. Based on Gavin's comments. > >This patch changes the design by using one M64 BAR in Single PE mode for >one VF BAR. This gives absolute isolation for VFs. > >Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com> >--- > arch/powerpc/include/asm/pci-bridge.h | 5 +- > arch/powerpc/platforms/powernv/pci-ioda.c | 178 ++++++++++++----------------- > 2 files changed, 74 insertions(+), 109 deletions(-) > >diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h >index 712add5..8aeba4c 100644 >--- a/arch/powerpc/include/asm/pci-bridge.h >+++ b/arch/powerpc/include/asm/pci-bridge.h >@@ -214,10 +214,9 @@ struct pci_dn { > u16 vfs_expanded; /* number of VFs IOV BAR expanded */ > u16 num_vfs; /* number of VFs enabled*/ > int offset; /* PE# for the first VF PE */ >-#define M64_PER_IOV 4 >- int m64_per_iov; >+ bool m64_single_mode; /* Use M64 BAR in Single Mode */ > #define IODA_INVALID_M64 (-1) >- int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; >+ int (*m64_map)[PCI_SRIOV_NUM_BARS]; > #endif /* CONFIG_PCI_IOV */ > #endif > struct list_head child_list; >diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c >index e3e0acb..de7db1d 100644 >--- a/arch/powerpc/platforms/powernv/pci-ioda.c >+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >@@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void) > } > > #ifdef CONFIG_PCI_IOV >-static int pnv_pci_vf_release_m64(struct pci_dev *pdev) >+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) > { > struct pci_bus *bus; > struct pci_controller *hose; > struct pnv_phb *phb; > struct pci_dn *pdn; > int i, j; >+ int m64_bars; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); > phb = hose->private_data; > pdn = pci_get_pdn(pdev); > >+ if (pdn->m64_single_mode) >+ m64_bars = num_vfs; >+ else >+ m64_bars = 1; >+ > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >- for (j = 0; j < M64_PER_IOV; j++) { >- if (pdn->m64_wins[i][j] == IODA_INVALID_M64) >+ for (j = 0; j < m64_bars; j++) { >+ if (pdn->m64_map[j][i] == IODA_INVALID_M64) > continue; > opal_pci_phb_mmio_enable(phb->opal_id, >- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); >- clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); >- pdn->m64_wins[i][j] = IODA_INVALID_M64; >+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0); >+ clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc); >+ pdn->m64_map[j][i] = IODA_INVALID_M64; > } > >+ kfree(pdn->m64_map); > return 0; > } > >@@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > int total_vfs; > resource_size_t size, start; > int pe_num; >- int vf_groups; >- int vf_per_group; >+ int m64_bars; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > pdn = pci_get_pdn(pdev); > total_vfs = pci_sriov_get_totalvfs(pdev); > >- /* Initialize the m64_wins to IODA_INVALID_M64 */ >- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >- for (j = 0; j < M64_PER_IOV; j++) >- pdn->m64_wins[i][j] = IODA_INVALID_M64; >+ if (pdn->m64_single_mode) >+ m64_bars = num_vfs; >+ else >+ m64_bars = 1; >+ >+ pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL); >+ if (!pdn->m64_map) >+ return -ENOMEM; >+ /* Initialize the m64_map to IODA_INVALID_M64 */ >+ for (i = 0; i < m64_bars ; i++) >+ for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) >+ pdn->m64_map[i][j] = IODA_INVALID_M64; > >- if (pdn->m64_per_iov == M64_PER_IOV) { >- vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; >- vf_per_group = (num_vfs <= M64_PER_IOV)? 1: >- roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; >- } else { >- vf_groups = 1; >- vf_per_group = 1; >- } > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > res = &pdev->resource[i + PCI_IOV_RESOURCES]; > if (!res->flags || !res->parent) > continue; > >- for (j = 0; j < vf_groups; j++) { >+ for (j = 0; j < m64_bars; j++) { > do { > win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, > phb->ioda.m64_bar_idx + 1, 0); >@@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > goto m64_failed; > } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); > >- pdn->m64_wins[i][j] = win; >+ pdn->m64_map[j][i] = win; > >- if (pdn->m64_per_iov == M64_PER_IOV) { >+ if (pdn->m64_single_mode) { > size = pci_iov_resource_size(pdev, > PCI_IOV_RESOURCES + i); >- size = size * vf_per_group; > start = res->start + size * j; > } else { > size = resource_size(res); >@@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > } > > /* Map the M64 here */ >- if (pdn->m64_per_iov == M64_PER_IOV) { >+ if (pdn->m64_single_mode) { > pe_num = pdn->offset + j; > rc = opal_pci_map_pe_mmio_window(phb->opal_id, > pe_num, OPAL_M64_WINDOW_TYPE, >- pdn->m64_wins[i][j], 0); >+ pdn->m64_map[j][i], 0); > } > > rc = opal_pci_set_phb_mem_window(phb->opal_id, > OPAL_M64_WINDOW_TYPE, >- pdn->m64_wins[i][j], >+ pdn->m64_map[j][i], > start, > 0, /* unused */ > size); >@@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > goto m64_failed; > } > >- if (pdn->m64_per_iov == M64_PER_IOV) >+ if (pdn->m64_single_mode) > rc = opal_pci_phb_mmio_enable(phb->opal_id, >- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2); >+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2); > else > rc = opal_pci_phb_mmio_enable(phb->opal_id, >- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1); >+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1); > > if (rc != OPAL_SUCCESS) { > dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", >@@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > return 0; > > m64_failed: >- pnv_pci_vf_release_m64(pdev); >+ pnv_pci_vf_release_m64(pdev, num_vfs); > return -EBUSY; > } > >@@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe > iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); > } > >-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) >+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) > { > struct pci_bus *bus; > struct pci_controller *hose; > struct pnv_phb *phb; > struct pnv_ioda_pe *pe, *pe_n; > struct pci_dn *pdn; >- u16 vf_index; >- int64_t rc; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) > if (!pdev->is_physfn) > return; > >- if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { >- int vf_group; >- int vf_per_group; >- int vf_index1; >- >- vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; >- >- for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) >- for (vf_index = vf_group * vf_per_group; >- vf_index < (vf_group + 1) * vf_per_group && >- vf_index < num_vfs; >- vf_index++) >- for (vf_index1 = vf_group * vf_per_group; >- vf_index1 < (vf_group + 1) * vf_per_group && >- vf_index1 < num_vfs; >- vf_index1++){ >- >- rc = opal_pci_set_peltv(phb->opal_id, >- pdn->offset + vf_index, >- pdn->offset + vf_index1, >- OPAL_REMOVE_PE_FROM_DOMAIN); >- >- if (rc) >- dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n", >- __func__, >- pdn->offset + vf_index1, rc); >- } >- } >- > list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { > if (pe->parent_dev != pdev) > continue; >@@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) > num_vfs = pdn->num_vfs; > > /* Release VF PEs */ >- pnv_ioda_release_vf_PE(pdev, num_vfs); >+ pnv_ioda_release_vf_PE(pdev); > > if (phb->type == PNV_PHB_IODA2) { >- if (pdn->m64_per_iov == 1) >+ if (!pdn->m64_single_mode) > pnv_pci_vf_resource_shift(pdev, -pdn->offset); > > /* Release M64 windows */ >- pnv_pci_vf_release_m64(pdev); >+ pnv_pci_vf_release_m64(pdev, num_vfs); > > /* Release PE numbers */ > bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); >@@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) > int pe_num; > u16 vf_index; > struct pci_dn *pdn; >- int64_t rc; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); >@@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) > > pnv_pci_ioda2_setup_dma_pe(phb, pe); > } >- >- if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { >- int vf_group; >- int vf_per_group; >- int vf_index1; >- >- vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; >- >- for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) { >- for (vf_index = vf_group * vf_per_group; >- vf_index < (vf_group + 1) * vf_per_group && >- vf_index < num_vfs; >- vf_index++) { >- for (vf_index1 = vf_group * vf_per_group; >- vf_index1 < (vf_group + 1) * vf_per_group && >- vf_index1 < num_vfs; >- vf_index1++) { >- >- rc = opal_pci_set_peltv(phb->opal_id, >- pdn->offset + vf_index, >- pdn->offset + vf_index1, >- OPAL_ADD_PE_TO_DOMAIN); >- >- if (rc) >- dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n", >- __func__, >- pdn->offset + vf_index1, rc); >- } >- } >- } >- } > } > > int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) >@@ -1507,6 +1449,15 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > return -ENOSPC; > } > >+ /* >+ * When M64 BARs functions in Single PE mode, the number of VFs >+ * could be enabled must be less than the number of M64 BARs. >+ */ >+ if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) { >+ dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n"); >+ return -EBUSY; >+ } >+ > /* Calculate available PE for required VFs */ > mutex_lock(&phb->ioda.pe_alloc_mutex); > pdn->offset = bitmap_find_next_zero_area( >@@ -1534,7 +1485,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > * the IOV BAR according to the PE# allocated to the VFs. > * Otherwise, the PE# for the VF will conflict with others. > */ >- if (pdn->m64_per_iov == 1) { >+ if (!pdn->m64_single_mode) { > ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); > if (ret) > goto m64_failed; >@@ -1567,8 +1518,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > /* Allocate PCI data */ > add_dev_pci_data(pdev); > >- pnv_pci_sriov_enable(pdev, num_vfs); >- return 0; >+ return pnv_pci_sriov_enable(pdev, num_vfs); > } > #endif /* CONFIG_PCI_IOV */ > >@@ -2762,9 +2712,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) > > pdn = pci_get_pdn(pdev); > pdn->vfs_expanded = 0; >+ pdn->m64_single_mode = false; > > total_vfs = pci_sriov_get_totalvfs(pdev); >- pdn->m64_per_iov = 1; > mul = phb->ioda.total_pe; > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { >@@ -2784,8 +2734,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) > if (size > (1 << 26)) { > dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", > i, res); >- pdn->m64_per_iov = M64_PER_IOV; > mul = roundup_pow_of_two(total_vfs); >+ pdn->m64_single_mode = true; > break; > } > } >@@ -2987,6 +2937,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, > static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, > int resno) > { >+ struct pci_controller *hose = pci_bus_to_host(pdev->bus); >+ struct pnv_phb *phb = hose->private_data; > struct pci_dn *pdn = pci_get_pdn(pdev); > resource_size_t align; > >@@ -2995,12 +2947,26 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, > * SR-IOV. While from hardware perspective, the range mapped by M64 > * BAR should be size aligned. > * >+ * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra >+ * powernv-specific hardware restriction is gone. But if just use the >+ * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with >+ * in one segment of M64 #15, which introduces the PE conflict between >+ * PF and VF. Based on this, the minimum alignment of an IOV BAR is >+ * m64_segsize. >+ * > * This function returns the total IOV BAR size if expanded or just the >- * individual size if not. >+ * individual size if not, when M64 BAR is in Shared PE mode. >+ * If the M64 BAR is in Single PE mode, return the VF BAR size or >+ * M64 segment size if IOV BAR size is less. > */ > align = pci_iov_resource_size(pdev, resno); >- if (pdn->vfs_expanded) >- return pdn->vfs_expanded * align; >+ if (pdn->vfs_expanded) { >+ if (pdn->m64_single_mode) >+ return max(align, >+ (resource_size_t)phb->ioda.m64_segsize); >+ else >+ return pdn->vfs_expanded * align; >+ } > > return align; > } >-- >1.7.9.5 >
On 08/19/2015 12:01 PM, Wei Yang wrote: > In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 > BARs in Single PE mode to cover the number of VFs required to be enabled. > By doing so, several VFs would be in one VF Group and leads to interference > between VFs in the same group. > > And in this patch, m64_wins is renamed to m64_map, which means index number > of the M64 BAR used to map the VF BAR. Based on Gavin's comments. > > This patch changes the design by using one M64 BAR in Single PE mode for > one VF BAR. This gives absolute isolation for VFs. > > Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com> > --- > arch/powerpc/include/asm/pci-bridge.h | 5 +- > arch/powerpc/platforms/powernv/pci-ioda.c | 178 ++++++++++++----------------- > 2 files changed, 74 insertions(+), 109 deletions(-) > > diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h > index 712add5..8aeba4c 100644 > --- a/arch/powerpc/include/asm/pci-bridge.h > +++ b/arch/powerpc/include/asm/pci-bridge.h > @@ -214,10 +214,9 @@ struct pci_dn { > u16 vfs_expanded; /* number of VFs IOV BAR expanded */ > u16 num_vfs; /* number of VFs enabled*/ > int offset; /* PE# for the first VF PE */ > -#define M64_PER_IOV 4 > - int m64_per_iov; > + bool m64_single_mode; /* Use M64 BAR in Single Mode */ > #define IODA_INVALID_M64 (-1) > - int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; > + int (*m64_map)[PCI_SRIOV_NUM_BARS]; > #endif /* CONFIG_PCI_IOV */ > #endif > struct list_head child_list; > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c > index e3e0acb..de7db1d 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void) > } > > #ifdef CONFIG_PCI_IOV > -static int pnv_pci_vf_release_m64(struct pci_dev *pdev) > +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) > { > struct pci_bus *bus; > struct pci_controller *hose; > struct pnv_phb *phb; > struct pci_dn *pdn; > int i, j; > + int m64_bars; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); > phb = hose->private_data; > pdn = pci_get_pdn(pdev); > > + if (pdn->m64_single_mode) > + m64_bars = num_vfs; > + else > + m64_bars = 1; > + > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) > - for (j = 0; j < M64_PER_IOV; j++) { > - if (pdn->m64_wins[i][j] == IODA_INVALID_M64) > + for (j = 0; j < m64_bars; j++) { > + if (pdn->m64_map[j][i] == IODA_INVALID_M64) > continue; > opal_pci_phb_mmio_enable(phb->opal_id, > - OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); > - clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); > - pdn->m64_wins[i][j] = IODA_INVALID_M64; > + OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0); > + clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc); > + pdn->m64_map[j][i] = IODA_INVALID_M64; > } > > + kfree(pdn->m64_map); > return 0; > } > > @@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > int total_vfs; > resource_size_t size, start; > int pe_num; > - int vf_groups; > - int vf_per_group; > + int m64_bars; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); > @@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > pdn = pci_get_pdn(pdev); > total_vfs = pci_sriov_get_totalvfs(pdev); > > - /* Initialize the m64_wins to IODA_INVALID_M64 */ > - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) > - for (j = 0; j < M64_PER_IOV; j++) > - pdn->m64_wins[i][j] = IODA_INVALID_M64; > + if (pdn->m64_single_mode) > + m64_bars = num_vfs; > + else > + m64_bars = 1; > + > + pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL); > + if (!pdn->m64_map) > + return -ENOMEM; > + /* Initialize the m64_map to IODA_INVALID_M64 */ > + for (i = 0; i < m64_bars ; i++) > + for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) > + pdn->m64_map[i][j] = IODA_INVALID_M64; > > - if (pdn->m64_per_iov == M64_PER_IOV) { > - vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; > - vf_per_group = (num_vfs <= M64_PER_IOV)? 1: > - roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; > - } else { > - vf_groups = 1; > - vf_per_group = 1; > - } > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > res = &pdev->resource[i + PCI_IOV_RESOURCES]; > if (!res->flags || !res->parent) > continue; > > - for (j = 0; j < vf_groups; j++) { > + for (j = 0; j < m64_bars; j++) { > do { > win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, > phb->ioda.m64_bar_idx + 1, 0); > @@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > goto m64_failed; > } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); > > - pdn->m64_wins[i][j] = win; > + pdn->m64_map[j][i] = win; > > - if (pdn->m64_per_iov == M64_PER_IOV) { > + if (pdn->m64_single_mode) { > size = pci_iov_resource_size(pdev, > PCI_IOV_RESOURCES + i); > - size = size * vf_per_group; > start = res->start + size * j; > } else { > size = resource_size(res); > @@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > } > > /* Map the M64 here */ > - if (pdn->m64_per_iov == M64_PER_IOV) { > + if (pdn->m64_single_mode) { > pe_num = pdn->offset + j; > rc = opal_pci_map_pe_mmio_window(phb->opal_id, > pe_num, OPAL_M64_WINDOW_TYPE, > - pdn->m64_wins[i][j], 0); > + pdn->m64_map[j][i], 0); > } > > rc = opal_pci_set_phb_mem_window(phb->opal_id, > OPAL_M64_WINDOW_TYPE, > - pdn->m64_wins[i][j], > + pdn->m64_map[j][i], > start, > 0, /* unused */ > size); > @@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > goto m64_failed; > } > > - if (pdn->m64_per_iov == M64_PER_IOV) > + if (pdn->m64_single_mode) > rc = opal_pci_phb_mmio_enable(phb->opal_id, > - OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2); > + OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2); > else > rc = opal_pci_phb_mmio_enable(phb->opal_id, > - OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1); > + OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1); > > if (rc != OPAL_SUCCESS) { > dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", > @@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) > return 0; > > m64_failed: > - pnv_pci_vf_release_m64(pdev); > + pnv_pci_vf_release_m64(pdev, num_vfs); > return -EBUSY; > } > > @@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe > iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); > } > > -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) > +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) > { > struct pci_bus *bus; > struct pci_controller *hose; > struct pnv_phb *phb; > struct pnv_ioda_pe *pe, *pe_n; > struct pci_dn *pdn; > - u16 vf_index; > - int64_t rc; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); > @@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) > if (!pdev->is_physfn) > return; > > - if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { > - int vf_group; > - int vf_per_group; > - int vf_index1; > - > - vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; > - > - for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) > - for (vf_index = vf_group * vf_per_group; > - vf_index < (vf_group + 1) * vf_per_group && > - vf_index < num_vfs; > - vf_index++) > - for (vf_index1 = vf_group * vf_per_group; > - vf_index1 < (vf_group + 1) * vf_per_group && > - vf_index1 < num_vfs; > - vf_index1++){ > - > - rc = opal_pci_set_peltv(phb->opal_id, > - pdn->offset + vf_index, > - pdn->offset + vf_index1, > - OPAL_REMOVE_PE_FROM_DOMAIN); > - > - if (rc) > - dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n", > - __func__, > - pdn->offset + vf_index1, rc); > - } > - } > - > list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { > if (pe->parent_dev != pdev) > continue; > @@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) > num_vfs = pdn->num_vfs; > > /* Release VF PEs */ > - pnv_ioda_release_vf_PE(pdev, num_vfs); > + pnv_ioda_release_vf_PE(pdev); > > if (phb->type == PNV_PHB_IODA2) { > - if (pdn->m64_per_iov == 1) > + if (!pdn->m64_single_mode) > pnv_pci_vf_resource_shift(pdev, -pdn->offset); > > /* Release M64 windows */ > - pnv_pci_vf_release_m64(pdev); > + pnv_pci_vf_release_m64(pdev, num_vfs); > > /* Release PE numbers */ > bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); > @@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) > int pe_num; > u16 vf_index; > struct pci_dn *pdn; > - int64_t rc; > > bus = pdev->bus; > hose = pci_bus_to_host(bus); > @@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) > > pnv_pci_ioda2_setup_dma_pe(phb, pe); > } > - > - if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { > - int vf_group; > - int vf_per_group; > - int vf_index1; > - > - vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; > - > - for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) { > - for (vf_index = vf_group * vf_per_group; > - vf_index < (vf_group + 1) * vf_per_group && > - vf_index < num_vfs; > - vf_index++) { > - for (vf_index1 = vf_group * vf_per_group; > - vf_index1 < (vf_group + 1) * vf_per_group && > - vf_index1 < num_vfs; > - vf_index1++) { > - > - rc = opal_pci_set_peltv(phb->opal_id, > - pdn->offset + vf_index, > - pdn->offset + vf_index1, > - OPAL_ADD_PE_TO_DOMAIN); > - > - if (rc) > - dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n", > - __func__, > - pdn->offset + vf_index1, rc); > - } > - } > - } > - } > } > > int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > @@ -1507,6 +1449,15 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > return -ENOSPC; > } > > + /* > + * When M64 BARs functions in Single PE mode, the number of VFs > + * could be enabled must be less than the number of M64 BARs. > + */ > + if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) { m64_bar_idx is not really the maximum number of M64 BARs, you program it to store the latest number but after some future rework/hardware update you might want to change it to be not the last M64 BAR and you will have to update this check as well. Instead of doing: arch/powerpc/platforms/powernv/pci-ioda.c|374| phb->ioda.m64_bar_idx = 15; do #define PCI_IODA2_M64_BAR_NUM 15 and use new macro everywhere. Or read from OPAL if it tells about it. > + dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n"); > + return -EBUSY; > + } > + > /* Calculate available PE for required VFs */ > mutex_lock(&phb->ioda.pe_alloc_mutex); > pdn->offset = bitmap_find_next_zero_area( > @@ -1534,7 +1485,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > * the IOV BAR according to the PE# allocated to the VFs. > * Otherwise, the PE# for the VF will conflict with others. > */ > - if (pdn->m64_per_iov == 1) { > + if (!pdn->m64_single_mode) { > ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); > if (ret) > goto m64_failed; > @@ -1567,8 +1518,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > /* Allocate PCI data */ > add_dev_pci_data(pdev); > > - pnv_pci_sriov_enable(pdev, num_vfs); > - return 0; > + return pnv_pci_sriov_enable(pdev, num_vfs); > } > #endif /* CONFIG_PCI_IOV */ > > @@ -2762,9 +2712,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) > > pdn = pci_get_pdn(pdev); > pdn->vfs_expanded = 0; > + pdn->m64_single_mode = false; > > total_vfs = pci_sriov_get_totalvfs(pdev); > - pdn->m64_per_iov = 1; > mul = phb->ioda.total_pe; > > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > @@ -2784,8 +2734,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) > if (size > (1 << 26)) { > dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", > i, res); > - pdn->m64_per_iov = M64_PER_IOV; > mul = roundup_pow_of_two(total_vfs); > + pdn->m64_single_mode = true; > break; > } > } > @@ -2987,6 +2937,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, > static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, > int resno) > { > + struct pci_controller *hose = pci_bus_to_host(pdev->bus); > + struct pnv_phb *phb = hose->private_data; > struct pci_dn *pdn = pci_get_pdn(pdev); > resource_size_t align; > > @@ -2995,12 +2947,26 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, > * SR-IOV. While from hardware perspective, the range mapped by M64 > * BAR should be size aligned. > * > + * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra > + * powernv-specific hardware restriction is gone. But if just use the > + * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with > + * in one segment of M64 #15, which introduces the PE conflict between > + * PF and VF. Based on this, the minimum alignment of an IOV BAR is > + * m64_segsize. > + * > * This function returns the total IOV BAR size if expanded or just the > - * individual size if not. > + * individual size if not, when M64 BAR is in Shared PE mode. > + * If the M64 BAR is in Single PE mode, return the VF BAR size or > + * M64 segment size if IOV BAR size is less. "Expanded" == "non-shared M64 BAR"? I'd stick to the "non-shared". > */ > align = pci_iov_resource_size(pdev, resno); > - if (pdn->vfs_expanded) > - return pdn->vfs_expanded * align; > + if (pdn->vfs_expanded) { > + if (pdn->m64_single_mode) > + return max(align, > + (resource_size_t)phb->ioda.m64_segsize); > + else > + return pdn->vfs_expanded * align; > + } > > return align; > } > if (!pdn->vfs_expanded) return align; if (pdn->m64_single_mode) return max(align, (resource_size_t)phb->ioda.m64_segsize); return pdn->vfs_expanded * align; Fewer braces/indents/line wraps :)
On Fri, Oct 02, 2015 at 07:29:29PM +1000, Alexey Kardashevskiy wrote: >On 08/19/2015 12:01 PM, Wei Yang wrote: >>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 >>BARs in Single PE mode to cover the number of VFs required to be enabled. >>By doing so, several VFs would be in one VF Group and leads to interference >>between VFs in the same group. >> >>And in this patch, m64_wins is renamed to m64_map, which means index number >>of the M64 BAR used to map the VF BAR. Based on Gavin's comments. >> >>This patch changes the design by using one M64 BAR in Single PE mode for >>one VF BAR. This gives absolute isolation for VFs. >> >>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com> >>--- >> arch/powerpc/include/asm/pci-bridge.h | 5 +- >> arch/powerpc/platforms/powernv/pci-ioda.c | 178 ++++++++++++----------------- >> 2 files changed, 74 insertions(+), 109 deletions(-) >> >>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h >>index 712add5..8aeba4c 100644 >>--- a/arch/powerpc/include/asm/pci-bridge.h >>+++ b/arch/powerpc/include/asm/pci-bridge.h >>@@ -214,10 +214,9 @@ struct pci_dn { >> u16 vfs_expanded; /* number of VFs IOV BAR expanded */ >> u16 num_vfs; /* number of VFs enabled*/ >> int offset; /* PE# for the first VF PE */ >>-#define M64_PER_IOV 4 >>- int m64_per_iov; >>+ bool m64_single_mode; /* Use M64 BAR in Single Mode */ >> #define IODA_INVALID_M64 (-1) >>- int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; >>+ int (*m64_map)[PCI_SRIOV_NUM_BARS]; >> #endif /* CONFIG_PCI_IOV */ >> #endif >> struct list_head child_list; >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c >>index e3e0acb..de7db1d 100644 >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>@@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void) >> } >> >> #ifdef CONFIG_PCI_IOV >>-static int pnv_pci_vf_release_m64(struct pci_dev *pdev) >>+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) >> { >> struct pci_bus *bus; >> struct pci_controller *hose; >> struct pnv_phb *phb; >> struct pci_dn *pdn; >> int i, j; >>+ int m64_bars; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >> phb = hose->private_data; >> pdn = pci_get_pdn(pdev); >> >>+ if (pdn->m64_single_mode) >>+ m64_bars = num_vfs; >>+ else >>+ m64_bars = 1; >>+ >> for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >>- for (j = 0; j < M64_PER_IOV; j++) { >>- if (pdn->m64_wins[i][j] == IODA_INVALID_M64) >>+ for (j = 0; j < m64_bars; j++) { >>+ if (pdn->m64_map[j][i] == IODA_INVALID_M64) >> continue; >> opal_pci_phb_mmio_enable(phb->opal_id, >>- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); >>- clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); >>- pdn->m64_wins[i][j] = IODA_INVALID_M64; >>+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0); >>+ clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc); >>+ pdn->m64_map[j][i] = IODA_INVALID_M64; >> } >> >>+ kfree(pdn->m64_map); >> return 0; >> } >> >>@@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) >> int total_vfs; >> resource_size_t size, start; >> int pe_num; >>- int vf_groups; >>- int vf_per_group; >>+ int m64_bars; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) >> pdn = pci_get_pdn(pdev); >> total_vfs = pci_sriov_get_totalvfs(pdev); >> >>- /* Initialize the m64_wins to IODA_INVALID_M64 */ >>- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) >>- for (j = 0; j < M64_PER_IOV; j++) >>- pdn->m64_wins[i][j] = IODA_INVALID_M64; >>+ if (pdn->m64_single_mode) >>+ m64_bars = num_vfs; >>+ else >>+ m64_bars = 1; >>+ >>+ pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL); >>+ if (!pdn->m64_map) >>+ return -ENOMEM; >>+ /* Initialize the m64_map to IODA_INVALID_M64 */ >>+ for (i = 0; i < m64_bars ; i++) >>+ for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) >>+ pdn->m64_map[i][j] = IODA_INVALID_M64; >> >>- if (pdn->m64_per_iov == M64_PER_IOV) { >>- vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; >>- vf_per_group = (num_vfs <= M64_PER_IOV)? 1: >>- roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; >>- } else { >>- vf_groups = 1; >>- vf_per_group = 1; >>- } >> >> for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { >> res = &pdev->resource[i + PCI_IOV_RESOURCES]; >> if (!res->flags || !res->parent) >> continue; >> >>- for (j = 0; j < vf_groups; j++) { >>+ for (j = 0; j < m64_bars; j++) { >> do { >> win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, >> phb->ioda.m64_bar_idx + 1, 0); >>@@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) >> goto m64_failed; >> } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); >> >>- pdn->m64_wins[i][j] = win; >>+ pdn->m64_map[j][i] = win; >> >>- if (pdn->m64_per_iov == M64_PER_IOV) { >>+ if (pdn->m64_single_mode) { >> size = pci_iov_resource_size(pdev, >> PCI_IOV_RESOURCES + i); >>- size = size * vf_per_group; >> start = res->start + size * j; >> } else { >> size = resource_size(res); >>@@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) >> } >> >> /* Map the M64 here */ >>- if (pdn->m64_per_iov == M64_PER_IOV) { >>+ if (pdn->m64_single_mode) { >> pe_num = pdn->offset + j; >> rc = opal_pci_map_pe_mmio_window(phb->opal_id, >> pe_num, OPAL_M64_WINDOW_TYPE, >>- pdn->m64_wins[i][j], 0); >>+ pdn->m64_map[j][i], 0); >> } >> >> rc = opal_pci_set_phb_mem_window(phb->opal_id, >> OPAL_M64_WINDOW_TYPE, >>- pdn->m64_wins[i][j], >>+ pdn->m64_map[j][i], >> start, >> 0, /* unused */ >> size); >>@@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) >> goto m64_failed; >> } >> >>- if (pdn->m64_per_iov == M64_PER_IOV) >>+ if (pdn->m64_single_mode) >> rc = opal_pci_phb_mmio_enable(phb->opal_id, >>- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2); >>+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2); >> else >> rc = opal_pci_phb_mmio_enable(phb->opal_id, >>- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1); >>+ OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1); >> >> if (rc != OPAL_SUCCESS) { >> dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", >>@@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) >> return 0; >> >> m64_failed: >>- pnv_pci_vf_release_m64(pdev); >>+ pnv_pci_vf_release_m64(pdev, num_vfs); >> return -EBUSY; >> } >> >>@@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe >> iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); >> } >> >>-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) >>+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) >> { >> struct pci_bus *bus; >> struct pci_controller *hose; >> struct pnv_phb *phb; >> struct pnv_ioda_pe *pe, *pe_n; >> struct pci_dn *pdn; >>- u16 vf_index; >>- int64_t rc; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) >> if (!pdev->is_physfn) >> return; >> >>- if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { >>- int vf_group; >>- int vf_per_group; >>- int vf_index1; >>- >>- vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; >>- >>- for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) >>- for (vf_index = vf_group * vf_per_group; >>- vf_index < (vf_group + 1) * vf_per_group && >>- vf_index < num_vfs; >>- vf_index++) >>- for (vf_index1 = vf_group * vf_per_group; >>- vf_index1 < (vf_group + 1) * vf_per_group && >>- vf_index1 < num_vfs; >>- vf_index1++){ >>- >>- rc = opal_pci_set_peltv(phb->opal_id, >>- pdn->offset + vf_index, >>- pdn->offset + vf_index1, >>- OPAL_REMOVE_PE_FROM_DOMAIN); >>- >>- if (rc) >>- dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n", >>- __func__, >>- pdn->offset + vf_index1, rc); >>- } >>- } >>- >> list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { >> if (pe->parent_dev != pdev) >> continue; >>@@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) >> num_vfs = pdn->num_vfs; >> >> /* Release VF PEs */ >>- pnv_ioda_release_vf_PE(pdev, num_vfs); >>+ pnv_ioda_release_vf_PE(pdev); >> >> if (phb->type == PNV_PHB_IODA2) { >>- if (pdn->m64_per_iov == 1) >>+ if (!pdn->m64_single_mode) >> pnv_pci_vf_resource_shift(pdev, -pdn->offset); >> >> /* Release M64 windows */ >>- pnv_pci_vf_release_m64(pdev); >>+ pnv_pci_vf_release_m64(pdev, num_vfs); >> >> /* Release PE numbers */ >> bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); >>@@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) >> int pe_num; >> u16 vf_index; >> struct pci_dn *pdn; >>- int64_t rc; >> >> bus = pdev->bus; >> hose = pci_bus_to_host(bus); >>@@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) >> >> pnv_pci_ioda2_setup_dma_pe(phb, pe); >> } >>- >>- if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { >>- int vf_group; >>- int vf_per_group; >>- int vf_index1; >>- >>- vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; >>- >>- for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) { >>- for (vf_index = vf_group * vf_per_group; >>- vf_index < (vf_group + 1) * vf_per_group && >>- vf_index < num_vfs; >>- vf_index++) { >>- for (vf_index1 = vf_group * vf_per_group; >>- vf_index1 < (vf_group + 1) * vf_per_group && >>- vf_index1 < num_vfs; >>- vf_index1++) { >>- >>- rc = opal_pci_set_peltv(phb->opal_id, >>- pdn->offset + vf_index, >>- pdn->offset + vf_index1, >>- OPAL_ADD_PE_TO_DOMAIN); >>- >>- if (rc) >>- dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n", >>- __func__, >>- pdn->offset + vf_index1, rc); >>- } >>- } >>- } >>- } >> } >> >> int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) >>@@ -1507,6 +1449,15 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) >> return -ENOSPC; >> } >> >>+ /* >>+ * When M64 BARs functions in Single PE mode, the number of VFs >>+ * could be enabled must be less than the number of M64 BARs. >>+ */ >>+ if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) { > > >m64_bar_idx is not really the maximum number of M64 BARs, you program it to >store the latest number but after some future rework/hardware update you >might want to change it to be not the last M64 BAR and you will have to >update this check as well. > >Instead of doing: >arch/powerpc/platforms/powernv/pci-ioda.c|374| phb->ioda.m64_bar_idx = 15; >do >#define PCI_IODA2_M64_BAR_NUM 15 >and use new macro everywhere. >Or read from OPAL if it tells about it. > I didn't touch this field since this is not that related to SRIOV. And I prefer the OPAL way, or device tree way, which need to do some modification in skiboot. If you think it is ok, I would send a separate patch to fix this in both skiboot and kernel. > > >>+ dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n"); >>+ return -EBUSY; >>+ } >>+ >> /* Calculate available PE for required VFs */ >> mutex_lock(&phb->ioda.pe_alloc_mutex); >> pdn->offset = bitmap_find_next_zero_area( >>@@ -1534,7 +1485,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) >> * the IOV BAR according to the PE# allocated to the VFs. >> * Otherwise, the PE# for the VF will conflict with others. >> */ >>- if (pdn->m64_per_iov == 1) { >>+ if (!pdn->m64_single_mode) { >> ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); >> if (ret) >> goto m64_failed; >>@@ -1567,8 +1518,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) >> /* Allocate PCI data */ >> add_dev_pci_data(pdev); >> >>- pnv_pci_sriov_enable(pdev, num_vfs); >>- return 0; >>+ return pnv_pci_sriov_enable(pdev, num_vfs); >> } >> #endif /* CONFIG_PCI_IOV */ >> >>@@ -2762,9 +2712,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) >> >> pdn = pci_get_pdn(pdev); >> pdn->vfs_expanded = 0; >>+ pdn->m64_single_mode = false; >> >> total_vfs = pci_sriov_get_totalvfs(pdev); >>- pdn->m64_per_iov = 1; >> mul = phb->ioda.total_pe; >> >> for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { >>@@ -2784,8 +2734,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) >> if (size > (1 << 26)) { >> dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", >> i, res); >>- pdn->m64_per_iov = M64_PER_IOV; >> mul = roundup_pow_of_two(total_vfs); >>+ pdn->m64_single_mode = true; >> break; >> } >> } >>@@ -2987,6 +2937,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, >> static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, >> int resno) >> { >>+ struct pci_controller *hose = pci_bus_to_host(pdev->bus); >>+ struct pnv_phb *phb = hose->private_data; >> struct pci_dn *pdn = pci_get_pdn(pdev); >> resource_size_t align; >> >>@@ -2995,12 +2947,26 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, >> * SR-IOV. While from hardware perspective, the range mapped by M64 >> * BAR should be size aligned. >> * >>+ * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra >>+ * powernv-specific hardware restriction is gone. But if just use the >>+ * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with >>+ * in one segment of M64 #15, which introduces the PE conflict between >>+ * PF and VF. Based on this, the minimum alignment of an IOV BAR is >>+ * m64_segsize. >>+ * >> * This function returns the total IOV BAR size if expanded or just the >>- * individual size if not. >>+ * individual size if not, when M64 BAR is in Shared PE mode. >>+ * If the M64 BAR is in Single PE mode, return the VF BAR size or >>+ * M64 segment size if IOV BAR size is less. > >"Expanded" == "non-shared M64 BAR"? I'd stick to the "non-shared". > Yes, your way is more friendly to audience. The current comment is not accurate. The correct way is: * This function returns the total IOV BAR size if M64 BAR is in Shared PE * mode or just the individual size if not. > >> */ >> align = pci_iov_resource_size(pdev, resno); >>- if (pdn->vfs_expanded) >>- return pdn->vfs_expanded * align; >>+ if (pdn->vfs_expanded) { >>+ if (pdn->m64_single_mode) >>+ return max(align, >>+ (resource_size_t)phb->ioda.m64_segsize); >>+ else >>+ return pdn->vfs_expanded * align; >>+ } >> >> return align; >> } >> > > if (!pdn->vfs_expanded) > return align; > > if (pdn->m64_single_mode) > return max(align, (resource_size_t)phb->ioda.m64_segsize); > > return pdn->vfs_expanded * align; > >Fewer braces/indents/line wraps :) > Yep~ Looks nice. Thanks! > > >-- >Alexey
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 712add5..8aeba4c 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -214,10 +214,9 @@ struct pci_dn { u16 vfs_expanded; /* number of VFs IOV BAR expanded */ u16 num_vfs; /* number of VFs enabled*/ int offset; /* PE# for the first VF PE */ -#define M64_PER_IOV 4 - int m64_per_iov; + bool m64_single_mode; /* Use M64 BAR in Single Mode */ #define IODA_INVALID_M64 (-1) - int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; + int (*m64_map)[PCI_SRIOV_NUM_BARS]; #endif /* CONFIG_PCI_IOV */ #endif struct list_head child_list; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e3e0acb..de7db1d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void) } #ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_release_m64(struct pci_dev *pdev) +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) { struct pci_bus *bus; struct pci_controller *hose; struct pnv_phb *phb; struct pci_dn *pdn; int i, j; + int m64_bars; bus = pdev->bus; hose = pci_bus_to_host(bus); phb = hose->private_data; pdn = pci_get_pdn(pdev); + if (pdn->m64_single_mode) + m64_bars = num_vfs; + else + m64_bars = 1; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < M64_PER_IOV; j++) { - if (pdn->m64_wins[i][j] == IODA_INVALID_M64) + for (j = 0; j < m64_bars; j++) { + if (pdn->m64_map[j][i] == IODA_INVALID_M64) continue; opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); - clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); - pdn->m64_wins[i][j] = IODA_INVALID_M64; + OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0); + clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc); + pdn->m64_map[j][i] = IODA_INVALID_M64; } + kfree(pdn->m64_map); return 0; } @@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) int total_vfs; resource_size_t size, start; int pe_num; - int vf_groups; - int vf_per_group; + int m64_bars; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) pdn = pci_get_pdn(pdev); total_vfs = pci_sriov_get_totalvfs(pdev); - /* Initialize the m64_wins to IODA_INVALID_M64 */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < M64_PER_IOV; j++) - pdn->m64_wins[i][j] = IODA_INVALID_M64; + if (pdn->m64_single_mode) + m64_bars = num_vfs; + else + m64_bars = 1; + + pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL); + if (!pdn->m64_map) + return -ENOMEM; + /* Initialize the m64_map to IODA_INVALID_M64 */ + for (i = 0; i < m64_bars ; i++) + for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) + pdn->m64_map[i][j] = IODA_INVALID_M64; - if (pdn->m64_per_iov == M64_PER_IOV) { - vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; - vf_per_group = (num_vfs <= M64_PER_IOV)? 1: - roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; - } else { - vf_groups = 1; - vf_per_group = 1; - } for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &pdev->resource[i + PCI_IOV_RESOURCES]; if (!res->flags || !res->parent) continue; - for (j = 0; j < vf_groups; j++) { + for (j = 0; j < m64_bars; j++) { do { win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, phb->ioda.m64_bar_idx + 1, 0); @@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) goto m64_failed; } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); - pdn->m64_wins[i][j] = win; + pdn->m64_map[j][i] = win; - if (pdn->m64_per_iov == M64_PER_IOV) { + if (pdn->m64_single_mode) { size = pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i); - size = size * vf_per_group; start = res->start + size * j; } else { size = resource_size(res); @@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) } /* Map the M64 here */ - if (pdn->m64_per_iov == M64_PER_IOV) { + if (pdn->m64_single_mode) { pe_num = pdn->offset + j; rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe_num, OPAL_M64_WINDOW_TYPE, - pdn->m64_wins[i][j], 0); + pdn->m64_map[j][i], 0); } rc = opal_pci_set_phb_mem_window(phb->opal_id, OPAL_M64_WINDOW_TYPE, - pdn->m64_wins[i][j], + pdn->m64_map[j][i], start, 0, /* unused */ size); @@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) goto m64_failed; } - if (pdn->m64_per_iov == M64_PER_IOV) + if (pdn->m64_single_mode) rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2); + OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2); else rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1); + OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1); if (rc != OPAL_SUCCESS) { dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", @@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) return 0; m64_failed: - pnv_pci_vf_release_m64(pdev); + pnv_pci_vf_release_m64(pdev, num_vfs); return -EBUSY; } @@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); } -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) { struct pci_bus *bus; struct pci_controller *hose; struct pnv_phb *phb; struct pnv_ioda_pe *pe, *pe_n; struct pci_dn *pdn; - u16 vf_index; - int64_t rc; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) if (!pdev->is_physfn) return; - if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { - int vf_group; - int vf_per_group; - int vf_index1; - - vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; - - for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) - for (vf_index = vf_group * vf_per_group; - vf_index < (vf_group + 1) * vf_per_group && - vf_index < num_vfs; - vf_index++) - for (vf_index1 = vf_group * vf_per_group; - vf_index1 < (vf_group + 1) * vf_per_group && - vf_index1 < num_vfs; - vf_index1++){ - - rc = opal_pci_set_peltv(phb->opal_id, - pdn->offset + vf_index, - pdn->offset + vf_index1, - OPAL_REMOVE_PE_FROM_DOMAIN); - - if (rc) - dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n", - __func__, - pdn->offset + vf_index1, rc); - } - } - list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { if (pe->parent_dev != pdev) continue; @@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) num_vfs = pdn->num_vfs; /* Release VF PEs */ - pnv_ioda_release_vf_PE(pdev, num_vfs); + pnv_ioda_release_vf_PE(pdev); if (phb->type == PNV_PHB_IODA2) { - if (pdn->m64_per_iov == 1) + if (!pdn->m64_single_mode) pnv_pci_vf_resource_shift(pdev, -pdn->offset); /* Release M64 windows */ - pnv_pci_vf_release_m64(pdev); + pnv_pci_vf_release_m64(pdev, num_vfs); /* Release PE numbers */ bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); @@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) int pe_num; u16 vf_index; struct pci_dn *pdn; - int64_t rc; bus = pdev->bus; hose = pci_bus_to_host(bus); @@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) pnv_pci_ioda2_setup_dma_pe(phb, pe); } - - if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { - int vf_group; - int vf_per_group; - int vf_index1; - - vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; - - for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) { - for (vf_index = vf_group * vf_per_group; - vf_index < (vf_group + 1) * vf_per_group && - vf_index < num_vfs; - vf_index++) { - for (vf_index1 = vf_group * vf_per_group; - vf_index1 < (vf_group + 1) * vf_per_group && - vf_index1 < num_vfs; - vf_index1++) { - - rc = opal_pci_set_peltv(phb->opal_id, - pdn->offset + vf_index, - pdn->offset + vf_index1, - OPAL_ADD_PE_TO_DOMAIN); - - if (rc) - dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n", - __func__, - pdn->offset + vf_index1, rc); - } - } - } - } } int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) @@ -1507,6 +1449,15 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) return -ENOSPC; } + /* + * When M64 BARs functions in Single PE mode, the number of VFs + * could be enabled must be less than the number of M64 BARs. + */ + if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) { + dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n"); + return -EBUSY; + } + /* Calculate available PE for required VFs */ mutex_lock(&phb->ioda.pe_alloc_mutex); pdn->offset = bitmap_find_next_zero_area( @@ -1534,7 +1485,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) * the IOV BAR according to the PE# allocated to the VFs. * Otherwise, the PE# for the VF will conflict with others. */ - if (pdn->m64_per_iov == 1) { + if (!pdn->m64_single_mode) { ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); if (ret) goto m64_failed; @@ -1567,8 +1518,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) /* Allocate PCI data */ add_dev_pci_data(pdev); - pnv_pci_sriov_enable(pdev, num_vfs); - return 0; + return pnv_pci_sriov_enable(pdev, num_vfs); } #endif /* CONFIG_PCI_IOV */ @@ -2762,9 +2712,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) pdn = pci_get_pdn(pdev); pdn->vfs_expanded = 0; + pdn->m64_single_mode = false; total_vfs = pci_sriov_get_totalvfs(pdev); - pdn->m64_per_iov = 1; mul = phb->ioda.total_pe; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { @@ -2784,8 +2734,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) if (size > (1 << 26)) { dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", i, res); - pdn->m64_per_iov = M64_PER_IOV; mul = roundup_pow_of_two(total_vfs); + pdn->m64_single_mode = true; break; } } @@ -2987,6 +2937,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno) { + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(pdev); resource_size_t align; @@ -2995,12 +2947,26 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, * SR-IOV. While from hardware perspective, the range mapped by M64 * BAR should be size aligned. * + * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra + * powernv-specific hardware restriction is gone. But if just use the + * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with + * in one segment of M64 #15, which introduces the PE conflict between + * PF and VF. Based on this, the minimum alignment of an IOV BAR is + * m64_segsize. + * * This function returns the total IOV BAR size if expanded or just the - * individual size if not. + * individual size if not, when M64 BAR is in Shared PE mode. + * If the M64 BAR is in Single PE mode, return the VF BAR size or + * M64 segment size if IOV BAR size is less. */ align = pci_iov_resource_size(pdev, resno); - if (pdn->vfs_expanded) - return pdn->vfs_expanded * align; + if (pdn->vfs_expanded) { + if (pdn->m64_single_mode) + return max(align, + (resource_size_t)phb->ioda.m64_segsize); + else + return pdn->vfs_expanded * align; + } return align; }
In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64 BARs in Single PE mode to cover the number of VFs required to be enabled. By doing so, several VFs would be in one VF Group and leads to interference between VFs in the same group. And in this patch, m64_wins is renamed to m64_map, which means index number of the M64 BAR used to map the VF BAR. Based on Gavin's comments. This patch changes the design by using one M64 BAR in Single PE mode for one VF BAR. This gives absolute isolation for VFs. Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com> --- arch/powerpc/include/asm/pci-bridge.h | 5 +- arch/powerpc/platforms/powernv/pci-ioda.c | 178 ++++++++++++----------------- 2 files changed, 74 insertions(+), 109 deletions(-)