[v3] npu: Implement FLR

Message ID 20170410074057.13560-1-aik@ozlabs.ru
State Accepted
Headers show

Commit Message

Alexey Kardashevskiy April 10, 2017, 7:40 a.m.
As the comment in npu_dev_populate_pcie_cap() says,
"We should support FLR" and the NPU device advertises its
support. However, when the kernel issues an FLR, skiboot does
nothing, which leaves the NPU in a state that does not allow
NV links to be used again after the GPU has been reset.

This adds basic handling of FLR (function level reset).

This does not update the hreset/freset handlers as they are not going to
be called under any circumstances - EEH is not supported for the NPU and
the kernel won't issue an OPAL reset otherwise.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Alistair Popple <alistair@popple.id.au>
---
Changes:
v3:
* dropped "__unused" - leftovers from v1

v2:
* added npu_dev_procedure_reset() to keep procedure_xxxx handling
in one file
* added check that device control register access is 2 bytes long
and aligned
* fixed typo

---
With this fix, bandwidthTest reports PINNED Memory Transfers of
at least 20000MB/s; without it, only the first guest boot reports
this, and all subsequent guests report a maximum of 6000MB/s.
---
 include/npu.h          |  1 +
 hw/npu-hw-procedures.c |  8 ++++++++
 hw/npu.c               | 29 ++++++++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 1 deletion(-)

Comments

Stewart Smith April 27, 2017, 6:54 a.m. | #1
Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> As the comment in npu_dev_populate_pcie_cap() says,
> "We should support FLR" and the NPU device advertises its
> support. However, when the kernel issues an FLR, skiboot does
> nothing, which leaves the NPU in a state that does not allow
> NV links to be used again after the GPU has been reset.
>
> This adds basic handling of FLR (function level reset).
>
> This does not update the hreset/freset handlers as they are not going to
> be called under any circumstances - EEH is not supported for the NPU and
> the kernel won't issue an OPAL reset otherwise.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> Acked-by: Alistair Popple <alistair@popple.id.au>

Thanks, merged to master as of b74841db759d33d6823a8f39603e07319b90103b

Patch

diff --git a/include/npu.h b/include/npu.h
index 77beca37..a76acdfc 100644
--- a/include/npu.h
+++ b/include/npu.h
@@ -167,6 +167,7 @@  int64_t npu_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
 			  bool write);
 
 void npu_set_fence_state(struct npu *p, bool fence);
+void npu_dev_procedure_reset(struct npu_dev *dev);
 
 #define NPUDBG(p, fmt, a...)	prlog(PR_DEBUG, "NPU%d: " fmt, \
 				      (p)->phb.opal_id, ##a)
diff --git a/hw/npu-hw-procedures.c b/hw/npu-hw-procedures.c
index 85f09a11..cabc9c99 100644
--- a/hw/npu-hw-procedures.c
+++ b/hw/npu-hw-procedures.c
@@ -606,3 +606,11 @@  int64_t npu_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
 
 	return npu_dev_procedure_read(ndev, offset - pcrf->start, len, data);
 }
+
+void npu_dev_procedure_reset(struct npu_dev *dev)
+{
+	dev->procedure_status = 0;
+	dev->procedure_number = 0;
+	dev->procedure_step = 0;
+	dev->procedure_data = 0;
+}
diff --git a/hw/npu.c b/hw/npu.c
index 302a5518..f00e9c5c 100644
--- a/hw/npu.c
+++ b/hw/npu.c
@@ -271,6 +271,29 @@  static int64_t npu_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
 	return npu_dev_cfg_bar_read(ndev, pcrf, offset, len, data);
 }
 
+static int64_t npu_dev_cfg_exp_devcap(void *dev,
+		struct pci_cfg_reg_filter *pcrf __unused,
+		uint32_t offset, uint32_t size,
+		uint32_t *data, bool write)
+{
+	struct pci_virt_device *pvd = dev;
+	struct npu_dev *ndev = pvd->data;
+
+	assert(write);
+
+	if ((size != 2) || (offset & 1)) {
+		/* Short config writes are not supported */
+		prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
+		      ndev->phb->opal_id);
+		return OPAL_PARAMETER;
+	}
+
+	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
+		npu_dev_procedure_reset(ndev);
+
+	return OPAL_PARTIAL;
+}
+
 static struct npu_dev *bdfn_to_npu_dev(struct npu *p, uint32_t bdfn)
 {
 	struct pci_virt_device *pvd;
@@ -1187,7 +1210,7 @@  static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap)
 
 	/* 0x04 - Device capability
 	 *
-	 * We should support FLR. Oterwhsie, it might have
+	 * We should support FLR. Otherwise, it might have
 	 * problem passing it through to userland via Linux
 	 * VFIO infrastructure
 	 */
@@ -1198,6 +1221,10 @@  static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap)
 	       (PCICAP_EXP_DEVCAP_FUNC_RESET));
 	PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_DEVCAP, 4, val);
 
+	pci_virt_add_filter(pvd, base + PCICAP_EXP_DEVCTL, 2,
+			    PCI_REG_FLAG_WRITE,
+			    npu_dev_cfg_exp_devcap, NULL);
+
 	/* 0x08 - Device control and status */
 	PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DEVCTL, 4, 0x00002810,
 			 0xffff0000, 0x000f0000);