Message ID | 20190909123151.21944-11-fbarrat@linux.ibm.com |
---|---|
State | Superseded |
Headers | show |
Series | opencapi: enable card reset and link retraining | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/apply_patch | success | Successfully applied on branch master (470ffb5f29d741c3bed600f7bb7bf0cbb270e05a) |
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot | success | Test snowpatch/job/snowpatch-skiboot on branch master |
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco | success | Signed-off-by present |
On 09/09/2019 14:31, Frederic Barrat wrote: > Modify slightly the ordering of a few steps in our init sequence on > fundamental reset, so that it can be called from the OS, when the link > is already up: > > - when the card is reset, the link goes down, so we need to fence the > brick to prevent errors propagating to the NPU and OS > - since fencing and unfencing don't require any delay, let's also > fence/unfence during the very first reset at boot. It's useless but > doesn't hurt and keep the code simpler. > - resetting the PHY must be done a bit later, while fenced and the ODL > and DLx in reset > > Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com> > --- > hw/npu2-opencapi.c | 48 +++++++++++++++++++++++++--------------------- > include/npu2.h | 2 -- > 2 files changed, 26 insertions(+), 24 deletions(-) > > diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c > index f7be9f09..619d4be8 100644 > --- a/hw/npu2-opencapi.c > +++ b/hw/npu2-opencapi.c > @@ -1039,6 +1039,28 @@ static int64_t npu2_opencapi_get_presence_state(struct pci_slot __unused *slot, > return OPAL_SUCCESS; > } > > +static void fence_brick(struct npu2_dev *dev) > +{ > + OCAPIDBG(dev, "Fencing brick\n"); > + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, > + dev->brick_index, 0b11); > + /* from 13.2.1, Quiesce Fence State */ > + npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, > + PPC_BIT(dev->brick_index + 6)); > +} > + > +static void unfence_brick(struct npu2_dev *dev) > +{ > + OCAPIDBG(dev, "Unfencing brick\n"); > + npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, > + PPC_BIT(dev->brick_index)); > + > + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, > + dev->brick_index, 0b10); Unfence the brick implies to reset the PowerBus (0b10)? This has been already done in fence_brick() ? 
> + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, > + dev->brick_index, 0b00); > +} > + > static enum OpalShpcLinkState get_link_width(uint64_t odl_status) > { > uint64_t tx_lanes, rx_lanes, state; > @@ -1153,7 +1175,7 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot) > return OPAL_HARDWARE; > } > > -static int64_t npu2_opencapi_creset(struct pci_slot *slot __unused) > +static int64_t npu2_opencapi_creset(struct pci_slot *slot) > { > struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); > > @@ -1183,19 +1205,10 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot) > OCAPIINF(dev, "no card detected\n"); > return OPAL_SUCCESS; > } > - if (dev->train_need_fence) { > - OCAPIDBG(dev, "Fencing OTL during reset\n"); > - set_fence_control(chip_id, dev->npu->xscom_base, > - dev->brick_index, 0b11); > - npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, > - PPC_BIT(dev->brick_index + 6)); > - dev->train_fenced = true; > - } > - dev->train_need_fence = true; > slot->link_retries = OCAPI_LINK_TRAINING_RETRIES; > - npu2_opencapi_phy_reset(dev); > /* fall-through */ > case OCAPI_SLOT_FRESET_INIT: > + fence_brick(dev); > assert_odl_reset(chip_id, dev->brick_index); > assert_adapter_reset(dev); > pci_slot_set_state(slot, > @@ -1204,6 +1217,7 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot) > return pci_slot_set_sm_timeout(slot, msecs_to_tb(5)); > > case OCAPI_SLOT_FRESET_ASSERT_DELAY: > + npu2_opencapi_phy_reset(dev); > deassert_odl_reset(chip_id, dev->brick_index); > pci_slot_set_state(slot, > OCAPI_SLOT_FRESET_DEASSERT_DELAY); > @@ -1221,15 +1235,7 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot) > return pci_slot_set_sm_timeout(slot, msecs_to_tb(250)); > > case OCAPI_SLOT_FRESET_DEASSERT_DELAY2: > - if (dev->train_fenced) { > - OCAPIDBG(dev, "Unfencing OTL after reset\n"); > - npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, > - PPC_BIT(dev->brick_index)); > - set_fence_control(chip_id, dev->npu->xscom_base, > - 
dev->brick_index, 0b00); > - dev->train_fenced = false; > - } > - > + unfence_brick(dev); > set_init_pattern(chip_id, dev); > pci_slot_set_state(slot, > OCAPI_SLOT_FRESET_INIT_DELAY); > @@ -1692,8 +1698,6 @@ static void setup_device(struct npu2_dev *dev) > > dev->bdfn = 0; > dev->linux_pe = -1; > - dev->train_need_fence = false; > - dev->train_fenced = false; > > /* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */ > /* Procedure 13.1.3.8 - AFU MMIO Range BARs */ > diff --git a/include/npu2.h b/include/npu2.h > index d2316dc1..6b1063da 100644 > --- a/include/npu2.h > +++ b/include/npu2.h > @@ -145,8 +145,6 @@ struct npu2_dev { > /* OpenCAPI */ > struct phb phb_ocapi; > uint64_t linux_pe; > - bool train_need_fence; > - bool train_fenced; > unsigned long train_start; > unsigned long train_timeout; > }; >
Le 17/09/2019 à 11:45, christophe lombard a écrit : > On 09/09/2019 14:31, Frederic Barrat wrote: >> Modify slightly the ordering of a few steps in our init sequence on >> fundamental reset, so that it can be called from the OS, when the link >> is already up: >> >> - when the card is reset, the link goes down, so we need to fence the >> brick to prevent errors propagating to the NPU and OS >> - since fencing and unfencing don't require any delay, let's also >> fence/unfence during the very first reset at boot. It's useless but >> doesn't hurt and keep the code simpler. >> - resetting the PHY must be done a bit later, while fenced and the ODL >> and DLx in reset >> >> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com> >> --- >> hw/npu2-opencapi.c | 48 +++++++++++++++++++++++++--------------------- >> include/npu2.h | 2 -- >> 2 files changed, 26 insertions(+), 24 deletions(-) >> >> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c >> index f7be9f09..619d4be8 100644 >> --- a/hw/npu2-opencapi.c >> +++ b/hw/npu2-opencapi.c >> @@ -1039,6 +1039,28 @@ static int64_t >> npu2_opencapi_get_presence_state(struct pci_slot __unused *slot, >> return OPAL_SUCCESS; >> } >> +static void fence_brick(struct npu2_dev *dev) >> +{ >> + OCAPIDBG(dev, "Fencing brick\n"); >> + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, >> + dev->brick_index, 0b11); >> + /* from 13.2.1, Quiesce Fence State */ >> + npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, >> + PPC_BIT(dev->brick_index + 6)); >> +} >> + >> +static void unfence_brick(struct npu2_dev *dev) >> +{ >> + OCAPIDBG(dev, "Unfencing brick\n"); >> + npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, >> + PPC_BIT(dev->brick_index)); >> + >> + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, >> + dev->brick_index, 0b10); > > Unfence the brick implies to reset the PowerBus (0b10)? > This has been already done in fence_brick() ? There are 2 sides for the fence: on the power bus and on the link. 
To go from fully fenced to unfenced, we need to go through the transitional 0b10 state (i.e. unfence the link first. a.k.a half-fenced). I'm pretty sure it's described as such in the workbook, though I can't find it now that I'm looking for it. But it's showing up in section "13.2.3 Exiting NPU Quiesce state". And we also already do the same during the NPU init, as we go through a fence->unfence transition in set_npcq_config() Fred >> + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, >> + dev->brick_index, 0b00); >> +} >> + >> static enum OpalShpcLinkState get_link_width(uint64_t odl_status) >> { >> uint64_t tx_lanes, rx_lanes, state; >> @@ -1153,7 +1175,7 @@ static int64_t npu2_opencapi_poll_link(struct >> pci_slot *slot) >> return OPAL_HARDWARE; >> } >> -static int64_t npu2_opencapi_creset(struct pci_slot *slot __unused) >> +static int64_t npu2_opencapi_creset(struct pci_slot *slot) >> { >> struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); >> @@ -1183,19 +1205,10 @@ static int64_t npu2_opencapi_freset(struct >> pci_slot *slot) >> OCAPIINF(dev, "no card detected\n"); >> return OPAL_SUCCESS; >> } >> - if (dev->train_need_fence) { >> - OCAPIDBG(dev, "Fencing OTL during reset\n"); >> - set_fence_control(chip_id, dev->npu->xscom_base, >> - dev->brick_index, 0b11); >> - npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, >> - PPC_BIT(dev->brick_index + 6)); >> - dev->train_fenced = true; >> - } >> - dev->train_need_fence = true; >> slot->link_retries = OCAPI_LINK_TRAINING_RETRIES; >> - npu2_opencapi_phy_reset(dev); >> /* fall-through */ >> case OCAPI_SLOT_FRESET_INIT: >> + fence_brick(dev); >> assert_odl_reset(chip_id, dev->brick_index); >> assert_adapter_reset(dev); >> pci_slot_set_state(slot, >> @@ -1204,6 +1217,7 @@ static int64_t npu2_opencapi_freset(struct >> pci_slot *slot) >> return pci_slot_set_sm_timeout(slot, msecs_to_tb(5)); >> case OCAPI_SLOT_FRESET_ASSERT_DELAY: >> + npu2_opencapi_phy_reset(dev); >> deassert_odl_reset(chip_id, dev->brick_index); 
>> pci_slot_set_state(slot, >> OCAPI_SLOT_FRESET_DEASSERT_DELAY); >> @@ -1221,15 +1235,7 @@ static int64_t npu2_opencapi_freset(struct >> pci_slot *slot) >> return pci_slot_set_sm_timeout(slot, msecs_to_tb(250)); >> case OCAPI_SLOT_FRESET_DEASSERT_DELAY2: >> - if (dev->train_fenced) { >> - OCAPIDBG(dev, "Unfencing OTL after reset\n"); >> - npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, >> - PPC_BIT(dev->brick_index)); >> - set_fence_control(chip_id, dev->npu->xscom_base, >> - dev->brick_index, 0b00); >> - dev->train_fenced = false; >> - } >> - >> + unfence_brick(dev); >> set_init_pattern(chip_id, dev); >> pci_slot_set_state(slot, >> OCAPI_SLOT_FRESET_INIT_DELAY); >> @@ -1692,8 +1698,6 @@ static void setup_device(struct npu2_dev *dev) >> dev->bdfn = 0; >> dev->linux_pe = -1; >> - dev->train_need_fence = false; >> - dev->train_fenced = false; >> /* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */ >> /* Procedure 13.1.3.8 - AFU MMIO Range BARs */ >> diff --git a/include/npu2.h b/include/npu2.h >> index d2316dc1..6b1063da 100644 >> --- a/include/npu2.h >> +++ b/include/npu2.h >> @@ -145,8 +145,6 @@ struct npu2_dev { >> /* OpenCAPI */ >> struct phb phb_ocapi; >> uint64_t linux_pe; >> - bool train_need_fence; >> - bool train_fenced; >> unsigned long train_start; >> unsigned long train_timeout; >> }; >> >
diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c index f7be9f09..619d4be8 100644 --- a/hw/npu2-opencapi.c +++ b/hw/npu2-opencapi.c @@ -1039,6 +1039,28 @@ static int64_t npu2_opencapi_get_presence_state(struct pci_slot __unused *slot, return OPAL_SUCCESS; } +static void fence_brick(struct npu2_dev *dev) +{ + OCAPIDBG(dev, "Fencing brick\n"); + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, + dev->brick_index, 0b11); + /* from 13.2.1, Quiesce Fence State */ + npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, + PPC_BIT(dev->brick_index + 6)); +} + +static void unfence_brick(struct npu2_dev *dev) +{ + OCAPIDBG(dev, "Unfencing brick\n"); + npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, + PPC_BIT(dev->brick_index)); + + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, + dev->brick_index, 0b10); + set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, + dev->brick_index, 0b00); +} + static enum OpalShpcLinkState get_link_width(uint64_t odl_status) { uint64_t tx_lanes, rx_lanes, state; @@ -1153,7 +1175,7 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot) return OPAL_HARDWARE; } -static int64_t npu2_opencapi_creset(struct pci_slot *slot __unused) +static int64_t npu2_opencapi_creset(struct pci_slot *slot) { struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb); @@ -1183,19 +1205,10 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot) OCAPIINF(dev, "no card detected\n"); return OPAL_SUCCESS; } - if (dev->train_need_fence) { - OCAPIDBG(dev, "Fencing OTL during reset\n"); - set_fence_control(chip_id, dev->npu->xscom_base, - dev->brick_index, 0b11); - npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, - PPC_BIT(dev->brick_index + 6)); - dev->train_fenced = true; - } - dev->train_need_fence = true; slot->link_retries = OCAPI_LINK_TRAINING_RETRIES; - npu2_opencapi_phy_reset(dev); /* fall-through */ case OCAPI_SLOT_FRESET_INIT: + fence_brick(dev); assert_odl_reset(chip_id, dev->brick_index); assert_adapter_reset(dev); pci_slot_set_state(slot, 
@@ -1204,6 +1217,7 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot) return pci_slot_set_sm_timeout(slot, msecs_to_tb(5)); case OCAPI_SLOT_FRESET_ASSERT_DELAY: + npu2_opencapi_phy_reset(dev); deassert_odl_reset(chip_id, dev->brick_index); pci_slot_set_state(slot, OCAPI_SLOT_FRESET_DEASSERT_DELAY); @@ -1221,15 +1235,7 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot) return pci_slot_set_sm_timeout(slot, msecs_to_tb(250)); case OCAPI_SLOT_FRESET_DEASSERT_DELAY2: - if (dev->train_fenced) { - OCAPIDBG(dev, "Unfencing OTL after reset\n"); - npu2_write(dev->npu, NPU2_MISC_FENCE_STATE, - PPC_BIT(dev->brick_index)); - set_fence_control(chip_id, dev->npu->xscom_base, - dev->brick_index, 0b00); - dev->train_fenced = false; - } - + unfence_brick(dev); set_init_pattern(chip_id, dev); pci_slot_set_state(slot, OCAPI_SLOT_FRESET_INIT_DELAY); @@ -1692,8 +1698,6 @@ static void setup_device(struct npu2_dev *dev) dev->bdfn = 0; dev->linux_pe = -1; - dev->train_need_fence = false; - dev->train_fenced = false; /* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */ /* Procedure 13.1.3.8 - AFU MMIO Range BARs */ diff --git a/include/npu2.h b/include/npu2.h index d2316dc1..6b1063da 100644 --- a/include/npu2.h +++ b/include/npu2.h @@ -145,8 +145,6 @@ struct npu2_dev { /* OpenCAPI */ struct phb phb_ocapi; uint64_t linux_pe; - bool train_need_fence; - bool train_fenced; unsigned long train_start; unsigned long train_timeout; };
Slightly modify the ordering of a few steps in our init sequence on fundamental reset, so that it can be called from the OS, when the link is already up: - when the card is reset, the link goes down, so we need to fence the brick to prevent errors propagating to the NPU and OS - since fencing and unfencing don't require any delay, let's also fence/unfence during the very first reset at boot. It's useless but doesn't hurt and keeps the code simpler. - resetting the PHY must be done a bit later, while the brick is fenced and the ODL and DLx are in reset Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com> --- hw/npu2-opencapi.c | 48 +++++++++++++++++++++++++--------------------- include/npu2.h | 2 -- 2 files changed, 26 insertions(+), 24 deletions(-)