Message ID | 20180613132414.32207-1-mpe@ellerman.id.au (mailing list archive) |
---|---|
State | Accepted |
Commit | 54dbcfc211f15586c57d27492f938eb4df964257 |
Headers | show |
Series | powerpc/64s: Report SLB multi-hit rather than parity error | expand |
On Wed, 13 Jun 2018 23:24:14 +1000 Michael Ellerman <mpe@ellerman.id.au> wrote: > When we take an SLB multi-hit on bare metal, we see both the multi-hit > and parity error bits set in DSISR. The user manuals indicates this is > expected to always happen on Power8, whereas on Power9 it says a > multi-hit will "usually" also cause a parity error. > > We decide what to do based on the various error tables in mce_power.c, > and because we process them in order and only report the first, we > currently always report a parity error but not the multi-hit, eg: > > Severe Machine check interrupt [Recovered] > Initiator: CPU > Error type: SLB [Parity] > Effective address: c000000ffffd4300 > > Although this is correct, it leaves the user wondering why they got a > parity error. It would be clearer instead if we reported the > multi-hit because that is more likely to be simply a software bug, > whereas a true parity error is possibly an indication of a bad core. > > We can do that simply by reordering the error tables so that multi-hit > appears before parity. That doesn't affect the error recovery at all, > because we flush the SLB either way. Yeah this is a good idea. I wonder if there are any other conditions like this that should be reordered. I think the i-side should not have to be changed here because it matches the value not bits, so that shouldn't matter. A bit of a shame we don't report i/d side, and ideally we'd be able to report multiple conditions. The reporting APIs really want to be massaged a bit, but for now this is a good step. 
Reviewed-by: Nicholas Piggin <npiggin@gmail.com> > > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > --- > arch/powerpc/kernel/mce_power.c | 36 ++++++++++++++++++------------------ > 1 file changed, 18 insertions(+), 18 deletions(-) > > diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c > index 38c5b4764bfe..1e450d0c4f72 100644 > --- a/arch/powerpc/kernel/mce_power.c > +++ b/arch/powerpc/kernel/mce_power.c > @@ -140,12 +140,12 @@ static const struct mce_ierror_table mce_p7_ierror_table[] = { > { 0x00000000001c0000, 0x0000000000040000, true, > MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > +{ 0x00000000001c0000, 0x00000000000c0000, true, > + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ > + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000000001c0000, 0x0000000000080000, true, > MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > -{ 0x00000000001c0000, 0x00000000000c0000, true, > - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, > - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000000001c0000, 0x0000000000100000, true, > MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > @@ -164,12 +164,12 @@ static const struct mce_ierror_table mce_p8_ierror_table[] = { > { 0x00000000081c0000, 0x0000000000040000, true, > MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > +{ 0x00000000081c0000, 0x00000000000c0000, true, > + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ > + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000000081c0000, 0x0000000000080000, true, > MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > -{ 0x00000000081c0000, 0x00000000000c0000, true, > - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, > - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000000081c0000, 0x0000000000100000, true, > 
MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > @@ -194,12 +194,12 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = { > { 0x00000000081c0000, 0x0000000000040000, true, > MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > +{ 0x00000000081c0000, 0x00000000000c0000, true, > + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ > + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000000081c0000, 0x0000000000080000, true, > MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > -{ 0x00000000081c0000, 0x00000000000c0000, true, > - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, > - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000000081c0000, 0x0000000000100000, true, > MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > @@ -257,12 +257,12 @@ static const struct mce_derror_table mce_p7_derror_table[] = { > { 0x00000400, true, > MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > +{ 0x00000080, true, > + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ > + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000100, true, > MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > -{ 0x00000080, true, > - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, > - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000040, true, > MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > @@ -290,12 +290,12 @@ static const struct mce_derror_table mce_p8_derror_table[] = { > { 0x00000200, true, > MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */ > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > +{ 0x00000080, true, > + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ > + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000100, true, > MCE_ERROR_TYPE_SLB, 
MCE_SLB_ERROR_PARITY, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > -{ 0x00000080, true, > - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, > - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0, false, 0, 0, 0, 0 } }; > > static const struct mce_derror_table mce_p9_derror_table[] = { > @@ -320,12 +320,12 @@ static const struct mce_derror_table mce_p9_derror_table[] = { > { 0x00000200, false, > MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > +{ 0x00000080, true, > + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ > + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000100, true, > MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > -{ 0x00000080, true, > - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, > - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, > { 0x00000040, true, > MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, > MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
Nicholas Piggin <npiggin@gmail.com> writes: > On Wed, 13 Jun 2018 23:24:14 +1000 > Michael Ellerman <mpe@ellerman.id.au> wrote: > >> When we take an SLB multi-hit on bare metal, we see both the multi-hit >> and parity error bits set in DSISR. The user manuals indicates this is >> expected to always happen on Power8, whereas on Power9 it says a >> multi-hit will "usually" also cause a parity error. >> >> We decide what to do based on the various error tables in mce_power.c, >> and because we process them in order and only report the first, we >> currently always report a parity error but not the multi-hit, eg: >> >> Severe Machine check interrupt [Recovered] >> Initiator: CPU >> Error type: SLB [Parity] >> Effective address: c000000ffffd4300 >> >> Although this is correct, it leaves the user wondering why they got a >> parity error. It would be clearer instead if we reported the >> multi-hit because that is more likely to be simply a software bug, >> whereas a true parity error is possibly an indication of a bad core. >> >> We can do that simply by reordering the error tables so that multi-hit >> appears before parity. That doesn't affect the error recovery at all, >> because we flush the SLB either way. > > Yeah this is a good idea. I wonder if there are any other conditions > like this that should be reordered. Yeah good point, this one just caught my eye because I was testing it. Ideally it wouldn't matter and we could actually report multiple, but that would be a bit of a bigger change. > I think the i-side should not have to be changed here because it > matches the value not bits, so that shouldn't matter. Ah OK, will check. > A bit of a shame we don't report i/d side, and ideally we'd be able > to report multiple conditions. The reporting APIs really want to be > massaged a bit, but for now this is a good step. Ah snap, yep, more detail & multiple conditions would be nice. I don't really understand the way we do the reporting now. 
The struct machine_check_event is all carefully laid out with reserved fields and a version number and everything as if it's an ABI. But AFAICS it's purely internal to the kernel. And then we have struct mce_error_info, but that's a separate thing and struct machine_check_event doesn't contain one of them? cheers
On Fri, 15 Jun 2018 21:37:15 +1000 Michael Ellerman <mpe@ellerman.id.au> wrote: > Nicholas Piggin <npiggin@gmail.com> writes: > > On Wed, 13 Jun 2018 23:24:14 +1000 > > Michael Ellerman <mpe@ellerman.id.au> wrote: > > > >> When we take an SLB multi-hit on bare metal, we see both the multi-hit > >> and parity error bits set in DSISR. The user manuals indicates this is > >> expected to always happen on Power8, whereas on Power9 it says a > >> multi-hit will "usually" also cause a parity error. > >> > >> We decide what to do based on the various error tables in mce_power.c, > >> and because we process them in order and only report the first, we > >> currently always report a parity error but not the multi-hit, eg: > >> > >> Severe Machine check interrupt [Recovered] > >> Initiator: CPU > >> Error type: SLB [Parity] > >> Effective address: c000000ffffd4300 > >> > >> Although this is correct, it leaves the user wondering why they got a > >> parity error. It would be clearer instead if we reported the > >> multi-hit because that is more likely to be simply a software bug, > >> whereas a true parity error is possibly an indication of a bad core. > >> > >> We can do that simply by reordering the error tables so that multi-hit > >> appears before parity. That doesn't affect the error recovery at all, > >> because we flush the SLB either way. > > > > Yeah this is a good idea. I wonder if there are any other conditions > > like this that should be reordered. > > Yeah good point, this one just caught my eye because I was testing it. > Ideally it wouldn't matter and we could actually report multiple, but > that would be a bit of a bigger change. Yep this patch looks fine for a minimal fix. > > > I think the i-side should not have to be changed here because it > > matches the value not bits, so that shouldn't matter. > > Ah OK, will check. > > > A bit of a shame we don't report i/d side, and ideally we'd be able > > to report multiple conditions. 
The reporting APIs really want to be > > massaged a bit, but for now this is a good step. > > Ah snap, yep, more detail & multiple conditions would be nice. > > I don't really understand the way we do the reporting now. The > struct machine_check_event is all carefully laid out with reserved > fields and a version number and everything as if it's an ABI. But AFAICS > it's purely internal to the kernel. > > And then we have struct mce_error_info, but that's a separate thing and > struct machine_check_event doesn't contain one of them? Yeah I noticed that too a while back, was it an old OPAL API or maybe a proposed new API that was never implemented? I would like to end up doing most MCE decoding in firmware at some point, but I don't think it's worth keeping this existing ABI thing around for it. Thanks, Nick
On Wed, 2018-06-13 at 13:24:14 UTC, Michael Ellerman wrote: > When we take an SLB multi-hit on bare metal, we see both the multi-hit > and parity error bits set in DSISR. The user manuals indicates this is > expected to always happen on Power8, whereas on Power9 it says a > multi-hit will "usually" also cause a parity error. > > We decide what to do based on the various error tables in mce_power.c, > and because we process them in order and only report the first, we > currently always report a parity error but not the multi-hit, eg: > > Severe Machine check interrupt [Recovered] > Initiator: CPU > Error type: SLB [Parity] > Effective address: c000000ffffd4300 > > Although this is correct, it leaves the user wondering why they got a > parity error. It would be clearer instead if we reported the > multi-hit because that is more likely to be simply a software bug, > whereas a true parity error is possibly an indication of a bad core. > > We can do that simply by reordering the error tables so that multi-hit > appears before parity. That doesn't affect the error recovery at all, > because we flush the SLB either way. > > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Applied to powerpc next. https://git.kernel.org/powerpc/c/54dbcfc211f15586c57d27492f938e cheers
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 38c5b4764bfe..1e450d0c4f72 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -140,12 +140,12 @@ static const struct mce_ierror_table mce_p7_ierror_table[] = { { 0x00000000001c0000, 0x0000000000040000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000001c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0x00000000001c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000001c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, @@ -164,12 +164,12 @@ static const struct mce_ierror_table mce_p8_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0x00000000081c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, @@ -194,12 +194,12 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 
0x00000000081c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0x00000000081c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, @@ -257,12 +257,12 @@ static const struct mce_derror_table mce_p7_derror_table[] = { { 0x00000400, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0x00000080, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000040, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, @@ -290,12 +290,12 @@ static const struct mce_derror_table mce_p8_derror_table[] = { { 0x00000200, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0x00000080, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0, false, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p9_derror_table[] = { @@ -320,12 +320,12 @@ static const struct mce_derror_table mce_p9_derror_table[] = { { 0x00000200, false, MCE_ERROR_TYPE_USER, 
MCE_USER_ERROR_TLBIE, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0x00000080, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000040, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
When we take an SLB multi-hit on bare metal, we see both the multi-hit and parity error bits set in DSISR. The user manuals indicate this is expected to always happen on Power8, whereas on Power9 they say a multi-hit will "usually" also cause a parity error. We decide what to do based on the various error tables in mce_power.c, and because we process them in order and only report the first, we currently always report a parity error but not the multi-hit, eg: Severe Machine check interrupt [Recovered] Initiator: CPU Error type: SLB [Parity] Effective address: c000000ffffd4300 Although this is correct, it leaves the user wondering why they got a parity error. It would be clearer instead if we reported the multi-hit because that is more likely to be simply a software bug, whereas a true parity error is possibly an indication of a bad core. We can do that simply by reordering the error tables so that multi-hit appears before parity. That doesn't affect the error recovery at all, because we flush the SLB either way. Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> --- arch/powerpc/kernel/mce_power.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-)