diff mbox series

[v2] soc: tegra: Add Tegra186 ARI driver

Message ID 20210617121307.792386-1-mperttunen@nvidia.com
State Accepted
Headers show
Series [v2] soc: tegra: Add Tegra186 ARI driver | expand

Commit Message

Mikko Perttunen June 17, 2021, 12:13 p.m. UTC
Add a driver to hook into panic notifiers and print machine check
status for debugging. Status information is retrieved via SMC. This
is supported by upstream ARM Trusted Firmware.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
---
v2:
* Changed to use panic notifier instead of serror hook
---
 drivers/soc/tegra/Makefile       |  1 +
 drivers/soc/tegra/ari-tegra186.c | 80 ++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 drivers/soc/tegra/ari-tegra186.c

Comments

Thierry Reding June 18, 2021, 12:03 p.m. UTC | #1
On Thu, Jun 17, 2021 at 03:13:07PM +0300, Mikko Perttunen wrote:
> Add a driver to hook into panic notifiers and print machine check
> status for debugging. Status information is retrieved via SMC. This
> is supported by upstream ARM Trusted Firmware.
> 
> Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
> ---
> v2:
> * Changed to use panic notifier instead of serror hook
> ---
>  drivers/soc/tegra/Makefile       |  1 +
>  drivers/soc/tegra/ari-tegra186.c | 80 ++++++++++++++++++++++++++++++++
>  2 files changed, 81 insertions(+)
>  create mode 100644 drivers/soc/tegra/ari-tegra186.c
> 
> diff --git a/drivers/soc/tegra/Makefile b/drivers/soc/tegra/Makefile
> index 9c809c1814bd..054e862b63d8 100644
> --- a/drivers/soc/tegra/Makefile
> +++ b/drivers/soc/tegra/Makefile
> @@ -7,3 +7,4 @@ obj-$(CONFIG_SOC_TEGRA_PMC) += pmc.o
>  obj-$(CONFIG_SOC_TEGRA_POWERGATE_BPMP) += powergate-bpmp.o
>  obj-$(CONFIG_SOC_TEGRA20_VOLTAGE_COUPLER) += regulators-tegra20.o
>  obj-$(CONFIG_SOC_TEGRA30_VOLTAGE_COUPLER) += regulators-tegra30.o
> +obj-$(CONFIG_ARCH_TEGRA_186_SOC) += ari-tegra186.o
> diff --git a/drivers/soc/tegra/ari-tegra186.c b/drivers/soc/tegra/ari-tegra186.c
> new file mode 100644
> index 000000000000..02577853ec49
> --- /dev/null
> +++ b/drivers/soc/tegra/ari-tegra186.c
> @@ -0,0 +1,80 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
> + */
> +
> +#include <linux/arm-smccc.h>
> +#include <linux/kernel.h>
> +#include <linux/of.h>
> +#include <linux/panic_notifier.h>
> +
> +#define SMC_SIP_INVOKE_MCE			0xc2ffff00
> +#define MCE_SMC_READ_MCA			12
> +
> +#define MCA_ARI_CMD_RD_SERR			1
> +
> +#define MCA_ARI_RW_SUBIDX_STAT			1
> +#define SERR_STATUS_VAL				BIT_ULL(63)
> +
> +#define MCA_ARI_RW_SUBIDX_ADDR			2
> +#define MCA_ARI_RW_SUBIDX_MSC1			3
> +#define MCA_ARI_RW_SUBIDX_MSC2			4
> +
> +static const char * const bank_names[] = {
> +	"SYS:DPMU", "ROC:IOB", "ROC:MCB", "ROC:CCE", "ROC:CQX", "ROC:CTU",
> +};
> +
> +static void read_uncore_mca(u8 cmd, u8 idx, u8 subidx, u8 inst, u64 *data)
> +{
> +	struct arm_smccc_res res;
> +
> +	arm_smccc_smc(SMC_SIP_INVOKE_MCE | MCE_SMC_READ_MCA,
> +		      ((u64)inst << 24) | ((u64)idx << 16) |
> +			      ((u64)subidx << 8) | ((u64)cmd << 0),
> +		      0, 0, 0, 0, 0, 0, &res);
> +
> +	*data = res.a2;
> +}
> +
> +static int tegra186_ari_panic_handler(struct notifier_block *nb,
> +				      unsigned long code, void *unused)
> +{
> +	u64 status;
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(bank_names); i++) {
> +		read_uncore_mca(MCA_ARI_CMD_RD_SERR, i, MCA_ARI_RW_SUBIDX_STAT,
> +				0, &status);
> +
> +		if (status & SERR_STATUS_VAL) {
> +			u64 addr, misc1, misc2;
> +
> +			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
> +					MCA_ARI_RW_SUBIDX_ADDR, 0, &addr);
> +			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
> +					MCA_ARI_RW_SUBIDX_MSC1, 0, &misc1);
> +			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
> +					MCA_ARI_RW_SUBIDX_MSC2, 0, &misc2);
> +
> +			pr_crit("Machine Check Error in %s\n"
> +				"  status=0x%llx addr=0x%llx\n"
> +				"  msc1=0x%llx msc2=0x%llx\n",
> +				bank_names[i], status, addr, misc1, misc2);

This still looks rather cryptic to me. Is there some way to further
decode things like the status and MSC registers? Or is this something
that people are supposed to know how to interpret?

Also, I'm not sure it's evident what those various banks are. Is there
some way we can provide a description for these?

Additional information doesn't necessarily have to go into code, but
it'd be nice if at least there was some sort of comment somewhere that
goes into a bit more detail so that people know how to use this. Or
perhaps this is documented in the TRM? If so, perhaps provide a
reference to that so that people know where to find the information.

Thierry
Mikko Perttunen June 18, 2021, 12:32 p.m. UTC | #2
On 6/18/21 3:03 PM, Thierry Reding wrote:
> On Thu, Jun 17, 2021 at 03:13:07PM +0300, Mikko Perttunen wrote:
>> Add a driver to hook into panic notifiers and print machine check
>> status for debugging. Status information is retrieved via SMC. This
>> is supported by upstream ARM Trusted Firmware.
>>
>> Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
>> ---
>> v2:
>> * Changed to use panic notifier instead of serror hook
>> ---
>>   drivers/soc/tegra/Makefile       |  1 +
>>   drivers/soc/tegra/ari-tegra186.c | 80 ++++++++++++++++++++++++++++++++
>>   2 files changed, 81 insertions(+)
>>   create mode 100644 drivers/soc/tegra/ari-tegra186.c
>>
>> diff --git a/drivers/soc/tegra/Makefile b/drivers/soc/tegra/Makefile
>> index 9c809c1814bd..054e862b63d8 100644
>> --- a/drivers/soc/tegra/Makefile
>> +++ b/drivers/soc/tegra/Makefile
>> @@ -7,3 +7,4 @@ obj-$(CONFIG_SOC_TEGRA_PMC) += pmc.o
>>   obj-$(CONFIG_SOC_TEGRA_POWERGATE_BPMP) += powergate-bpmp.o
>>   obj-$(CONFIG_SOC_TEGRA20_VOLTAGE_COUPLER) += regulators-tegra20.o
>>   obj-$(CONFIG_SOC_TEGRA30_VOLTAGE_COUPLER) += regulators-tegra30.o
>> +obj-$(CONFIG_ARCH_TEGRA_186_SOC) += ari-tegra186.o
>> diff --git a/drivers/soc/tegra/ari-tegra186.c b/drivers/soc/tegra/ari-tegra186.c
>> new file mode 100644
>> index 000000000000..02577853ec49
>> --- /dev/null
>> +++ b/drivers/soc/tegra/ari-tegra186.c
>> @@ -0,0 +1,80 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
>> + */
>> +
>> +#include <linux/arm-smccc.h>
>> +#include <linux/kernel.h>
>> +#include <linux/of.h>
>> +#include <linux/panic_notifier.h>
>> +
>> +#define SMC_SIP_INVOKE_MCE			0xc2ffff00
>> +#define MCE_SMC_READ_MCA			12
>> +
>> +#define MCA_ARI_CMD_RD_SERR			1
>> +
>> +#define MCA_ARI_RW_SUBIDX_STAT			1
>> +#define SERR_STATUS_VAL				BIT_ULL(63)
>> +
>> +#define MCA_ARI_RW_SUBIDX_ADDR			2
>> +#define MCA_ARI_RW_SUBIDX_MSC1			3
>> +#define MCA_ARI_RW_SUBIDX_MSC2			4
>> +
>> +static const char * const bank_names[] = {
>> +	"SYS:DPMU", "ROC:IOB", "ROC:MCB", "ROC:CCE", "ROC:CQX", "ROC:CTU",
>> +};
>> +
>> +static void read_uncore_mca(u8 cmd, u8 idx, u8 subidx, u8 inst, u64 *data)
>> +{
>> +	struct arm_smccc_res res;
>> +
>> +	arm_smccc_smc(SMC_SIP_INVOKE_MCE | MCE_SMC_READ_MCA,
>> +		      ((u64)inst << 24) | ((u64)idx << 16) |
>> +			      ((u64)subidx << 8) | ((u64)cmd << 0),
>> +		      0, 0, 0, 0, 0, 0, &res);
>> +
>> +	*data = res.a2;
>> +}
>> +
>> +static int tegra186_ari_panic_handler(struct notifier_block *nb,
>> +				      unsigned long code, void *unused)
>> +{
>> +	u64 status;
>> +	int i;
>> +
>> +	for (i = 0; i < ARRAY_SIZE(bank_names); i++) {
>> +		read_uncore_mca(MCA_ARI_CMD_RD_SERR, i, MCA_ARI_RW_SUBIDX_STAT,
>> +				0, &status);
>> +
>> +		if (status & SERR_STATUS_VAL) {
>> +			u64 addr, misc1, misc2;
>> +
>> +			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
>> +					MCA_ARI_RW_SUBIDX_ADDR, 0, &addr);
>> +			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
>> +					MCA_ARI_RW_SUBIDX_MSC1, 0, &misc1);
>> +			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
>> +					MCA_ARI_RW_SUBIDX_MSC2, 0, &misc2);
>> +
>> +			pr_crit("Machine Check Error in %s\n"
>> +				"  status=0x%llx addr=0x%llx\n"
>> +				"  msc1=0x%llx msc2=0x%llx\n",
>> +				bank_names[i], status, addr, misc1, misc2);
> 
> This still looks rather cryptic to me. Is there some way to further
> decode things like the status and MSC registers? Or is this something
> that people are supposed to know how to interpret?

Indeed, it is very cryptic. The corresponding downstream driver does a 
lot of work to print more human-readable descriptions of these - that's 
what I used as a decoding reference as well myself.

I was thinking that having the barebones here is good to at least have 
the error data to decode manually, and we can add the more 
human-readable decoding afterwards, or perhaps provide some script to 
decode it.

> 
> Also, I'm not sure it's evident what those various banks are. Is there
> some way we can provide a description for these?

I don't know if the bank names themselves are very useful - each bank is 
decoded differently and the decoded information is the useful part.

> 
> Additional information doesn't necessarily have to go into code, but
> it'd be nice if at least there was some sort of comment somewhere that
> goes into a bit more detail so that people know how to use this. Or
> perhaps this is documented in the TRM? If so, perhaps provide a
> reference to that so that people know where to find the information.

I don't know if the TRM has this information. How to decode these values 
can be seen in

https://nv-tegra.nvidia.com/gitweb/?p=linux-nvidia.git;a=blob;f=drivers/platform/tegra/ari_mca.c;h=040d05978ca49755a68365bebe7c46f6628c5162;hb=6dc57fec39c444e4c4448be61ddd19c55693daf1

and

https://nv-tegra.nvidia.com/gitweb/?p=linux-nvidia.git;a=blob;f=include/linux/platform/tegra/ari_mca.h;h=e6e4ac3abf674a1fc93f7b0dbcac4d6e672772d6;hb=6dc57fec39c444e4c4448be61ddd19c55693daf1

Not sure if these kinds of links are very appropriate as references though.

> 
> Thierry
> 

Thanks,
Mikko
Thierry Reding Oct. 7, 2021, 6:47 p.m. UTC | #3
On Thu, Jun 17, 2021 at 03:13:07PM +0300, Mikko Perttunen wrote:
> Add a driver to hook into panic notifiers and print machine check
> status for debugging. Status information is retrieved via SMC. This
> is supported by upstream ARM Trusted Firmware.
> 
> Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
> ---
> v2:
> * Changed to use panic notifier instead of serror hook
> ---
>  drivers/soc/tegra/Makefile       |  1 +
>  drivers/soc/tegra/ari-tegra186.c | 80 ++++++++++++++++++++++++++++++++
>  2 files changed, 81 insertions(+)
>  create mode 100644 drivers/soc/tegra/ari-tegra186.c

Applied, thanks.

Thierry
diff mbox series

Patch

diff --git a/drivers/soc/tegra/Makefile b/drivers/soc/tegra/Makefile
index 9c809c1814bd..054e862b63d8 100644
--- a/drivers/soc/tegra/Makefile
+++ b/drivers/soc/tegra/Makefile
@@ -7,3 +7,4 @@  obj-$(CONFIG_SOC_TEGRA_PMC) += pmc.o
 obj-$(CONFIG_SOC_TEGRA_POWERGATE_BPMP) += powergate-bpmp.o
 obj-$(CONFIG_SOC_TEGRA20_VOLTAGE_COUPLER) += regulators-tegra20.o
 obj-$(CONFIG_SOC_TEGRA30_VOLTAGE_COUPLER) += regulators-tegra30.o
+obj-$(CONFIG_ARCH_TEGRA_186_SOC) += ari-tegra186.o
diff --git a/drivers/soc/tegra/ari-tegra186.c b/drivers/soc/tegra/ari-tegra186.c
new file mode 100644
index 000000000000..02577853ec49
--- /dev/null
+++ b/drivers/soc/tegra/ari-tegra186.c
@@ -0,0 +1,80 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/panic_notifier.h>
+
+#define SMC_SIP_INVOKE_MCE			0xc2ffff00
+#define MCE_SMC_READ_MCA			12
+
+#define MCA_ARI_CMD_RD_SERR			1
+
+#define MCA_ARI_RW_SUBIDX_STAT			1
+#define SERR_STATUS_VAL				BIT_ULL(63)
+
+#define MCA_ARI_RW_SUBIDX_ADDR			2
+#define MCA_ARI_RW_SUBIDX_MSC1			3
+#define MCA_ARI_RW_SUBIDX_MSC2			4
+
+static const char * const bank_names[] = {
+	"SYS:DPMU", "ROC:IOB", "ROC:MCB", "ROC:CCE", "ROC:CQX", "ROC:CTU",
+};
+
+static void read_uncore_mca(u8 cmd, u8 idx, u8 subidx, u8 inst, u64 *data)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_smc(SMC_SIP_INVOKE_MCE | MCE_SMC_READ_MCA,
+		      ((u64)inst << 24) | ((u64)idx << 16) |
+			      ((u64)subidx << 8) | ((u64)cmd << 0),
+		      0, 0, 0, 0, 0, 0, &res);
+
+	*data = res.a2;
+}
+
+static int tegra186_ari_panic_handler(struct notifier_block *nb,
+				      unsigned long code, void *unused)
+{
+	u64 status;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(bank_names); i++) {
+		read_uncore_mca(MCA_ARI_CMD_RD_SERR, i, MCA_ARI_RW_SUBIDX_STAT,
+				0, &status);
+
+		if (status & SERR_STATUS_VAL) {
+			u64 addr, misc1, misc2;
+
+			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
+					MCA_ARI_RW_SUBIDX_ADDR, 0, &addr);
+			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
+					MCA_ARI_RW_SUBIDX_MSC1, 0, &misc1);
+			read_uncore_mca(MCA_ARI_CMD_RD_SERR, i,
+					MCA_ARI_RW_SUBIDX_MSC2, 0, &misc2);
+
+			pr_crit("Machine Check Error in %s\n"
+				"  status=0x%llx addr=0x%llx\n"
+				"  msc1=0x%llx msc2=0x%llx\n",
+				bank_names[i], status, addr, misc1, misc2);
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block tegra186_ari_panic_nb = {
+	.notifier_call = tegra186_ari_panic_handler,
+};
+
+static int __init tegra186_ari_init(void)
+{
+	if (of_machine_is_compatible("nvidia,tegra186"))
+		atomic_notifier_chain_register(&panic_notifier_list, &tegra186_ari_panic_nb);
+
+	return 0;
+}
+early_initcall(tegra186_ari_init);