[v4,14/25] powernv/fadump: process the crashdump by exporting it as /proc/vmcore
diff mbox series

Message ID 156327681824.27462.1314030665685342118.stgit@hbathini.in.ibm.com
State Changes Requested
Headers show
Series
  • Add FADump support on PowerNV platform
Related show

Checks

Context Check Description
snowpatch_ozlabs/apply_patch fail Failed to apply to any branch

Commit Message

Hari Bathini July 16, 2019, 11:33 a.m. UTC
Add support in the kernel to process the crashed kernel's memory
preserved during MPIPL and export it as the /proc/vmcore file for
userland scripts to filter and analyze later.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
---
 arch/powerpc/platforms/powernv/opal-fadump.c |  190 ++++++++++++++++++++++++++
 1 file changed, 187 insertions(+), 3 deletions(-)

Comments

Mahesh J Salgaonkar Aug. 14, 2019, 10:18 a.m. UTC | #1
On 2019-07-16 17:03:38 Tue, Hari Bathini wrote:
> Add support in the kernel to process the crash'ed kernel's memory
> preserved during MPIPL and export it as /proc/vmcore file for the
> userland scripts to filter and analyze it later.
> 
> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/opal-fadump.c |  190 ++++++++++++++++++++++++++
>  1 file changed, 187 insertions(+), 3 deletions(-)
> 
[...]
> +		ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
> +		if ((ret != OPAL_SUCCESS) || !addr) {
> +			pr_err("Failed to get Kernel metadata (%lld)\n", ret);
> +			return 1;
> +		}
> +
> +		addr = be64_to_cpu(addr);
> +		pr_debug("Kernel metadata addr: %llx\n", addr);
> +
> +		opal_fdm_active = __va(addr);
> +		r_opal_fdm_active = (void *)addr;
> +		if (r_opal_fdm_active->version != OPAL_FADUMP_VERSION) {
> +			pr_err("FADump active but version (%u) unsupported!\n",
> +			       r_opal_fdm_active->version);
> +			return 1;
> +		}
> +
> +		/* Kernel regions not registered with f/w  for MPIPL */
> +		if (r_opal_fdm_active->registered_regions == 0) {
> +			opal_fdm_active = NULL;

What about the partial dump capture scenario? What if OPAL crashes while the
kernel is in the middle of registering ranges? We may have a partial dump
captured, which won't be useful.
E.g., if we have a total of 4 ranges to be registered and OPAL crashes
after successful registration of only 2 ranges with 2 pending, we will get a
partial dump, which needs to be ignored.

I think the check should be comparing registered_regions against the total
number of regions. What do you think?

Thanks,
-Mahesh.

> +			return 1;
> +		}
> +
> +		pr_info("Firmware-assisted dump is active.\n");
> +		fadump_conf->dump_active = 1;
> +		opal_fadump_get_config(fadump_conf, r_opal_fdm_active);
> +	}
> +
>  	return 1;
>  }
>
Hari Bathini Aug. 14, 2019, 11:11 a.m. UTC | #2
On 14/08/19 3:48 PM, Mahesh J Salgaonkar wrote:
> On 2019-07-16 17:03:38 Tue, Hari Bathini wrote:
>> Add support in the kernel to process the crash'ed kernel's memory
>> preserved during MPIPL and export it as /proc/vmcore file for the
>> userland scripts to filter and analyze it later.
>>
>> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
>> ---
>>  arch/powerpc/platforms/powernv/opal-fadump.c |  190 ++++++++++++++++++++++++++
>>  1 file changed, 187 insertions(+), 3 deletions(-)
>>
> [...]
>> +		ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
>> +		if ((ret != OPAL_SUCCESS) || !addr) {
>> +			pr_err("Failed to get Kernel metadata (%lld)\n", ret);
>> +			return 1;
>> +		}
>> +
>> +		addr = be64_to_cpu(addr);
>> +		pr_debug("Kernel metadata addr: %llx\n", addr);
>> +
>> +		opal_fdm_active = __va(addr);
>> +		r_opal_fdm_active = (void *)addr;
>> +		if (r_opal_fdm_active->version != OPAL_FADUMP_VERSION) {
>> +			pr_err("FADump active but version (%u) unsupported!\n",
>> +			       r_opal_fdm_active->version);
>> +			return 1;
>> +		}
>> +
>> +		/* Kernel regions not registered with f/w  for MPIPL */
>> +		if (r_opal_fdm_active->registered_regions == 0) {
>> +			opal_fdm_active = NULL;
> 
> What about the partial dump capture scenario? What if OPAL crashes while the
> kernel is in the middle of registering ranges? We may have a partial dump
> captured, which won't be useful.
> E.g., if we have a total of 4 ranges to be registered and OPAL crashes
> after successful registration of only 2 ranges with 2 pending, we will get a
> partial dump, which needs to be ignored.
> 
> I think the check should be comparing registered_regions against the total
> number of regions. What do you think?

Yes, Mahesh.
Taking care of that in 22/25

Thanks
Hari

Patch
diff mbox series

diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index 9c68c83..dffc0e7 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -18,6 +18,7 @@ 
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
 #include <linux/mm.h>
+#include <linux/crash_dump.h>
 
 #include <asm/page.h>
 #include <asm/opal.h>
@@ -25,6 +26,7 @@ 
 #include "../../kernel/fadump-common.h"
 #include "opal-fadump.h"
 
+static const struct opal_fadump_mem_struct *opal_fdm_active;
 static struct opal_fadump_mem_struct *opal_fdm;
 
 static void opal_fadump_update_config(struct fw_dump *fadump_conf,
@@ -41,6 +43,50 @@  static void opal_fadump_update_config(struct fw_dump *fadump_conf,
 		 fadump_conf->boot_mem_dest_addr);
 
 	fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr;
+
+	/* Start address of preserve area (permanent reservation) */
+	fadump_conf->preserv_area_start = fadump_conf->boot_mem_dest_addr;
+	pr_debug("Preserve area start address: 0x%lx\n",
+		 fadump_conf->preserv_area_start);
+}
+
+/*
+ * This function is called in the capture kernel to get configuration details
+ * from metadata setup by the first kernel.
+ */
+static void opal_fadump_get_config(struct fw_dump *fadump_conf,
+				   const struct opal_fadump_mem_struct *fdm)
+{
+	unsigned long base, size, last_end, hole_size;
+	int i;
+
+	if (!fadump_conf->dump_active)
+		return;
+
+	last_end = 0;
+	hole_size = 0;
+	fadump_conf->boot_memory_size = 0;
+
+	if (fdm->region_cnt)
+		pr_debug("Boot memory regions:\n");
+
+	for (i = 0; i < fdm->region_cnt; i++) {
+		base = fdm->rgn[i].src;
+		size = fdm->rgn[i].size;
+		pr_debug("\t%d. base: 0x%lx, size: 0x%lx\n",
+			 (i + 1), base, size);
+
+		fadump_conf->boot_mem_addr[i] = base;
+		fadump_conf->boot_mem_size[i] = size;
+		fadump_conf->boot_memory_size += size;
+		hole_size += (base - last_end);
+
+		last_end = base + size;
+	}
+
+	fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size);
+	fadump_conf->boot_mem_regs_cnt = fdm->region_cnt;
+	opal_fadump_update_config(fadump_conf, fdm);
 }
 
 static ulong opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
@@ -174,27 +220,127 @@  static int opal_fadump_unregister_fadump(struct fw_dump *fadump_conf)
 
 static int opal_fadump_invalidate_fadump(struct fw_dump *fadump_conf)
 {
-	return -EIO;
+	s64 rc;
+
+	rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0);
+	if (rc) {
+		pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc);
+		return -EIO;
+	}
+
+	fadump_conf->dump_active = 0;
+	opal_fdm_active = NULL;
+	return 0;
+}
+
+/*
+ * Convert CPU state data saved at the time of crash into ELF notes.
+ */
+static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
+{
+	u32 num_cpus, *note_buf;
+	struct fadump_crash_info_header *fdh = NULL;
+
+	num_cpus = 1;
+	/* Allocate buffer to hold cpu crash notes. */
+	fadump_conf->cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
+	fadump_conf->cpu_notes_buf_size =
+		PAGE_ALIGN(fadump_conf->cpu_notes_buf_size);
+	note_buf = fadump_cpu_notes_buf_alloc(fadump_conf->cpu_notes_buf_size);
+	if (!note_buf) {
+		pr_err("Failed to allocate 0x%lx bytes for cpu notes buffer\n",
+		       fadump_conf->cpu_notes_buf_size);
+		return -ENOMEM;
+	}
+	fadump_conf->cpu_notes_buf = __pa(note_buf);
+
+	pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
+		 (num_cpus * sizeof(note_buf_t)), note_buf);
+
+	if (fadump_conf->fadumphdr_addr)
+		fdh = __va(fadump_conf->fadumphdr_addr);
+
+	if (fdh && (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)) {
+		note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
+		final_note(note_buf);
+
+		pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+			 fdh->elfcorehdr_addr);
+		fadump_update_elfcore_header(fadump_conf,
+					     __va(fdh->elfcorehdr_addr));
+	}
+
+	return 0;
 }
 
 static int __init opal_fadump_process_fadump(struct fw_dump *fadump_conf)
 {
-	return -EINVAL;
+	struct fadump_crash_info_header *fdh;
+	int rc = 0;
+
+	if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
+		return -EINVAL;
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fadump_conf->fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		pr_err("Crash info header is not valid.\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * TODO: To build cpu notes, find a way to map PIR to logical id.
+	 *       Also, we may need different method for pseries and powernv.
+	 *       The currently booted kernel could have a different PIR to
+	 *       logical id mapping. So, try saving info of previous kernel's
+	 *       paca to get the right PIR to logical id mapping.
+	 */
+	rc = opal_fadump_build_cpu_notes(fadump_conf);
+	if (rc)
+		return rc;
+
+	/*
+	 * We are done validating dump info and elfcore header is now ready
+	 * to be exported. set elfcorehdr_addr so that vmcore module will
+	 * export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return rc;
 }
 
 static void opal_fadump_region_show(struct fw_dump *fadump_conf,
 				    struct seq_file *m)
 {
 	int i;
-	const struct opal_fadump_mem_struct *fdm_ptr = opal_fdm;
+	const struct opal_fadump_mem_struct *fdm_ptr;
 	u64 dumped_bytes = 0;
 
+	if (fadump_conf->dump_active)
+		fdm_ptr = opal_fdm_active;
+	else
+		fdm_ptr = opal_fdm;
+
 	for (i = 0; i < fdm_ptr->region_cnt; i++) {
+		/*
+		 * Only regions that are registered for MPIPL
+		 * would have dump data.
+		 */
+		if ((fadump_conf->dump_active) &&
+		    (i < fdm_ptr->registered_regions))
+			dumped_bytes = fdm_ptr->rgn[i].size;
+
 		seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
 			   fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest);
 		seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
 			   fdm_ptr->rgn[i].size, dumped_bytes);
 	}
+
+	/* Dump is active. Show reserved area start address. */
+	if (fadump_conf->dump_active) {
+		seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
+			   fadump_conf->reserve_dump_area_start);
+	}
 }
 
 static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
@@ -225,6 +371,7 @@  static struct fadump_ops opal_fadump_ops = {
 int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node)
 {
 	unsigned long dn;
+	const __be32 *prop;
 
 	/*
 	 * Check if Firmware-Assisted Dump is supported. if yes, check
@@ -251,5 +398,42 @@  int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node)
 	 */
 	fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
 
+	/*
+	 * Check if dump has been initiated on last reboot.
+	 */
+	prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+	if (prop) {
+		u64 addr = 0;
+		s64 ret;
+		const struct opal_fadump_mem_struct *r_opal_fdm_active;
+
+		ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+		if ((ret != OPAL_SUCCESS) || !addr) {
+			pr_err("Failed to get Kernel metadata (%lld)\n", ret);
+			return 1;
+		}
+
+		addr = be64_to_cpu(addr);
+		pr_debug("Kernel metadata addr: %llx\n", addr);
+
+		opal_fdm_active = __va(addr);
+		r_opal_fdm_active = (void *)addr;
+		if (r_opal_fdm_active->version != OPAL_FADUMP_VERSION) {
+			pr_err("FADump active but version (%u) unsupported!\n",
+			       r_opal_fdm_active->version);
+			return 1;
+		}
+
+		/* Kernel regions not registered with f/w  for MPIPL */
+		if (r_opal_fdm_active->registered_regions == 0) {
+			opal_fdm_active = NULL;
+			return 1;
+		}
+
+		pr_info("Firmware-assisted dump is active.\n");
+		fadump_conf->dump_active = 1;
+		opal_fadump_get_config(fadump_conf, r_opal_fdm_active);
+	}
+
 	return 1;
 }