diff mbox series

[v5,21/31] powernv/fadump: process architected register state data provided by firmware

Message ID 156630280239.8896.11769233860624935762.stgit@hbathini.in.ibm.com (mailing list archive)
State Superseded
Headers show
Series Add FADump support on PowerNV platform | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch next (c9633332103e55bc73d80d07ead28b95a22a85a3)
snowpatch_ozlabs/checkpatch warning total: 0 errors, 2 warnings, 4 checks, 309 lines checked

Commit Message

Hari Bathini Aug. 20, 2019, 12:06 p.m. UTC
From: Hari Bathini <hbathini@linux.vnet.ibm.com>

Firmware provides architected register state data at the time of crash.
Process this data and build CPU notes to append to ELF core.

Signed-off-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/fadump-common.h          |    4 +
 arch/powerpc/platforms/powernv/opal-fadump.c |  198 ++++++++++++++++++++++++--
 arch/powerpc/platforms/powernv/opal-fadump.h |   39 +++++
 3 files changed, 229 insertions(+), 12 deletions(-)

Comments

Michael Ellerman Sept. 4, 2019, 12:20 p.m. UTC | #1
Hari Bathini <hbathini@linux.ibm.com> writes:

> diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h
> index 7107cf2..fc408b0 100644
> --- a/arch/powerpc/kernel/fadump-common.h
> +++ b/arch/powerpc/kernel/fadump-common.h
> @@ -98,7 +98,11 @@ struct fw_dump {
>  	/* cmd line option during boot */
>  	unsigned long	reserve_bootvar;
>  
> +	unsigned long	cpu_state_destination_addr;

AFAICS that is only used in two places, and both of them have to call
__va() on it, so why don't we store the virtual address to start with?

> diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
> index f75b861..9a32a7f 100644
> --- a/arch/powerpc/platforms/powernv/opal-fadump.c
> +++ b/arch/powerpc/platforms/powernv/opal-fadump.c
> @@ -282,15 +283,122 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
>  		pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
>  }
>  
> +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
> +						 u32 reg_type, u32 reg_num,
> +						 u64 reg_val)
> +{
> +	if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
> +		if (reg_num < 32)
> +			regs->gpr[reg_num] = reg_val;
> +		return;
> +	}
> +
> +	switch (reg_num) {
> +	case SPRN_CTR:
> +		regs->ctr = reg_val;
> +		break;
> +	case SPRN_LR:
> +		regs->link = reg_val;
> +		break;
> +	case SPRN_XER:
> +		regs->xer = reg_val;
> +		break;
> +	case SPRN_DAR:
> +		regs->dar = reg_val;
> +		break;
> +	case SPRN_DSISR:
> +		regs->dsisr = reg_val;
> +		break;
> +	case HDAT_FADUMP_REG_ID_NIP:
> +		regs->nip = reg_val;
> +		break;
> +	case HDAT_FADUMP_REG_ID_MSR:
> +		regs->msr = reg_val;
> +		break;
> +	case HDAT_FADUMP_REG_ID_CCR:
> +		regs->ccr = reg_val;
> +		break;
> +	}
> +}
> +
> +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
> +					 unsigned int reg_entry_size,
> +					 struct pt_regs *regs)
> +{
> +	int i;
> +	struct hdat_fadump_reg_entry *reg_entry;

Where's my christmas tree :)

> +
> +	memset(regs, 0, sizeof(struct pt_regs));
> +
> +	for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
> +		reg_entry = (struct hdat_fadump_reg_entry *)bufp;
> +		opal_fadump_set_regval_regnum(regs,
> +					      be32_to_cpu(reg_entry->reg_type),
> +					      be32_to_cpu(reg_entry->reg_num),
> +					      be64_to_cpu(reg_entry->reg_val));
> +	}
> +}
> +
> +static inline bool __init is_thread_core_inactive(u8 core_state)
> +{
> +	bool is_inactive = false;
> +
> +	if (core_state == HDAT_FADUMP_CORE_INACTIVE)
> +		is_inactive = true;
> +
> +	return is_inactive;

	return core_state == HDAT_FADUMP_CORE_INACTIVE;

??

In fact there's only one caller, so just drop the inline entirely.

> +}
> +
>  /*
>   * Convert CPU state data saved at the time of crash into ELF notes.
> + *
> + * Each register entry is of 16 bytes, A numerical identifier along with
> + * a GPR/SPR flag in the first 8 bytes and the register value in the next
> + * 8 bytes. For more details refer to F/W documentation.
>   */
>  static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
>  {
>  	u32 num_cpus, *note_buf;
>  	struct fadump_crash_info_header *fdh = NULL;
> +	struct hdat_fadump_thread_hdr *thdr;
> +	unsigned long addr;
> +	u32 thread_pir;
> +	char *bufp;
> +	struct pt_regs regs;
> +	unsigned int size_of_each_thread;
> +	unsigned int regs_offset, regs_cnt, reg_esize;
> +	int i;

	unsigned int size_of_each_thread, regs_offset, regs_cnt, reg_esize;
  	struct fadump_crash_info_header *fdh = NULL;
  	u32 num_cpus, thread_pir, *note_buf;
	struct hdat_fadump_thread_hdr *thdr;
	struct pt_regs regs;
	unsigned long addr;
	char *bufp;
	int i;

Ah much better :)

Though the number of variables might be an indication that this function
could be split into smaller parts.

> @@ -473,6 +627,26 @@ int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node)
>  			return 1;
>  		}
>  
> +		ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
> +		if ((ret != OPAL_SUCCESS) || !addr) {
> +			pr_err("Failed to get CPU metadata (%lld)\n", ret);
> +			return 1;
> +		}
> +
> +		addr = be64_to_cpu(addr);
> +		pr_debug("CPU metadata addr: %llx\n", addr);
> +
> +		opal_cpu_metadata = __va(addr);
> +		r_opal_cpu_metadata = (void *)addr;

Another r_ variable I don't understand.

> diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
> index 19cac1f..ce4c522 100644
> --- a/arch/powerpc/platforms/powernv/opal-fadump.h
> +++ b/arch/powerpc/platforms/powernv/opal-fadump.h
> @@ -30,4 +30,43 @@ struct opal_fadump_mem_struct {
>  	struct opal_mpipl_region	rgn[OPAL_FADUMP_MAX_MEM_REGS];
>  } __attribute__((packed));
>  
> +/*
> + * CPU state data is provided by f/w. Below are the definitions
> + * provided in HDAT spec. Refer to latest HDAT specification for
> + * any update to this format.
> + */

How is this meant to work? If HDAT ever changes the format they will
break all existing kernels in the field.

> +#define HDAT_FADUMP_CPU_DATA_VERSION		1
> +
> +#define HDAT_FADUMP_CORE_INACTIVE		(0x0F)
> +
> +/* HDAT thread header for register entries */
> +struct hdat_fadump_thread_hdr {
> +	__be32  pir;
> +	/* 0x00 - 0x0F - The corresponding stop state of the core */
> +	u8      core_state;
> +	u8      reserved[3];
> +
> +	__be32	offset;	/* Offset to Register Entries array */
> +	__be32	ecnt;	/* Number of entries */
> +	__be32	esize;	/* Alloc size of each array entry in bytes */
> +	__be32	eactsz;	/* Actual size of each array entry in bytes */
> +} __attribute__((packed));
> +
> +/* Register types populated by f/w */
> +#define HDAT_FADUMP_REG_TYPE_GPR		0x01
> +#define HDAT_FADUMP_REG_TYPE_SPR		0x02
> +
> +/* ID numbers used by f/w while populating certain registers */
> +#define HDAT_FADUMP_REG_ID_NIP			0x7D0
> +#define HDAT_FADUMP_REG_ID_MSR			0x7D1
> +#define HDAT_FADUMP_REG_ID_CCR			0x7D2
> +
> +/* HDAT register entry. */
> +struct hdat_fadump_reg_entry {
> +	__be32		reg_type;
> +	__be32		reg_num;
> +	__be64		reg_val;
> +} __attribute__((packed));

cheers
Hari Bathini Sept. 9, 2019, 1:23 p.m. UTC | #2
On 04/09/19 5:50 PM, Michael Ellerman wrote:
> Hari Bathini <hbathini@linux.ibm.com> writes:
>

[...]

>> +/*
>> + * CPU state data is provided by f/w. Below are the definitions
>> + * provided in HDAT spec. Refer to latest HDAT specification for
>> + * any update to this format.
>> + */
> 
> How is this meant to work? If HDAT ever changes the format they will
> break all existing kernels in the field.
> 
>> +#define HDAT_FADUMP_CPU_DATA_VERSION		1

Changes are not expected here. But this is just to cover for such scenario,
if that ever happens.

Also, I think it is a bit far-fetched to error out if versions mismatch.
Warning and proceeding sounds worthier because the changes are usually
backward compatible, if and when there are any. Will update accordingly... 

- Hari
Oliver O'Halloran Sept. 9, 2019, 3:33 p.m. UTC | #3
On Mon, Sep 9, 2019 at 11:23 PM Hari Bathini <hbathini@linux.ibm.com> wrote:
>
> On 04/09/19 5:50 PM, Michael Ellerman wrote:
> > Hari Bathini <hbathini@linux.ibm.com> writes:
> >
>
> [...]
>
> >> +/*
> >> + * CPU state data is provided by f/w. Below are the definitions
> >> + * provided in HDAT spec. Refer to latest HDAT specification for
> >> + * any update to this format.
> >> + */
> >
> > How is this meant to work? If HDAT ever changes the format they will
> > break all existing kernels in the field.
> >
> >> +#define HDAT_FADUMP_CPU_DATA_VERSION                1
>
> Changes are not expected here. But this is just to cover for such scenario,
> if that ever happens.

The HDAT spec doesn't define the SPR numbers for NIA, MSR and the CR.
As far as I can tell the values you've assumed here are chip-specific,
non-architected SPR numbers that come from an array buried somewhere
in the SBE codebase. I don't believe you for a second when you say
that this will never change.

> Also, I think it is a bit far-fetched to error out if versions mismatch.
> Warning and proceeding sounds worthier because the changes are usually
> backward compatible, if and when there are any. Will update accordingly...

Literally the only reason I didn't drop the CPU DATA parts of the OPAL
MPIPL series was because I assumed the kernel would do the sensible
thing and reject or ignore the structure if it did not know how to
parse the data.

Oliver
Hari Bathini Sept. 10, 2019, 8:48 a.m. UTC | #4
On 09/09/19 9:03 PM, Oliver O'Halloran wrote:
> On Mon, Sep 9, 2019 at 11:23 PM Hari Bathini <hbathini@linux.ibm.com> wrote:
>>
>> On 04/09/19 5:50 PM, Michael Ellerman wrote:
>>> Hari Bathini <hbathini@linux.ibm.com> writes:
>>>
>>
>> [...]
>>
>>>> +/*
>>>> + * CPU state data is provided by f/w. Below are the definitions
>>>> + * provided in HDAT spec. Refer to latest HDAT specification for
>>>> + * any update to this format.
>>>> + */
>>>
>>> How is this meant to work? If HDAT ever changes the format they will
>>> break all existing kernels in the field.
>>>
>>>> +#define HDAT_FADUMP_CPU_DATA_VERSION                1
>>
>> Changes are not expected here. But this is just to cover for such scenario,
>> if that ever happens.
> 
> The HDAT spec doesn't define the SPR numbers for NIA, MSR and the CR.
> As far as I can tell the values you've assumed here are chip-specific,
> non-architected SPR numbers that come from an array buried somewhere
> in the SBE codebase. I don't believe you for a second when you say
> that this will never change.

At least, the understanding is that this numbers not change across processor
generations. If something changes, it is supposed to be handled in SBE. Also,
I am told this numbers would be listed in the HDAT Spec. Not sure if that
happened yet though. Vasant, you have anything to add?

>> Also, I think it is a bit far-fetched to error out if versions mismatch.
>> Warning and proceeding sounds worthier because the changes are usually
>> backward compatible, if and when there are any. Will update accordingly...
> 
> Literally the only reason I didn't drop the CPU DATA parts of the OPAL
> MPIPL series was because I assumed the kernel would do the sensible
> thing and reject or ignore the structure if it did not know how to
> parse the data.

I think, the changes if any, would have to be backward compatible for the sake
of sanity. Even if they are not, we are better off exporting the /proc/vmcore
with a warning and some crazy CPU register data (if parsing goes alright) than
no dump at all? 

- Hari
Michael Ellerman Sept. 10, 2019, 2:05 p.m. UTC | #5
Hari Bathini <hbathini@linux.ibm.com> writes:
> On 09/09/19 9:03 PM, Oliver O'Halloran wrote:
>> On Mon, Sep 9, 2019 at 11:23 PM Hari Bathini <hbathini@linux.ibm.com> wrote:
>>> On 04/09/19 5:50 PM, Michael Ellerman wrote:
>>>> Hari Bathini <hbathini@linux.ibm.com> writes:
>>> [...]
>>>
>>>>> +/*
>>>>> + * CPU state data is provided by f/w. Below are the definitions
>>>>> + * provided in HDAT spec. Refer to latest HDAT specification for
>>>>> + * any update to this format.
>>>>> + */
>>>>
>>>> How is this meant to work? If HDAT ever changes the format they will
>>>> break all existing kernels in the field.
>>>>
>>>>> +#define HDAT_FADUMP_CPU_DATA_VERSION                1
>>>
>>> Changes are not expected here. But this is just to cover for such scenario,
>>> if that ever happens.
>> 
>> The HDAT spec doesn't define the SPR numbers for NIA, MSR and the CR.
>> As far as I can tell the values you've assumed here are chip-specific,
>> non-architected SPR numbers that come from an array buried somewhere
>> in the SBE codebase. I don't believe you for a second when you say
>> that this will never change.
>
> At least, the understanding is that this numbers not change across processor
> generations. If something changes, it is supposed to be handled in SBE. Also,
> I am told this numbers would be listed in the HDAT Spec. Not sure if that
> happened yet though. Vasant, you have anything to add?

That doesn't help much because the HDAT spec is not public.

The point is with the code written the way it is, these values *must
not* change, or else all existing kernels will be broken, which is not
acceptable.

>>> Also, I think it is a bit far-fetched to error out if versions mismatch.
>>> Warning and proceeding sounds worthier because the changes are usually
>>> backward compatible, if and when there are any. Will update accordingly...
>> 
>> Literally the only reason I didn't drop the CPU DATA parts of the OPAL
>> MPIPL series was because I assumed the kernel would do the sensible
>> thing and reject or ignore the structure if it did not know how to
>> parse the data.
>
> I think, the changes if any, would have to be backward compatible for the sake
> of sanity.

People need to understand that this is an ABI between firmware and
in-the-field distribution kernels which are only updated at customer
discretion, or possibly never.

Any changes *must be* backward compatible.

Looking at the header struct:

+struct hdat_fadump_thread_hdr {
+	__be32  pir;
+	/* 0x00 - 0x0F - The corresponding stop state of the core */
+	u8      core_state;
+	u8      reserved[3];

You have those 3 reserved bytes, so a future revision could repurpose
one of those as a flag to indicate a new format. And/or the hdr could be
made bigger and new kernels could be taught to look for new things in
the space after the hdr but before the reg entries.

So I think there is a reasonable mechanism for extending the format in
future, but my point is people must understand that this is an ABI and
changes must be made accordingly.

> Even if they are not, we are better off exporting the /proc/vmcore
> with a warning and some crazy CPU register data (if parsing goes alright) than
> no dump at all? 

If it's just a case of reg entries that we don't recognise then yes I
think it would be OK to just skip them and continue exporting. But if
there's any deeper misunderstanding of the format then we should bail
out.

I notice now that you don't do anything in opal_fadump_set_regval_regnum()
if you are passed a register we don't understand, so that probably needs
fixing.

cheers
Hari Bathini Sept. 10, 2019, 4:10 p.m. UTC | #6
On 10/09/19 7:35 PM, Michael Ellerman wrote:
> Hari Bathini <hbathini@linux.ibm.com> writes:
>> On 09/09/19 9:03 PM, Oliver O'Halloran wrote:
>>> On Mon, Sep 9, 2019 at 11:23 PM Hari Bathini <hbathini@linux.ibm.com> wrote:
>>>> On 04/09/19 5:50 PM, Michael Ellerman wrote:
>>>>> Hari Bathini <hbathini@linux.ibm.com> writes:
>>>> [...]
>>>>
>>>>>> +/*
>>>>>> + * CPU state data is provided by f/w. Below are the definitions
>>>>>> + * provided in HDAT spec. Refer to latest HDAT specification for
>>>>>> + * any update to this format.
>>>>>> + */
>>>>>
>>>>> How is this meant to work? If HDAT ever changes the format they will
>>>>> break all existing kernels in the field.
>>>>>
>>>>>> +#define HDAT_FADUMP_CPU_DATA_VERSION                1
>>>>
>>>> Changes are not expected here. But this is just to cover for such scenario,
>>>> if that ever happens.
>>>
>>> The HDAT spec doesn't define the SPR numbers for NIA, MSR and the CR.
>>> As far as I can tell the values you've assumed here are chip-specific,
>>> non-architected SPR numbers that come from an array buried somewhere
>>> in the SBE codebase. I don't believe you for a second when you say
>>> that this will never change.
>>
>> At least, the understanding is that this numbers not change across processor
>> generations. If something changes, it is supposed to be handled in SBE. Also,
>> I am told this numbers would be listed in the HDAT Spec. Not sure if that
>> happened yet though. Vasant, you have anything to add?
> 
> That doesn't help much because the HDAT spec is not public.
> 
> The point is with the code written the way it is, these values *must
> not* change, or else all existing kernels will be broken, which is not
> acceptable.

Yeah. It is absurd to error out just by looking at version number...

> 
>>>> Also, I think it is a bit far-fetched to error out if versions mismatch.
>>>> Warning and proceeding sounds worthier because the changes are usually
>>>> backward compatible, if and when there are any. Will update accordingly...
>>>
>>> Literally the only reason I didn't drop the CPU DATA parts of the OPAL
>>> MPIPL series was because I assumed the kernel would do the sensible
>>> thing and reject or ignore the structure if it did not know how to
>>> parse the data.
>>
>> I think, the changes if any, would have to be backward compatible for the sake
>> of sanity.
> 
> People need to understand that this is an ABI between firmware and
> in-the-field distribution kernels which are only updated at customer
> discretion, or possibly never.
> 
> Any changes *must be* backward compatible.
> 
> Looking at the header struct:
> 
> +struct hdat_fadump_thread_hdr {
> +	__be32  pir;
> +	/* 0x00 - 0x0F - The corresponding stop state of the core */
> +	u8      core_state;
> +	u8      reserved[3];
> 
> You have those 3 reserved bytes, so a future revision could repurpose
> one of those as a flag to indicate a new format. And/or the hdr could be
> made bigger and new kernels could be taught to look for new things in
> the space after the hdr but before the reg entries.
> 
> So I think there is a reasonable mechanism for extending the format in
> future, but my point is people must understand that this is an ABI and
> changes must be made accordingly.

True. The folks who make the changes to this format should be aware that
breaking kernel ABI is not going to be pretty and I think they are :)

> 
>> Even if they are not, we are better off exporting the /proc/vmcore
>> with a warning and some crazy CPU register data (if parsing goes alright) than
>> no dump at all? 
> 
> If it's just a case of reg entries that we don't recognise then yes I
> think it would be OK to just skip them and continue exporting. But if
> there's any deeper misunderstanding of the format then we should bail
> out.

Sure. Will try and fix that by first trying to do a sanity check on the
fields that are needed for parsing the data and proceed with a warning if
nothing weird is detected and fallback to just appending crashing cpu as
done in patch 16/31, if anything weird is observed. That should hopefully
take care of all cases in the best possible way..

> 
> I notice now that you don't do anything in opal_fadump_set_regval_regnum()
> if you are passed a register we don't understand, so that probably needs
> fixing.

f/w provides about 100 odd registers in the CPU state data. Most of them
pt_regs doesn't care about. So, opal_fadump_set_regval_regnum is happy as
long as it find the registers listed in it. Unless, pt_regs changes, we
can stick to this and ignore the rest of them?

- Hari
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h
index 7107cf2..fc408b0 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -98,7 +98,11 @@  struct fw_dump {
 	/* cmd line option during boot */
 	unsigned long	reserve_bootvar;
 
+	unsigned long	cpu_state_destination_addr;
+	unsigned long	cpu_state_data_version;
+	unsigned long	cpu_state_entry_size;
 	unsigned long	cpu_state_data_size;
+
 	unsigned long	hpte_region_size;
 
 	unsigned long	boot_mem_dest_addr;
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index f75b861..9a32a7f 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -23,6 +23,7 @@ 
 #include "opal-fadump.h"
 
 static const struct opal_fadump_mem_struct *opal_fdm_active;
+static const struct opal_mpipl_fadump *opal_cpu_metadata;
 static struct opal_fadump_mem_struct *opal_fdm;
 
 static int opal_fadump_unregister(struct fw_dump *fadump_conf);
@@ -282,15 +283,122 @@  static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
 		pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
 }
 
+static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
+						 u32 reg_type, u32 reg_num,
+						 u64 reg_val)
+{
+	if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
+		if (reg_num < 32)
+			regs->gpr[reg_num] = reg_val;
+		return;
+	}
+
+	switch (reg_num) {
+	case SPRN_CTR:
+		regs->ctr = reg_val;
+		break;
+	case SPRN_LR:
+		regs->link = reg_val;
+		break;
+	case SPRN_XER:
+		regs->xer = reg_val;
+		break;
+	case SPRN_DAR:
+		regs->dar = reg_val;
+		break;
+	case SPRN_DSISR:
+		regs->dsisr = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_NIP:
+		regs->nip = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_MSR:
+		regs->msr = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_CCR:
+		regs->ccr = reg_val;
+		break;
+	}
+}
+
+static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
+					 unsigned int reg_entry_size,
+					 struct pt_regs *regs)
+{
+	int i;
+	struct hdat_fadump_reg_entry *reg_entry;
+
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+		reg_entry = (struct hdat_fadump_reg_entry *)bufp;
+		opal_fadump_set_regval_regnum(regs,
+					      be32_to_cpu(reg_entry->reg_type),
+					      be32_to_cpu(reg_entry->reg_num),
+					      be64_to_cpu(reg_entry->reg_val));
+	}
+}
+
+static inline bool __init is_thread_core_inactive(u8 core_state)
+{
+	bool is_inactive = false;
+
+	if (core_state == HDAT_FADUMP_CORE_INACTIVE)
+		is_inactive = true;
+
+	return is_inactive;
+}
+
 /*
  * Convert CPU state data saved at the time of crash into ELF notes.
+ *
+ * Each register entry is of 16 bytes, A numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes and the register value in the next
+ * 8 bytes. For more details refer to F/W documentation.
  */
 static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
 {
 	u32 num_cpus, *note_buf;
 	struct fadump_crash_info_header *fdh = NULL;
+	struct hdat_fadump_thread_hdr *thdr;
+	unsigned long addr;
+	u32 thread_pir;
+	char *bufp;
+	struct pt_regs regs;
+	unsigned int size_of_each_thread;
+	unsigned int regs_offset, regs_cnt, reg_esize;
+	int i;
+
+	fadump_conf->cpu_state_entry_size =
+			be32_to_cpu(opal_cpu_metadata->cpu_data_size);
+	fadump_conf->cpu_state_destination_addr =
+			be64_to_cpu(opal_cpu_metadata->region[0].dest);
+	fadump_conf->cpu_state_data_size =
+			be64_to_cpu(opal_cpu_metadata->region[0].size);
+
+	if ((fadump_conf->cpu_state_destination_addr == 0) ||
+	    (fadump_conf->cpu_state_entry_size == 0)) {
+		pr_err("CPU state data not available for processing!\n");
+		return -ENODEV;
+	}
+
+	size_of_each_thread = fadump_conf->cpu_state_entry_size;
+	num_cpus = (fadump_conf->cpu_state_data_size / size_of_each_thread);
+
+	addr = fadump_conf->cpu_state_destination_addr;
+	bufp = __va(addr);
+
+	/*
+	 * Offset for register entries, entry size and registers count is
+	 * duplicated in every thread header in keeping with HDAT format.
+	 * Use these values from the first thread header.
+	 */
+	thdr = (struct hdat_fadump_thread_hdr *)bufp;
+	regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+		       be32_to_cpu(thdr->offset));
+	reg_esize = be32_to_cpu(thdr->esize);
+	regs_cnt  = be32_to_cpu(thdr->ecnt);
 
-	num_cpus = 1;
 	/* Allocate buffer to hold cpu crash notes. */
 	fadump_conf->cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
 	fadump_conf->cpu_notes_buf_size =
@@ -309,10 +417,53 @@  static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
 	if (fadump_conf->fadumphdr_addr)
 		fdh = __va(fadump_conf->fadumphdr_addr);
 
-	if (fdh && (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)) {
-		note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
-		final_note(note_buf);
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("NumCpus     : %u\n", num_cpus);
+	pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+		 regs_offset, reg_esize, regs_cnt);
+
+	for (i = 0; i < num_cpus; i++, bufp += size_of_each_thread) {
+		thdr = (struct hdat_fadump_thread_hdr *)bufp;
+
+		thread_pir = be32_to_cpu(thdr->pir);
+		pr_debug("%04d) PIR: 0x%x, core state: 0x%02x\n",
+			 (i + 1), thread_pir, thdr->core_state);
+
+		/*
+		 * Register state data of MAX cores is provided by firmware,
+		 * but some of this cores may not be active. So, while
+		 * processing register state data, check core state and
+		 * skip threads that belong to inactive cores.
+		 */
+		if (is_thread_core_inactive(thdr->core_state))
+			continue;
+
+		/*
+		 * If this is kernel initiated crash, crashing_cpu would be set
+		 * appropriately and register data of the crashing CPU saved by
+		 * crashing kernel. Add this saved register data of crashing CPU
+		 * to elf notes and populate the pt_regs for the remaining CPUs
+		 * from register state data provided by firmware.
+		 */
+		if (fdh && (fdh->crashing_cpu == thread_pir)) {
+			note_buf = fadump_regs_to_elf_notes(note_buf,
+							    &fdh->regs);
+			pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+				 fdh->crashing_cpu, fdh->regs.gpr[1],
+				 fdh->regs.nip);
+			continue;
+		}
+
+		opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+				      reg_esize, &regs);
 
+		note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+			 thread_pir, regs.gpr[1], regs.nip);
+	}
+	final_note(note_buf);
+
+	if (fdh) {
 		pr_debug("Updating elfcore header (%llx) with cpu notes\n",
 			 fdh->elfcorehdr_addr);
 		fadump_update_elfcore_header(fadump_conf,
@@ -327,7 +478,8 @@  static int __init opal_fadump_process(struct fw_dump *fadump_conf)
 	struct fadump_crash_info_header *fdh;
 	int rc = 0;
 
-	if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
+	if (!opal_fdm_active || !opal_cpu_metadata ||
+	    !fadump_conf->fadumphdr_addr)
 		return -EINVAL;
 
 	/* Validate the fadump crash info header */
@@ -337,13 +489,6 @@  static int __init opal_fadump_process(struct fw_dump *fadump_conf)
 		return -EINVAL;
 	}
 
-	/*
-	 * TODO: To build cpu notes, find a way to map PIR to logical id.
-	 *       Also, we may need different method for pseries and powernv.
-	 *       The currently booted kernel could have a different PIR to
-	 *       logical id mapping. So, try saving info of previous kernel's
-	 *       paca to get the right PIR to logical id mapping.
-	 */
 	rc = opal_fadump_build_cpu_notes(fadump_conf);
 	if (rc)
 		return rc;
@@ -397,6 +542,14 @@  static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
 {
 	int rc;
 
+	/*
+	 * Unlike on pSeries platform, logical CPU number is not provided
+	 * with architected register state data. So, store the crashing
+	 * CPU's PIR instead to plug the appropriate register data for
+	 * crashing CPU in the vmcore file.
+	 */
+	fdh->crashing_cpu = (u32)mfspr(SPRN_PIR);
+
 	rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
 	if (rc == OPAL_UNSUPPORTED) {
 		pr_emerg("Reboot type %d not supported.\n",
@@ -449,6 +602,7 @@  int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node)
 		u64 addr = 0;
 		s64 ret;
 		const struct opal_fadump_mem_struct *r_opal_fdm_active;
+		const struct opal_mpipl_fadump *r_opal_cpu_metadata;
 
 		ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
 		if ((ret != OPAL_SUCCESS) || !addr) {
@@ -473,6 +627,26 @@  int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node)
 			return 1;
 		}
 
+		ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+		if ((ret != OPAL_SUCCESS) || !addr) {
+			pr_err("Failed to get CPU metadata (%lld)\n", ret);
+			return 1;
+		}
+
+		addr = be64_to_cpu(addr);
+		pr_debug("CPU metadata addr: %llx\n", addr);
+
+		opal_cpu_metadata = __va(addr);
+		r_opal_cpu_metadata = (void *)addr;
+		fadump_conf->cpu_state_data_version =
+			be32_to_cpu(r_opal_cpu_metadata->cpu_data_version);
+		if (fadump_conf->cpu_state_data_version !=
+		    HDAT_FADUMP_CPU_DATA_VERSION) {
+			pr_err("CPU data format version (%lu) mismatch!\n",
+			       fadump_conf->cpu_state_data_version);
+			return 1;
+		}
+
 		pr_info("Firmware-assisted dump is active.\n");
 		fadump_conf->dump_active = 1;
 		opal_fadump_get_config(fadump_conf, r_opal_fdm_active);
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
index 19cac1f..ce4c522 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.h
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -30,4 +30,43 @@  struct opal_fadump_mem_struct {
 	struct opal_mpipl_region	rgn[OPAL_FADUMP_MAX_MEM_REGS];
 } __attribute__((packed));
 
+/*
+ * CPU state data is provided by f/w. Below are the definitions
+ * provided in HDAT spec. Refer to latest HDAT specification for
+ * any update to this format.
+ */
+
+#define HDAT_FADUMP_CPU_DATA_VERSION		1
+
+#define HDAT_FADUMP_CORE_INACTIVE		(0x0F)
+
+/* HDAT thread header for register entries */
+struct hdat_fadump_thread_hdr {
+	__be32  pir;
+	/* 0x00 - 0x0F - The corresponding stop state of the core */
+	u8      core_state;
+	u8      reserved[3];
+
+	__be32	offset;	/* Offset to Register Entries array */
+	__be32	ecnt;	/* Number of entries */
+	__be32	esize;	/* Alloc size of each array entry in bytes */
+	__be32	eactsz;	/* Actual size of each array entry in bytes */
+} __attribute__((packed));
+
+/* Register types populated by f/w */
+#define HDAT_FADUMP_REG_TYPE_GPR		0x01
+#define HDAT_FADUMP_REG_TYPE_SPR		0x02
+
+/* ID numbers used by f/w while populating certain registers */
+#define HDAT_FADUMP_REG_ID_NIP			0x7D0
+#define HDAT_FADUMP_REG_ID_MSR			0x7D1
+#define HDAT_FADUMP_REG_ID_CCR			0x7D2
+
+/* HDAT register entry. */
+struct hdat_fadump_reg_entry {
+	__be32		reg_type;
+	__be32		reg_num;
+	__be64		reg_val;
+} __attribute__((packed));
+
 #endif /* __PPC64_OPAL_FA_DUMP_H__ */