Patchwork [1/3] powerpc: bare minimum checkpoint/restart implementation

login
register
mail settings
Submitter Nathan Lynch
Date Jan. 28, 2009, 10:41 p.m.
Message ID <1233182478-27113-2-git-send-email-ntl@pobox.com>
Download mbox | patch
Permalink /patch/20775/
State Changes Requested
Headers show

Comments

Nathan Lynch - Jan. 28, 2009, 10:41 p.m.
The only thing of significance here is that the checkpointed task's
pt_regs and fp state are saved and restored (see cr_write_cpu and
cr_read_cpu); the rest of the code consists of dummy implementations
of the APIs the arch needs to provide to the checkpoint/restart core.

What works:
* self and external checkpoint of simple (single thread, one open
  file) 32- and 64-bit processes on a ppc64 kernel

What doesn't work:
* restarting a 32-bit task from a 64-bit task and vice versa

Untested:
* ppc32 (but it builds)

Signed-off-by: Nathan Lynch <ntl@pobox.com>
---
 arch/powerpc/include/asm/checkpoint_hdr.h |   40 +++++
 arch/powerpc/mm/Makefile                  |    1 +
 arch/powerpc/mm/checkpoint.c              |  261 +++++++++++++++++++++++++++++
 3 files changed, 302 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/include/asm/checkpoint_hdr.h
 create mode 100644 arch/powerpc/mm/checkpoint.c
Oren Laadan - Jan. 29, 2009, 6:41 a.m.
Nathan,

Thanks for the patch. Looks good, see some comments below.
(disclaimer: I'm not very familiar with ppc architecture)

Nathan Lynch wrote:
> The only thing of significance here is that the checkpointed task's
> pt_regs and fp state are saved and restored (see cr_write_cpu and
> cr_read_cpu); the rest of the code consists of dummy implementations
> of the APIs the arch needs to provide to the checkpoint/restart core.
> 
> What works:
> * self and external checkpoint of simple (single thread, one open
>   file) 32- and 64-bit processes on a ppc64 kernel
> 
> What doesn't work:
> * restarting a 32-bit task from a 64-bit task and vice versa

Is there a test to bail if we attempt to checkpoint such tasks ?

> 
> Untested:
> * ppc32 (but it builds)
> 
> Signed-off-by: Nathan Lynch <ntl@pobox.com>
> ---
>  arch/powerpc/include/asm/checkpoint_hdr.h |   40 +++++
>  arch/powerpc/mm/Makefile                  |    1 +
>  arch/powerpc/mm/checkpoint.c              |  261 +++++++++++++++++++++++++++++
>  3 files changed, 302 insertions(+), 0 deletions(-)
>  create mode 100644 arch/powerpc/include/asm/checkpoint_hdr.h
>  create mode 100644 arch/powerpc/mm/checkpoint.c
> 
> diff --git a/arch/powerpc/include/asm/checkpoint_hdr.h b/arch/powerpc/include/asm/checkpoint_hdr.h
> new file mode 100644
> index 0000000..752c53f
> --- /dev/null
> +++ b/arch/powerpc/include/asm/checkpoint_hdr.h
> @@ -0,0 +1,40 @@
> +#ifndef __ASM_PPC_CKPT_HDR_H
> +#define __ASM_PPC_CKPT_HDR_H
> +/*
> + *  Checkpoint/restart - architecture specific headers ppc
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/types.h>
> +#include <asm/ptrace.h>
> +#include <asm/mmu.h>
> +#include <asm/processor.h>
> +
> +struct cr_hdr_head_arch {
> +	__u32 unimplemented;
> +};
> +
> +struct cr_hdr_thread {
> +	__u32 unimplemented;
> +};
> +
> +struct cr_hdr_cpu {
> +	struct pt_regs pt_regs;

It has been suggested (as done in x86/32 code) not to use 'struct pt_regs'
because it "can (and has) changed on x86" and because "it only container
the registers that the kernel trashes, not all usermode registers".

https://lists.linux-foundation.org/pipermail/containers/2008-August/012355.html

> +	/* relevant fields from thread_struct */
> +	double fpr[32][TS_FPRWIDTH];

Can TS_FPRWIDTH change between sub-archs or kernel versions ?  If so, it
needs to be stated explicitly.

> +	unsigned int fpscr;
> +	int fpexc_mode;
> +	/* unsigned int align_ctl; this is never updated? */
> +	unsigned long dabr;

Are these fields always guarantee to compile to the same number of bytes
regardless of 32/64 bit choice of compiler (or sub-arch?) ?

In the x86(32/64) architecture we use types with explicit size such as
__u32 and the like to ensure that it always compiled to the same size.

> +};
> +
> +struct cr_hdr_mm_context {
> +	__u32 unimplemented;
> +};
> +
> +#endif /* __ASM_PPC_CKPT_HDR__H */
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index e7392b4..8a523a0 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -24,3 +24,4 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
>  obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
>  obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
>  obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
> +obj-$(CONFIG_CHECKPOINT_RESTART) += checkpoint.o
> diff --git a/arch/powerpc/mm/checkpoint.c b/arch/powerpc/mm/checkpoint.c
> new file mode 100644
> index 0000000..8cdff24
> --- /dev/null
> +++ b/arch/powerpc/mm/checkpoint.c
> @@ -0,0 +1,261 @@
> +/*
> + *  Checkpoint/restart - architecture specific support for powerpc.
> + *  Based on x86 implementation.
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *  Copyright 2009 IBM Corp.
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#define DEBUG 1 /* for pr_debug */
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/kernel.h>
> +#include <asm/processor.h>
> +
> +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
> +{
> +	hdr->type = type;
> +	hdr->len = len;
> +	hdr->parent = parent;
> +}
> +

This function is rather generic and useful to non-arch-dependent and other
architectures code. Perhaps put in a separate patch ?

> +/* dump the thread_struct of a given task */
> +int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr_thread *thread_hdr;
> +	struct cr_hdr cr_hdr;
> +	u32 parent;
> +	int ret;
> +
> +	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
> +	if (!thread_hdr)
> +		return -ENOMEM;
> +
> +	parent = task_pid_vnr(t);
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_THREAD, sizeof(*thread_hdr), parent);
> +
> +	thread_hdr->unimplemented = 0xdeadbeef;
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, thread_hdr);
> +	cr_hbuf_put(ctx, sizeof(*thread_hdr));
> +
> +	return ret;
> +}
> +
> +/* dump the cpu state and registers of a given task */
> +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr_cpu *cpu_hdr;
> +	struct pt_regs *pt_regs;
> +	struct cr_hdr cr_hdr;
> +	u32 parent;
> +	int ret;
> +
> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> +	if (!cpu_hdr)
> +		return -ENOMEM;
> +
> +	parent = task_pid_vnr(t);
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
> +
> +	/* pt_regs: GPRs, MSR, etc */
> +	pt_regs = task_pt_regs(t);
> +	cpu_hdr->pt_regs = *pt_regs;
> +
> +	/* FP state */
> +	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));

As note above, is sizeof(cpu_hdr->fpr) the same on all chips ?

> +	cpu_hdr->fpscr = t->thread.fpscr.val;
> +	cpu_hdr->fpexc_mode = t->thread.fpexc_mode;
> +
> +	/* Handle DABR for now, dbcr[01] later */
> +	cpu_hdr->dabr = t->thread.dabr;
> +
> +	/* ToDo: Altivec/VSX/SPE state */
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, cpu_hdr);
> +	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
> +
> +	return ret;
> +}
> +
> +int cr_write_head_arch(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head_arch *arch_hdr;
> +	struct cr_hdr cr_hdr;
> +	int ret;
> +
> +	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
> +	if (!arch_hdr)
> +		return -ENOMEM;
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_HEAD_ARCH, sizeof(*arch_hdr), 0);
> +
> +	arch_hdr->unimplemented = 0xdeadbeef;
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, arch_hdr);
> +	cr_hbuf_put(ctx, sizeof(*arch_hdr));
> +
> +	return ret;
> +}
> +
> +/* dump the mm->context state */
> +int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int parent)
> +{
> +	struct cr_hdr_mm_context *mm_hdr;
> +	struct cr_hdr cr_hdr;
> +	size_t size;
> +	int ret;
> +
> +	size = sizeof(*mm_hdr);
> +
> +	mm_hdr = cr_hbuf_get(ctx, size);
> +	if (!mm_hdr)
> +		return -ENOMEM;
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_MM_CONTEXT, size, parent);
> +
> +	mm_hdr->unimplemented = 0xdeadbeef;
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, mm_hdr);
> +	cr_hbuf_put(ctx, size);
> +
> +	return ret;
> +}
> +
> +/* restart APIs */
> +

The restart APIs belong in a separate file: arch/powerpc/mm/restart.c

> +/* read the thread_struct into the current task */
> +int cr_read_thread(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_thread *thread_hdr;
> +	int ret;
> +
> +	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
> +	if (!thread_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, thread_hdr, sizeof(*thread_hdr),
> +				  CR_HDR_THREAD);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (thread_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)thread_hdr->unimplemented);

Given the macro for 'pr_fmt' in include/linux/checkpoint.h, the use of
__func__ is redunant.

> +		ret = -EINVAL;
> +	}
> +out:
> +	cr_hbuf_put(ctx, sizeof(*thread_hdr));
> +	return ret;
> +}
> +
> +/* Based on the MSR value from a checkpoint image, produce an MSR
> + * value that is appropriate for the restored task.  Right now we only
> + * check for MSR_SF (64-bit) for PPC64.
> + */
> +static unsigned long sanitize_msr(unsigned long msr_ckpt)
> +{
> +#ifdef CONFIG_PPC32
> +	return MSR_USER;
> +#else
> +	if (msr_ckpt & MSR_SF)
> +		return MSR_USER64;
> +	return MSR_USER32;
> +#endif
> +}
> +
> +int cr_read_cpu(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_cpu *cpu_hdr;
> +	struct pt_regs *regs;
> +	int ret;
> +
> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> +	if (!cpu_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, cpu_hdr, sizeof(*cpu_hdr),
> +				  CR_HDR_CPU);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	regs = task_pt_regs(current);
> +	*regs = cpu_hdr->pt_regs;
> +
> +	regs->msr = sanitize_msr(regs->msr);
> +
> +	/* FP state */
> +	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
> +	current->thread.fpscr.val = cpu_hdr->fpscr;
> +	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
> +
> +	/* debug registers */
> +	current->thread.dabr = cpu_hdr->dabr;

I'm unfamiliar with powerpc; is it necessary to sanitize any of the registers
here ?  For instance, can the user cause harm with specially crafted values
of some registers ?

> +out:
> +	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
> +	return ret;
> +}
> +
> +int cr_read_head_arch(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head_arch *arch_hdr;
> +	int ret;
> +
> +	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
> +	if (!arch_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, arch_hdr, sizeof(*arch_hdr),
> +				  CR_HDR_HEAD_ARCH);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (arch_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected arch_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)arch_hdr->unimplemented);

Same as above: __func__ is redundant.

> +		ret = -EINVAL;
> +	}
> +out:
> +	cr_hbuf_put(ctx, sizeof(*arch_hdr));
> +	return ret;
> +}
> +
> +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
> +{
> +	struct cr_hdr_mm_context *mm_hdr;
> +	int ret;
> +
> +	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
> +	if (!mm_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
> +				  CR_HDR_MM_CONTEXT);
> +	if (ret != rparent)
> +		goto out;

Seems like 'ret' isn't set to an error value if the 'goto' executes.

> +
> +	ret = 0;
> +
> +	if (mm_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected mm_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)mm_hdr->unimplemented);

Again, __func__ redundant.

> +		ret = -EINVAL;
> +	}
> +
> +out:
> +	cr_hbuf_put(ctx, sizeof(*mm_hdr));
> +	return ret;
> +}

Thanks,

Oren.
Nathan Lynch - Jan. 29, 2009, 9:40 p.m.
Hey Oren, thanks for taking a look.

Oren Laadan wrote:
> 
> Nathan Lynch wrote:
> > 
> > What doesn't work:
> > * restarting a 32-bit task from a 64-bit task and vice versa
> 
> Is there a test to bail if we attempt to checkpoint such tasks ?

No, but I'll add one if it looks too hard to fix for the next round.


> > +struct cr_hdr_cpu {
> > +	struct pt_regs pt_regs;
> 
> It has been suggested (as done in x86/32 code) not to use 'struct pt_regs'
> because it "can (and has) changed on x86" and because "it only container
> the registers that the kernel trashes, not all usermode registers".
> 
> https://lists.linux-foundation.org/pipermail/containers/2008-August/012355.html

Yeah, I considered that discussion, but the situation is different for
powerpc (someone on linuxppc-dev smack me if I'm wrong here :)
pt_regs is part of the ABI, and it encompasses all user mode registers
except for floating point, which are handled separately.


> > +	/* relevant fields from thread_struct */
> > +	double fpr[32][TS_FPRWIDTH];
> 
> Can TS_FPRWIDTH change between sub-archs or kernel versions ?  If so, it
> needs to be stated explicitly.
> 
> > +	unsigned int fpscr;
> > +	int fpexc_mode;
> > +	/* unsigned int align_ctl; this is never updated? */
> > +	unsigned long dabr;
> 
> Are these fields always guarantee to compile to the same number of bytes
> regardless of 32/64 bit choice of compiler (or sub-arch?) ?
> 
> In the x86(32/64) architecture we use types with explicit size such as
> __u32 and the like to ensure that it always compiled to the same
> size.

Yeah, I'll have to fix these up.



> > +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
> > +{
> > +	hdr->type = type;
> > +	hdr->len = len;
> > +	hdr->parent = parent;
> > +}
> > +
> 
> This function is rather generic and useful to non-arch-dependent and other
> architectures code. Perhaps put in a separate patch ?

Alright.  By the way, why are cr_hdr->type and cr_hdr->len signed
types?


> > +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
> > +{
> > +	struct cr_hdr_cpu *cpu_hdr;
> > +	struct pt_regs *pt_regs;
> > +	struct cr_hdr cr_hdr;
> > +	u32 parent;
> > +	int ret;
> > +
> > +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> > +	if (!cpu_hdr)
> > +		return -ENOMEM;
> > +
> > +	parent = task_pid_vnr(t);
> > +
> > +	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
> > +
> > +	/* pt_regs: GPRs, MSR, etc */
> > +	pt_regs = task_pt_regs(t);
> > +	cpu_hdr->pt_regs = *pt_regs;
> > +
> > +	/* FP state */
> > +	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));
> 
> As note above, is sizeof(cpu_hdr->fpr) the same on all chips ?

It can differ depending on kernel configuration.


> > +/* restart APIs */
> > +
> 
> The restart APIs belong in a separate file: arch/powerpc/mm/restart.c

Explain why, please?  This isn't a lot of code, and it seems likely
that checkpoint and restart paths will share data structures and tend
to be modified together over time.


> > +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
> > +			 __func__, (unsigned long)thread_hdr->unimplemented);
> 
> Given the macro for 'pr_fmt' in include/linux/checkpoint.h, the use of
> __func__ is redunant.

It seems to me that defining your own pr_fmt in a "public" header like
that is inappropriate, or at least unconventional.  Any file that
happens to include linux/checkpoint.h will have any prior definitions
of pr_fmt overridden, no?


> > +	regs = task_pt_regs(current);
> > +	*regs = cpu_hdr->pt_regs;
> > +
> > +	regs->msr = sanitize_msr(regs->msr);
> > +
> > +	/* FP state */
> > +	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
> > +	current->thread.fpscr.val = cpu_hdr->fpscr;
> > +	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
> > +
> > +	/* debug registers */
> > +	current->thread.dabr = cpu_hdr->dabr;
> 
> I'm unfamiliar with powerpc; is it necessary to sanitize any of the registers
> here ?  For instance, can the user cause harm with specially crafted values
> of some registers ?

I had this in mind with the treatment of MSR, but I'll check on the
others, thanks.


> > +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
> > +{
> > +	struct cr_hdr_mm_context *mm_hdr;
> > +	int ret;
> > +
> > +	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
> > +	if (!mm_hdr)
> > +		return -ENOMEM;
> > +
> > +	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
> > +				  CR_HDR_MM_CONTEXT);
> > +	if (ret != rparent)
> > +		goto out;
> 
> Seems like 'ret' isn't set to an error value if the 'goto' executes.

It returns whatever error value cr_read_obj_type() returns.  Hrm.  I
guess if the image is garbage, cr_read_obj_type can potentially return
a non-error value that still isn't the desired value, is that right?
Oren Laadan - Jan. 30, 2009, 12:11 a.m.
Nathan Lynch wrote:
> Hey Oren, thanks for taking a look.
> 
> Oren Laadan wrote:
>> Nathan Lynch wrote:
>>> What doesn't work:
>>> * restarting a 32-bit task from a 64-bit task and vice versa
>> Is there a test to bail if we attempt to checkpoint such tasks ?
> 
> No, but I'll add one if it looks too hard to fix for the next round.
> 
> 
>>> +struct cr_hdr_cpu {
>>> +	struct pt_regs pt_regs;
>> It has been suggested (as done in x86/32 code) not to use 'struct pt_regs'
>> because it "can (and has) changed on x86" and because "it only container
>> the registers that the kernel trashes, not all usermode registers".
>>
>> https://lists.linux-foundation.org/pipermail/containers/2008-August/012355.html
> 
> Yeah, I considered that discussion, but the situation is different for
> powerpc (someone on linuxppc-dev smack me if I'm wrong here :)
> pt_regs is part of the ABI, and it encompasses all user mode registers
> except for floating point, which are handled separately.
> 
> 
>>> +	/* relevant fields from thread_struct */
>>> +	double fpr[32][TS_FPRWIDTH];
>> Can TS_FPRWIDTH change between sub-archs or kernel versions ?  If so, it
>> needs to be stated explicitly.
>>
>>> +	unsigned int fpscr;
>>> +	int fpexc_mode;
>>> +	/* unsigned int align_ctl; this is never updated? */
>>> +	unsigned long dabr;
>> Are these fields always guarantee to compile to the same number of bytes
>> regardless of 32/64 bit choice of compiler (or sub-arch?) ?
>>
>> In the x86(32/64) architecture we use types with explicit size such as
>> __u32 and the like to ensure that it always compiled to the same
>> size.
> 
> Yeah, I'll have to fix these up.
> 
> 
> 
>>> +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
>>> +{
>>> +	hdr->type = type;
>>> +	hdr->len = len;
>>> +	hdr->parent = parent;
>>> +}
>>> +
>> This function is rather generic and useful to non-arch-dependent and other
>> architectures code. Perhaps put in a separate patch ?
> 
> Alright.  By the way, why are cr_hdr->type and cr_hdr->len signed
> types?
> 

No particular reason. I can change that in v14.

> 
>>> +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
>>> +{
>>> +	struct cr_hdr_cpu *cpu_hdr;
>>> +	struct pt_regs *pt_regs;
>>> +	struct cr_hdr cr_hdr;
>>> +	u32 parent;
>>> +	int ret;
>>> +
>>> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
>>> +	if (!cpu_hdr)
>>> +		return -ENOMEM;
>>> +
>>> +	parent = task_pid_vnr(t);
>>> +
>>> +	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
>>> +
>>> +	/* pt_regs: GPRs, MSR, etc */
>>> +	pt_regs = task_pt_regs(t);
>>> +	cpu_hdr->pt_regs = *pt_regs;
>>> +
>>> +	/* FP state */
>>> +	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));
>> As note above, is sizeof(cpu_hdr->fpr) the same on all chips ?
> 
> It can differ depending on kernel configuration.

So the actual size needs to be explicitly indicated (and compared with).

> 
> 
>>> +/* restart APIs */
>>> +
>> The restart APIs belong in a separate file: arch/powerpc/mm/restart.c
> 
> Explain why, please?  This isn't a lot of code, and it seems likely
> that checkpoint and restart paths will share data structures and tend
> to be modified together over time.

This one has little code, but usually that isn't the case, and many of
the data structures shared are anyway exported. Since the split makes
sense in other cases, it makes sense to follow convention.

Personally I don't have a strong opinion on this. However one of the
initial feedbacks for the existing patchset requested that I split the
functionality between files (and to separate commits).

In other words, if nobody else cries, I won't spoil it ;)

> 
> 
>>> +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
>>> +			 __func__, (unsigned long)thread_hdr->unimplemented);
>> Given the macro for 'pr_fmt' in include/linux/checkpoint.h, the use of
>> __func__ is redunant.
> 
> It seems to me that defining your own pr_fmt in a "public" header like
> that is inappropriate, or at least unconventional.  Any file that
> happens to include linux/checkpoint.h will have any prior definitions
> of pr_fmt overridden, no?
> 

Hmmm.. didn't think of it this way. Using the pr_debug() there was yet
another feedback from LKML, and it seemed reasonable to me. Can you
think of a case where linux/checkpoint.h will happen to be included
in checkpoint-related code ?

> 
>>> +	regs = task_pt_regs(current);
>>> +	*regs = cpu_hdr->pt_regs;
>>> +
>>> +	regs->msr = sanitize_msr(regs->msr);
>>> +
>>> +	/* FP state */
>>> +	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
>>> +	current->thread.fpscr.val = cpu_hdr->fpscr;
>>> +	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
>>> +
>>> +	/* debug registers */
>>> +	current->thread.dabr = cpu_hdr->dabr;
>> I'm unfamiliar with powerpc; is it necessary to sanitize any of the registers
>> here ?  For instance, can the user cause harm with specially crafted values
>> of some registers ?
> 
> I had this in mind with the treatment of MSR, but I'll check on the
> others, thanks.
> 
> 
>>> +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
>>> +{
>>> +	struct cr_hdr_mm_context *mm_hdr;
>>> +	int ret;
>>> +
>>> +	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
>>> +	if (!mm_hdr)
>>> +		return -ENOMEM;
>>> +
>>> +	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
>>> +				  CR_HDR_MM_CONTEXT);
>>> +	if (ret != rparent)
>>> +		goto out;
>> Seems like 'ret' isn't set to an error value if the 'goto' executes.
> 
> It returns whatever error value cr_read_obj_type() returns.  Hrm.  I
> guess if the image is garbage, cr_read_obj_type can potentially return
> a non-error value that still isn't the desired value, is that right?
> 

True.

Thanks,

Oren.
Serge E. Hallyn - Jan. 30, 2009, 3:55 a.m.
Quoting Nathan Lynch (ntl@pobox.com):
> The only thing of significance here is that the checkpointed task's
> pt_regs and fp state are saved and restored (see cr_write_cpu and
> cr_read_cpu); the rest of the code consists of dummy implementations
> of the APIs the arch needs to provide to the checkpoint/restart core.
> 
> What works:
> * self and external checkpoint of simple (single thread, one open
>   file) 32- and 64-bit processes on a ppc64 kernel
> 
> What doesn't work:
> * restarting a 32-bit task from a 64-bit task and vice versa
> 
> Untested:
> * ppc32 (but it builds)
> 
> Signed-off-by: Nathan Lynch <ntl@pobox.com>
> ---
>  arch/powerpc/include/asm/checkpoint_hdr.h |   40 +++++
>  arch/powerpc/mm/Makefile                  |    1 +
>  arch/powerpc/mm/checkpoint.c              |  261 +++++++++++++++++++++++++++++
>  3 files changed, 302 insertions(+), 0 deletions(-)
>  create mode 100644 arch/powerpc/include/asm/checkpoint_hdr.h
>  create mode 100644 arch/powerpc/mm/checkpoint.c
> 
> diff --git a/arch/powerpc/include/asm/checkpoint_hdr.h b/arch/powerpc/include/asm/checkpoint_hdr.h
> new file mode 100644
> index 0000000..752c53f
> --- /dev/null
> +++ b/arch/powerpc/include/asm/checkpoint_hdr.h
> @@ -0,0 +1,40 @@
> +#ifndef __ASM_PPC_CKPT_HDR_H
> +#define __ASM_PPC_CKPT_HDR_H
> +/*
> + *  Checkpoint/restart - architecture specific headers ppc
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/types.h>
> +#include <asm/ptrace.h>
> +#include <asm/mmu.h>
> +#include <asm/processor.h>
> +
> +struct cr_hdr_head_arch {
> +	__u32 unimplemented;
> +};
> +
> +struct cr_hdr_thread {
> +	__u32 unimplemented;
> +};
> +
> +struct cr_hdr_cpu {
> +	struct pt_regs pt_regs;
> +	/* relevant fields from thread_struct */
> +	double fpr[32][TS_FPRWIDTH];
> +	unsigned int fpscr;
> +	int fpexc_mode;
> +	/* unsigned int align_ctl; this is never updated? */
> +	unsigned long dabr;
> +};
> +
> +struct cr_hdr_mm_context {
> +	__u32 unimplemented;
> +};
> +
> +#endif /* __ASM_PPC_CKPT_HDR__H */
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index e7392b4..8a523a0 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -24,3 +24,4 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
>  obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
>  obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
>  obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
> +obj-$(CONFIG_CHECKPOINT_RESTART) += checkpoint.o
> diff --git a/arch/powerpc/mm/checkpoint.c b/arch/powerpc/mm/checkpoint.c
> new file mode 100644
> index 0000000..8cdff24
> --- /dev/null
> +++ b/arch/powerpc/mm/checkpoint.c
> @@ -0,0 +1,261 @@
> +/*
> + *  Checkpoint/restart - architecture specific support for powerpc.
> + *  Based on x86 implementation.
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *  Copyright 2009 IBM Corp.
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#define DEBUG 1 /* for pr_debug */
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/kernel.h>
> +#include <asm/processor.h>
> +
> +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
> +{
> +	hdr->type = type;
> +	hdr->len = len;
> +	hdr->parent = parent;
> +}
> +
> +/* dump the thread_struct of a given task */
> +int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr_thread *thread_hdr;
> +	struct cr_hdr cr_hdr;
> +	u32 parent;
> +	int ret;
> +
> +	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
> +	if (!thread_hdr)
> +		return -ENOMEM;
> +
> +	parent = task_pid_vnr(t);
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_THREAD, sizeof(*thread_hdr), parent);

All you're writing here is the vpid, right?  The arch-independent
code already stores that?

> +
> +	thread_hdr->unimplemented = 0xdeadbeef;
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, thread_hdr);
> +	cr_hbuf_put(ctx, sizeof(*thread_hdr));
> +
> +	return ret;
> +}
> +
> +/* dump the cpu state and registers of a given task */
> +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr_cpu *cpu_hdr;
> +	struct pt_regs *pt_regs;
> +	struct cr_hdr cr_hdr;
> +	u32 parent;
> +	int ret;
> +
> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> +	if (!cpu_hdr)
> +		return -ENOMEM;
> +
> +	parent = task_pid_vnr(t);
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
> +
> +	/* pt_regs: GPRs, MSR, etc */
> +	pt_regs = task_pt_regs(t);
> +	cpu_hdr->pt_regs = *pt_regs;
> +
> +	/* FP state */
> +	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));
> +	cpu_hdr->fpscr = t->thread.fpscr.val;
> +	cpu_hdr->fpexc_mode = t->thread.fpexc_mode;
> +
> +	/* Handle DABR for now, dbcr[01] later */
> +	cpu_hdr->dabr = t->thread.dabr;
> +
> +	/* ToDo: Altivec/VSX/SPE state */
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, cpu_hdr);
> +	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
> +
> +	return ret;
> +}
> +
> +int cr_write_head_arch(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head_arch *arch_hdr;
> +	struct cr_hdr cr_hdr;
> +	int ret;
> +
> +	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
> +	if (!arch_hdr)
> +		return -ENOMEM;
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_HEAD_ARCH, sizeof(*arch_hdr), 0);
> +
> +	arch_hdr->unimplemented = 0xdeadbeef;

No need to write this just for the sake of writing something, imo.

> +	ret = cr_write_obj(ctx, &cr_hdr, arch_hdr);
> +	cr_hbuf_put(ctx, sizeof(*arch_hdr));
> +
> +	return ret;
> +}
> +
> +/* dump the mm->context state */
> +int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int parent)
> +{
> +	struct cr_hdr_mm_context *mm_hdr;
> +	struct cr_hdr cr_hdr;
> +	size_t size;
> +	int ret;
> +
> +	size = sizeof(*mm_hdr);
> +
> +	mm_hdr = cr_hbuf_get(ctx, size);
> +	if (!mm_hdr)
> +		return -ENOMEM;
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_MM_CONTEXT, size, parent);
> +
> +	mm_hdr->unimplemented = 0xdeadbeef;

Again

> +	ret = cr_write_obj(ctx, &cr_hdr, mm_hdr);
> +	cr_hbuf_put(ctx, size);
> +
> +	return ret;
> +}
> +
> +/* restart APIs */
> +
> +/* read the thread_struct into the current task */
> +int cr_read_thread(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_thread *thread_hdr;
> +	int ret;
> +
> +	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
> +	if (!thread_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, thread_hdr, sizeof(*thread_hdr),
> +				  CR_HDR_THREAD);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (thread_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)thread_hdr->unimplemented);
> +		ret = -EINVAL;
> +	}

DROP

> +out:
> +	cr_hbuf_put(ctx, sizeof(*thread_hdr));
> +	return ret;
> +}
> +
> +/* Based on the MSR value from a checkpoint image, produce an MSR
> + * value that is appropriate for the restored task.  Right now we only
> + * check for MSR_SF (64-bit) for PPC64.
> + */
> +static unsigned long sanitize_msr(unsigned long msr_ckpt)
> +{
> +#ifdef CONFIG_PPC32
> +	return MSR_USER;
> +#else
> +	if (msr_ckpt & MSR_SF)
> +		return MSR_USER64;
> +	return MSR_USER32;
> +#endif
> +}
> +
> +int cr_read_cpu(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_cpu *cpu_hdr;
> +	struct pt_regs *regs;
> +	int ret;
> +
> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> +	if (!cpu_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, cpu_hdr, sizeof(*cpu_hdr),
> +				  CR_HDR_CPU);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	regs = task_pt_regs(current);
> +	*regs = cpu_hdr->pt_regs;
> +
> +	regs->msr = sanitize_msr(regs->msr);
> +
> +	/* FP state */
> +	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
> +	current->thread.fpscr.val = cpu_hdr->fpscr;
> +	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
> +
> +	/* debug registers */
> +	current->thread.dabr = cpu_hdr->dabr;
> +out:
> +	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
> +	return ret;
> +}
> +
> +int cr_read_head_arch(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head_arch *arch_hdr;
> +	int ret;
> +
> +	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
> +	if (!arch_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, arch_hdr, sizeof(*arch_hdr),
> +				  CR_HDR_HEAD_ARCH);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (arch_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected arch_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)arch_hdr->unimplemented);
> +		ret = -EINVAL;

DROP

> +	}
> +out:
> +	cr_hbuf_put(ctx, sizeof(*arch_hdr));
> +	return ret;
> +}
> +
> +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
> +{
> +	struct cr_hdr_mm_context *mm_hdr;
> +	int ret;
> +
> +	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
> +	if (!mm_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
> +				  CR_HDR_MM_CONTEXT);
> +	if (ret != rparent)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (mm_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected mm_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)mm_hdr->unimplemented);
> +		ret = -EINVAL;

DROP

> +	}
> +
> +out:
> +	cr_hbuf_put(ctx, sizeof(*mm_hdr));
> +	return ret;
> +}
> -- 
> 1.5.5
> 
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
Serge E. Hallyn - Jan. 30, 2009, 4:01 a.m.
Quoting Oren Laadan (orenl@cs.columbia.edu):
> > +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
> > +{
> > +	hdr->type = type;
> > +	hdr->len = len;
> > +	hdr->parent = parent;
> > +}
> > +
> 
> This function is rather generic and useful to non-arch-dependent and other
> architectures code. Perhaps put in a separate patch ?

BTW I disagree here - looking through the patch I found
the use of this fn to be very distracting.  To replace 3
simple in-line assignments, I have to verify the order of
4 weird-looking arguments, and actually first try to
remember what 'cr_hdr_init' is supposed to be and do?

That's not meant to be a complaint, just explanation :)

I prefer dropping its use altogether.  Since I believe
most of the functions calling it in this patch shouldn't
exist anyway (just writing 0xdeadbeef into the file), it
all the more should just go away.

-serge
Nathan Lynch - Jan. 30, 2009, 8:25 p.m.
Oren Laadan wrote:
> 
> Nathan Lynch wrote:
> > 
> > Oren Laadan wrote:
> >> Nathan Lynch wrote:
> >>> +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
> >>> +			 __func__, (unsigned long)thread_hdr->unimplemented);
> >> Given the macro for 'pr_fmt' in include/linux/checkpoint.h, the use of
> >> __func__ is redunant.
> > 
> > It seems to me that defining your own pr_fmt in a "public" header like
> > that is inappropriate, or at least unconventional.  Any file that
> > happens to include linux/checkpoint.h will have any prior definitions
> > of pr_fmt overridden, no?
> > 
> 
> Hmmm.. didn't think of it this way. Using the pr_debug() there was yet
> another feedback from LKML, and it seemed reasonable to me. Can you
> think of a case where linux/checkpoint.h will happen to be included
> in checkpoint-related code ?

(Assume you meant "included in checkpoint-unrelated code")

I could see checkpoint.h being included by files that don't
exclusively deal with C/R.  If you want a uniform debug statement
format for C/R-related code, that's fine, but this isn't the way to do
it.  See the existing users (almost all in drivers/s390).
Benjamin Herrenschmidt - Feb. 4, 2009, 3:39 a.m.
> +struct cr_hdr_cpu {
> +	struct pt_regs pt_regs;
> +	/* relevant fields from thread_struct */
> +	double fpr[32][TS_FPRWIDTH];
> +	unsigned int fpscr;
> +	int fpexc_mode;
> +	/* unsigned int align_ctl; this is never updated? */
> +	unsigned long dabr;
> +};

Is there some version or other identification somewhere ? If not there
should be. ie, we're going to add things here. For example, what about
the vector registers ? Also, some CPUs will have more HW debug registers
than just the DABR (we plan to add support for all the BookE architected
IACs and DACs for example), etc...

Ben.
Serge E. Hallyn - Feb. 4, 2009, 3:54 p.m.
Quoting Benjamin Herrenschmidt (benh@kernel.crashing.org):
> 
> > +struct cr_hdr_cpu {
> > +	struct pt_regs pt_regs;
> > +	/* relevant fields from thread_struct */
> > +	double fpr[32][TS_FPRWIDTH];
> > +	unsigned int fpscr;
> > +	int fpexc_mode;
> > +	/* unsigned int align_ctl; this is never updated? */
> > +	unsigned long dabr;
> > +};
> 
> Is there some version or other identification somewhere ? If not there
> should be. ie, we're going to add things here. For example, what about
> the vector registers ? Also, some CPUs will have more HW debug registers
> than just the DABR (we plan to add support for all the BookE architected
> IACs and DACs for example), etc...

The arch-independent checkpoint header does have kernel
maj:min:rev:patch info.  We expect to have to do more,
assuming that the .config can change the arch-dependent
cpu header (i.e. perhaps TS_FPRWIDTH could be changed).

-serge
Benjamin Herrenschmidt - Feb. 4, 2009, 8:58 p.m.
On Wed, 2009-02-04 at 09:54 -0600, Serge E. Hallyn wrote:
> Quoting Benjamin Herrenschmidt (benh@kernel.crashing.org):
> > 
> > > +struct cr_hdr_cpu {
> > > +	struct pt_regs pt_regs;
> > > +	/* relevant fields from thread_struct */
> > > +	double fpr[32][TS_FPRWIDTH];
> > > +	unsigned int fpscr;
> > > +	int fpexc_mode;
> > > +	/* unsigned int align_ctl; this is never updated? */
> > > +	unsigned long dabr;
> > > +};
> > 
> > Is there some version or other identification somewhere ? If not there
> > should be. ie, we're going to add things here. For example, what about
> > the vector registers ? Also, some CPUs will have more HW debug registers
> > than just the DABR (we plan to add support for all the BookE architected
> > IACs and DACs for example), etc...
> 
> The arch-independent checkpoint header does have kernel
> maj:min:rev:patch info.  We expect to have to do more,
> assuming that the .config can change the arch-dependent
> cpu header (i.e. perhaps TS_FPRWIDTH could be changed).

It could to a certain extent... things like VSX, VSR, or freescale SPE,
or even the Cell SPU state etc....

I wonder if we want a tagged structure so we can easily add things...

Ben.
Oren Laadan - Feb. 4, 2009, 11:44 p.m.
Benjamin Herrenschmidt wrote:
> On Wed, 2009-02-04 at 09:54 -0600, Serge E. Hallyn wrote:
>> Quoting Benjamin Herrenschmidt (benh@kernel.crashing.org):
>>>> +struct cr_hdr_cpu {
>>>> +	struct pt_regs pt_regs;
>>>> +	/* relevant fields from thread_struct */
>>>> +	double fpr[32][TS_FPRWIDTH];
>>>> +	unsigned int fpscr;
>>>> +	int fpexc_mode;
>>>> +	/* unsigned int align_ctl; this is never updated? */
>>>> +	unsigned long dabr;
>>>> +};
>>> Is there some version or other identification somewhere ? If not there
>>> should be. ie, we're going to add things here. For example, what about
>>> the vector registers ? Also, some CPUs will have more HW debug registers
>>> than just the DABR (we plan to add support for all the BookE architected
>>> IACs and DACs for example), etc...
>> The arch-independent checkpoint header does have kernel
>> maj:min:rev:patch info.  We expect to have to do more,
>> assuming that the .config can change the arch-dependent
>> cpu header (i.e. perhaps TS_FPRWIDTH could be changed).
> 
> It could to a certain extent... things like VSX, VSR, or freescale SPE,
> or even the Cell SPU state etc....
> 
> I wonder if we want a tagged structure so we can easily add things...

From the little bit I read hear, I suspect that the sub-arch classification
is best done in an arch-dependent header. I'd follow the following rule
of thumb:

* Anything that is decided at compiled time should probably go to the arch-
dependent header.

* Anything that can change at boot time (e.g., for x86 that would include
the capabilities of the FPU), or even run time (is there any ?) should
be described to the letter (in fine print) in 'struct cr_hdr_cpu' and
friends.

Oren.
Benjamin Herrenschmidt - Feb. 5, 2009, 12:16 a.m.
On Wed, 2009-02-04 at 18:44 -0500, Oren Laadan wrote:
> * Anything that is decided at compiled time should probably go to the arch-
> dependent header.
> 
> * Anything that can change at boot time (e.g., for x86 that would include
> the capabilities of the FPU), or even run time (is there any ?) should
> be described to the letter (in fine print) in 'struct cr_hdr_cpu' and
> friends.

I think we should avoid compile time completely.

We mostly try to have kernels running on everything anyway, and there's
no reason not to be able to move a snapshot to a different CPU if it's
not using a feature of the CPU that is different.

Nathan, what about you start the structure with a 64 bit bitmask that
indicates what "records" are present followed by concatenated records ?

IE. The "main" state (pt_regs) wouldn't change, but then, you could have
a list of things:

 - FPRs
 - old style VSX
 - VSRs
 - Freescale SPE state
 - DABR
 - BookE IAC/DACs
 - tbd...

Then, when resuming a snapshot, we can use some bit masks trickery
indicating the validity on a given target. IE. If CPU has no VSX and
original program uses VSX then you can't resume. But if CPU has VSR you
can.. etc... We can keep it trivial at fist, especting the same
features, and try to be smart later.

Ben.
Oren Laadan - Feb. 5, 2009, 3:30 a.m.
Benjamin Herrenschmidt wrote:
> On Wed, 2009-02-04 at 18:44 -0500, Oren Laadan wrote:
>> * Anything that is decided at compiled time should probably go to the arch-
>> dependent header.
>>
>> * Anything that can change at boot time (e.g., for x86 that would include
>> the capabilities of the FPU), or even run time (is there any ?) should
>> be described to the letter (in fine print) in 'struct cr_hdr_cpu' and
>> friends.
> 
> I think we should avoid compile time completely.

For instance, TASK_COMM_LEN is currently defined as 16; but in future
(or custom) kernel it may be different; so in the task header I put a
field that explicitly indicates this length, just in case. I think it's
useful to be able to detect such inconsistencies.

(of course this example is not arch-specific; and it would be wiser to
have one such entry for the entire checkpoint image instead of one for
each process)

I concur with the rest below.

Oren

> 
> We mostly try to have kernels running on everything anyway, and there's
> no reason not to be able to move a snapshot to a different CPU if it's
> not using a feature of the CPU that is different.
> 
> Nathan, what about you start the structure with a 64 bit bitmask that
> indicates what "records" are present followed by concatenated records ?
> 
> IE. The "main" state (pt_regs) wouldn't change, but then, you could have
> a list of things:
> 
>  - FPRs
>  - old style VSX
>  - VSRs
>  - Freescale SPE state
>  - DABR
>  - BookE IAC/DACs
>  - tbd...
> 
> Then, when resuming a snapshot, we can use some bit masks trickery
> indicating the validity on a given target. IE. If CPU has no VSX and
> original program uses VSX then you can't resume. But if CPU has VSR you
> can.. etc... We can keep it trivial at fist, especting the same
> features, and try to be smart later.
> 
> Ben.
> 
> 
>
Serge E. Hallyn - Feb. 5, 2009, 4:09 p.m.
Quoting Benjamin Herrenschmidt (benh@kernel.crashing.org):
> On Wed, 2009-02-04 at 18:44 -0500, Oren Laadan wrote:
> > * Anything that is decided at compiled time should probably go to the arch-
> > dependent header.
> > 
> > * Anything that can change at boot time (e.g., for x86 that would include
> > the capabilities of the FPU), or even run time (is there any ?) should
> > be described to the letter (in fine print) in 'struct cr_hdr_cpu' and
> > friends.
> 
> I think we should avoid compile time completely.
> 
> We mostly try to have kernels running on everything anyway, and there's
> no reason not to be able to move a snapshot to a different CPU if it's
> not using a feature of the CPU that is different.

Absolutely, but the accepted way to handle that so far is that if
you want to run an "incompatible" checkpoint image on a new cpu,
a userspace program will rewrite the image to be correct for the target
cpu.

But what you list below seems more usable than trying to encapsulate
that info in some hokey version number system.

> Nathan, what about you start the structure with a 64 bit bitmask that
> indicates what "records" are present followed by concatenated records ?
> 
> IE. The "main" state (pt_regs) wouldn't change, but then, you could have
> a list of things:
> 
>  - FPRs
>  - old style VSX
>  - VSRs
>  - Freescale SPE state
>  - DABR
>  - BookE IAC/DACs
>  - tbd...
> 
> Then, when resuming a snapshot, we can use some bit masks trickery
> indicating the validity on a given target. IE. If CPU has no VSX and
> original program uses VSX then you can't resume. But if CPU has VSR you
> can.. etc... We can keep it trivial at fist, especting the same
> features, and try to be smart later.
> 
> Ben.
>
Benjamin Herrenschmidt - Feb. 5, 2009, 9:01 p.m.
> Absolutely, but the accepted way to handle that so far is that if
> you want to run an "incompatible" checkpoint image on a new cpu,
> a userspace program will rewrite the image to be correct for the target
> cpu.

That doesn't sound nice and definitely not something we want to do on
PowerPC. There are lots of reasons for that including the fact that the
actual feature set may depend on what the FW enabled which itself can
depend on the kernel version .. or not, etc etc...

I'd rather keep all that logic in the kernel to be honest.

> But what you list below seems more usable than trying to encapsulate
> that info in some hokey version number system.

There are things however that I can't expose and for which we can't do
much about.

IE. The kernel exposes some features to userspace, such as the PowerPC
ISA level supported, the presence of some optional instructions, etc...

We don't know whether the user space program is using that stuff
though... ie, we could get into situations where userspace is trying to
use, let's say, 44x MAC instructions, and thus that program will fail to
resume on some other processor, and we have no way to know about it from
the kernel.

But I'll leave that problem for later... Maybe we should implement some
way, using personalities or something similar, to run programs so that
they see a limited set of CPU features, for people who explicitely want
them to be migrateable.. a bit similar to what our Hypervisor does if
you want a partition to be migrateable between different processors, it
only advertises the common subset of functionality.

Ben.
Nathan Lynch - Feb. 17, 2009, 7:03 a.m.
Nathan Lynch <ntl@pobox.com> wrote:
>
> Oren Laadan wrote:
> > 
> > Nathan Lynch wrote:
> > > 
> > > What doesn't work:
> > > * restarting a 32-bit task from a 64-bit task and vice versa
> > 
> > Is there a test to bail if we attempt to checkpoint such tasks ?
> 
> No, but I'll add one if it looks too hard to fix for the next round.

Unfortunately, adding a check for this is hard.

The "point of no return" in the restart path is cr_read_mm, which tears
down current's address space.  cr_read_mm runs way before cr_read_cpu,
which is the only restart method I've implemented for powerpc so far.
So, checking for this condition in cr_read_cpu is too late if I want
restart(2) to return an error and leave the caller's memory map
intact.  (And I do want this: restart should be as robust as execve.)

Well okay then, cr_read_head_arch seems to be the right place in the
restart sequence for the architecture code to handle this.  However,
cr_write_head_arch (which produces the buffer that cr_read_head_arch
consumes) is not provided a reference to the task to be checkpointed,
nor can it assume that it's operating on current.  I need a reference
to a task before I can determine whether it's running in 32- or 64-bit
mode, or using the FPU, Altivec, SPE, whatever.

In any case, mixing 32- and 64-bit tasks across restart is something I
eventually want to support, not reject.  But the problem I've outlined
applies to FPU state and vector extensions (VMX, SPE), as well as
sanity-checking debug register (DABR) contents.  We'll need to be able
to error out gracefully from restart when a checkpoint image specifies a
feature unsupported by the current kernel or hardware.  But I don't see
how to do it with the current architecture.  Am I missing something?
Serge E. Hallyn - Feb. 24, 2009, 7:58 p.m.
Quoting Nathan Lynch (ntl@pobox.com):
> Nathan Lynch <ntl@pobox.com> wrote:
> >
> > Oren Laadan wrote:
> > > 
> > > Nathan Lynch wrote:
> > > > 
> > > > What doesn't work:
> > > > * restarting a 32-bit task from a 64-bit task and vice versa
> > > 
> > > Is there a test to bail if we attempt to checkpoint such tasks ?
> > 
> > No, but I'll add one if it looks too hard to fix for the next round.
> 
> Unfortunately, adding a check for this is hard.
> 
> The "point of no return" in the restart path is cr_read_mm, which tears
> down current's address space.  cr_read_mm runs way before cr_read_cpu,
> which is the only restart method I've implemented for powerpc so far.
> So, checking for this condition in cr_read_cpu is too late if I want
> restart(2) to return an error and leave the caller's memory map
> intact.  (And I do want this: restart should be as robust as execve.)
> 
> Well okay then, cr_read_head_arch seems to be the right place in the
> restart sequence for the architecture code to handle this.  However,
> cr_write_head_arch (which produces the buffer that cr_read_head_arch
> consumes) is not provided a reference to the task to be checkpointed,
> nor can it assume that it's operating on current.  I need a reference
> to a task before I can determine whether it's running in 32- or 64-bit
> mode, or using the FPU, Altivec, SPE, whatever.
> 
> In any case, mixing 32- and 64-bit tasks across restart is something I
> eventually want to support, not reject.  But the problem I've outlined
> applies to FPU state and vector extensions (VMX, SPE), as well as
> sanity-checking debug register (DABR) contents.  We'll need to be able
> to error out gracefully from restart when a checkpoint image specifies a
> feature unsupported by the current kernel or hardware.  But I don't see
> how to do it with the current architecture.  Am I missing something?

I suspect I can guess the response to this suggestion, but how about we
accept that if sys_restart() fails due to something like this, the
task is lost and can't exit gracefully?  The end-user can then run
a user-space program to run through the checkpoint image ahead of
time (if they want) and verify whether restart can be expected to
succeed on the current hardware and os version?

If a minor difference is detected, the user-space program might
simply rewrite (a copy of) the checkpoint image into one which can
in fact succeed.

-serge
Nathan Lynch - Feb. 24, 2009, 9:11 p.m.
On Tue, 24 Feb 2009 13:58:26 -0600
"Serge E. Hallyn" <serue@us.ibm.com> wrote:

> Quoting Nathan Lynch (ntl@pobox.com):
> > Nathan Lynch <ntl@pobox.com> wrote:
> > >
> > > Oren Laadan wrote:
> > > > 
> > > > Nathan Lynch wrote:
> > > > > 
> > > > > What doesn't work:
> > > > > * restarting a 32-bit task from a 64-bit task and vice versa
> > > > 
> > > > Is there a test to bail if we attempt to checkpoint such tasks ?
> > > 
> > > No, but I'll add one if it looks too hard to fix for the next round.
> > 
> > Unfortunately, adding a check for this is hard.
> > 
> > The "point of no return" in the restart path is cr_read_mm, which tears
> > down current's address space.  cr_read_mm runs way before cr_read_cpu,
> > which is the only restart method I've implemented for powerpc so far.
> > So, checking for this condition in cr_read_cpu is too late if I want
> > restart(2) to return an error and leave the caller's memory map
> > intact.  (And I do want this: restart should be as robust as execve.)
> > 
> > Well okay then, cr_read_head_arch seems to be the right place in the
> > restart sequence for the architecture code to handle this.  However,
> > cr_write_head_arch (which produces the buffer that cr_read_head_arch
> > consumes) is not provided a reference to the task to be checkpointed,
> > nor can it assume that it's operating on current.  I need a reference
> > to a task before I can determine whether it's running in 32- or 64-bit
> > mode, or using the FPU, Altivec, SPE, whatever.
> > 
> > In any case, mixing 32- and 64-bit tasks across restart is something I
> > eventually want to support, not reject.  But the problem I've outlined
> > applies to FPU state and vector extensions (VMX, SPE), as well as
> > sanity-checking debug register (DABR) contents.  We'll need to be able
> > to error out gracefully from restart when a checkpoint image specifies a
> > feature unsupported by the current kernel or hardware.  But I don't see
> > how to do it with the current architecture.  Am I missing something?
> 
> I suspect I can guess the response to this suggestion, but how about we
> accept that if sys_restart() fails due to something like this, the
> task is lost and can't exit gracefully?

In the short term it might be necessary.  But the restart code should
forcibly kill the task instead of returning an error back up to
userspace in this case.  Once the memory map of the process has been
altered, there is no point in allowing it to continue (and likely dump
a useless core).  Btw, this failure mode seems to apply when
cr_read_files() fails, too...

But in the long term, things need to be more robust (e.g. restart(2)
returns ENOEXEC without messing with current->mm).  I think it's worth
looking at how execve operates... if I understand correctly, it sets up
a new mm_struct disconnected from the current task and activates it at
the last moment.
Oren Laadan - March 13, 2009, 3:31 a.m.
Nathan Lynch wrote:
> Nathan Lynch <ntl@pobox.com> wrote:
>> Oren Laadan wrote:
>>> Nathan Lynch wrote:
>>>> What doesn't work:
>>>> * restarting a 32-bit task from a 64-bit task and vice versa
>>> Is there a test to bail if we attempt to checkpoint such tasks ?
>> No, but I'll add one if it looks too hard to fix for the next round.
> 
> Unfortunately, adding a check for this is hard.
> 
> The "point of no return" in the restart path is cr_read_mm, which tears
> down current's address space.  cr_read_mm runs way before cr_read_cpu,
> which is the only restart method I've implemented for powerpc so far.
> So, checking for this condition in cr_read_cpu is too late if I want
> restart(2) to return an error and leave the caller's memory map
> intact.  (And I do want this: restart should be as robust as execve.)

In the case of restarting a container, I think it's ok if a restarting
tasks dies in an "ugly" way -- this will be observed and handled by the
initiating task outside the container, which will gracefully report to
the caller/user.

Even if you close this hole, then any other failure later on during
restart - even a failure to allocate kernel memory due to memory pressure,
will give that undesired effect that you are trying to avoid.

That said, any difference in the architecture that may cause restart to
fail is probably best placed in cr_write_head_arch.

> 
> Well okay then, cr_read_head_arch seems to be the right place in the
> restart sequence for the architecture code to handle this.  However,
> cr_write_head_arch (which produces the buffer that cr_read_head_arch
> consumes) is not provided a reference to the task to be checkpointed,
> nor can it assume that it's operating on current.  I need a reference
> to a task before I can determine whether it's running in 32- or 64-bit
> mode, or using the FPU, Altivec, SPE, whatever.
> 
> In any case, mixing 32- and 64-bit tasks across restart is something I
> eventually want to support, not reject.  But the problem I've outlined
> applies to FPU state and vector extensions (VMX, SPE), as well as
> sanity-checking debug register (DABR) contents.  We'll need to be able
> to error out gracefully from restart when a checkpoint image specifies a
> feature unsupported by the current kernel or hardware.  But I don't see
> how to do it with the current architecture.  Am I missing something?
> 

More specifically, I envision restart to work like this:

1) user invokes user-land utility (e.g. "cr --restart ..."
2) 'cr' will create a new container
3) 'cr' will start a child in that container
4) child will create rest of tree (in kernel or in user space - tbd)
5) each task in that tree will restore itself
6) 'cr' monitors this process
7) if all goes well - 'cr' report ok.
8) if something goes bad, 'cr' notices and notifies caller/user

so tasks that are restarting may just as well die badly - we don't care.

Does that make sense ?

Oren.
Oren Laadan - March 13, 2009, 3:36 a.m.
Nathan Lynch wrote:
> On Tue, 24 Feb 2009 13:58:26 -0600
> "Serge E. Hallyn" <serue@us.ibm.com> wrote:
> 
>> Quoting Nathan Lynch (ntl@pobox.com):
>>> Nathan Lynch <ntl@pobox.com> wrote:
>>>> Oren Laadan wrote:
>>>>> Nathan Lynch wrote:
>>>>>> What doesn't work:
>>>>>> * restarting a 32-bit task from a 64-bit task and vice versa
>>>>> Is there a test to bail if we attempt to checkpoint such tasks ?
>>>> No, but I'll add one if it looks too hard to fix for the next round.
>>> Unfortunately, adding a check for this is hard.
>>>
>>> The "point of no return" in the restart path is cr_read_mm, which tears
>>> down current's address space.  cr_read_mm runs way before cr_read_cpu,
>>> which is the only restart method I've implemented for powerpc so far.
>>> So, checking for this condition in cr_read_cpu is too late if I want
>>> restart(2) to return an error and leave the caller's memory map
>>> intact.  (And I do want this: restart should be as robust as execve.)
>>>
>>> Well okay then, cr_read_head_arch seems to be the right place in the
>>> restart sequence for the architecture code to handle this.  However,
>>> cr_write_head_arch (which produces the buffer that cr_read_head_arch
>>> consumes) is not provided a reference to the task to be checkpointed,
>>> nor can it assume that it's operating on current.  I need a reference
>>> to a task before I can determine whether it's running in 32- or 64-bit
>>> mode, or using the FPU, Altivec, SPE, whatever.
>>>
>>> In any case, mixing 32- and 64-bit tasks across restart is something I
>>> eventually want to support, not reject.  But the problem I've outlined
>>> applies to FPU state and vector extensions (VMX, SPE), as well as
>>> sanity-checking debug register (DABR) contents.  We'll need to be able
>>> to error out gracefully from restart when a checkpoint image specifies a
>>> feature unsupported by the current kernel or hardware.  But I don't see
>>> how to do it with the current architecture.  Am I missing something?
>> I suspect I can guess the response to this suggestion, but how about we
>> accept that if sys_restart() fails due to something like this, the
>> task is lost and can't exit gracefully?
> 
> In the short term it might be necessary.  But the restart code should
> forcibly kill the task instead of returning an error back up to
> userspace in this case.  Once the memory map of the process has been
> altered, there is no point in allowing it to continue (and likely dump
> a useless core).  Btw, this failure mode seems to apply when
> cr_read_files() fails, too...
> 
> But in the long term, things need to be more robust (e.g. restart(2)
> returns ENOEXEC without messing with current->mm).  I think it's worth
> looking at how execve operates... if I understand correctly, it sets up
> a new mm_struct disconnected from the current task and activates it at
> the last moment.
> 

That's a good idea, and I have considered it in the past.

However, it is easier to restarti a task in its own, new, context,
including the MM. For instance, you can leverage all memory syscalls.

An in-between way would be to switch to the new MM but not tear down
the original one, but rather save it along side. If a failure occur -
restore it.

Then, you'll have to ask the same question about all other resources -
signal handlers, open files, etc. Either you make all changes atomic
at once, or none - if you want the operation to be non-intrusive in
the case of an error.

However, I do think that this is not necessary: the tasks that are
doing the restart have been created from scratch for that purpose,
so they need not return any specific value to the user. It is the
task that initiates the restart that needs to handle error gracefully.
The scheme I proposed in the previous email does exactly that.

(This does not apply to self-restart, for obvious reasons, but that
is a special case anyway).

Oren.
Cedric Le Goater - March 13, 2009, 3:42 p.m.
> More specifically, I envision restart to work like this:
> 
> 1) user invokes user-land utility (e.g. "cr --restart ..."
> 2) 'cr' will create a new container
> 3) 'cr' will start a child in that container

process 1 in its private namespaces.

> 4) child will create rest of tree (in kernel or in user space - tbd)
> 5) each task in that tree will restore itself
> 6) 'cr' monitors this process
> 7) if all goes well - 'cr' report ok.
> 8) if something goes bad, 'cr' notices and notifies caller/user

that's MCR implementation of restart. 
 
> so tasks that are restarting may just as well die badly - we don't care.

just sigkill them, but at end, before releasing the container from 
a frozen state, we have to make sure the right number of tasks have 
restarted ... so you need to track them along the way.

C.
Nathan Lynch - March 16, 2009, 6:37 p.m.
Oren Laadan <orenl@cs.columbia.edu> wrote:
> 
> Nathan Lynch wrote:
> > Nathan Lynch <ntl@pobox.com> wrote:
> >> Oren Laadan wrote:
> >>> Nathan Lynch wrote:
> >>>> What doesn't work:
> >>>> * restarting a 32-bit task from a 64-bit task and vice versa
> >>> Is there a test to bail if we attempt to checkpoint such tasks ?
> >> No, but I'll add one if it looks too hard to fix for the next round.
> > 
> > Unfortunately, adding a check for this is hard.
> > 
> > The "point of no return" in the restart path is cr_read_mm, which tears
> > down current's address space.  cr_read_mm runs way before cr_read_cpu,
> > which is the only restart method I've implemented for powerpc so far.
> > So, checking for this condition in cr_read_cpu is too late if I want
> > restart(2) to return an error and leave the caller's memory map
> > intact.  (And I do want this: restart should be as robust as execve.)
> 
> In the case of restarting a container, I think it's ok if a restarting
> tasks dies in an "ugly" way -- this will be observed and handled by the
> initiating task outside the container, which will gracefully report to
> the caller/user.

How would task exit be observed?  Are all tasks in a restarted
container guaranteed to be children (in the sense that wait(2) would
work) of the initiating task?


> Even if you close this hole, then any other failure later on during
> restart - even a failure to allocate kernel memory due to memory pressure,
> will give that undesired effect that you are trying to avoid.

Kernel memory allocation failure is not the kind of problem I'm trying
to address.  I am trying to address the case of restarting a checkpoint
image that needs features that are not present, where the set of
features used by the checkpoint image can be compared against the set
of features the platform provides.


> That said, any difference in the architecture that may cause restart to
> fail is probably best placed in cr_write_head_arch.

I think I explained in my earlier mail why the current implementation's
cr_write_head_arch doesn't help in this case:

> > Well okay then, cr_read_head_arch seems to be the right place in the
> > restart sequence for the architecture code to handle this.  However,
> > cr_write_head_arch (which produces the buffer that cr_read_head_arch
> > consumes) is not provided a reference to the task to be checkpointed,
> > nor can it assume that it's operating on current.  I need a reference
> > to a task before I can determine whether it's running in 32- or 64-bit
> > mode, or using the FPU, Altivec, SPE, whatever.
> > 
> > In any case, mixing 32- and 64-bit tasks across restart is something I
> > eventually want to support, not reject.  But the problem I've outlined
> > applies to FPU state and vector extensions (VMX, SPE), as well as
> > sanity-checking debug register (DABR) contents.  We'll need to be able
> > to error out gracefully from restart when a checkpoint image specifies a
> > feature unsupported by the current kernel or hardware.  But I don't see
> > how to do it with the current architecture.  Am I missing something?
> > 
> 
> More specifically, I envision restart to work like this:
> 
> 1) user invokes user-land utility (e.g. "cr --restart ..."
> 2) 'cr' will create a new container
> 3) 'cr' will start a child in that container
> 4) child will create rest of tree (in kernel or in user space - tbd)
> 5) each task in that tree will restore itself
> 6) 'cr' monitors this process
> 7) if all goes well - 'cr' report ok.
> 8) if something goes bad, 'cr' notices and notifies caller/user

Again, how would 'cr' obtain exit status for these tasks, and how would
it distinguish failure from normal operation?
Cedric Le Goater - March 17, 2009, 6:55 a.m.
> Again, how would 'cr' obtain exit status for these tasks, and how would
> it distinguish failure from normal operation?

Here's our solution to this issue.

mcr maintains in its kernel container object an exitcode attribute for 
the mcr-restart process. This process is detached from the fork tree of 
the restarted application.  

when the restart is finished, an mcr-wait command can be called to reap 
this exitcode. This make it possible to distinguish an exit of the 
application process from an exit of the mcr-restart process.

This is a must-have for batch managers in an HPC environment. 

Cheers,

C.
Oren Laadan - March 18, 2009, 9:15 a.m.
An alternative: the task that created the container namely, is the parent
(outside the container) of the container init(1). In turn, init(1) creates
a special 'monitor' thread that monitors the restart, and the outside task
reaps the exit status of that thread (and only that thread).

[Hmmm... thinking about this - what happens if the container init(1) calls
clone() with CLONE_PARENT ??  does it not generate sort of a competing
container init(1) ??!!

Oren.


Cedric Le Goater wrote:
>> Again, how would 'cr' obtain exit status for these tasks, and how would
>> it distinguish failure from normal operation?
> 
> Here's our solution to this issue.
> 
> mcr maintains in its kernel container object an exitcode attribute for 
> the mcr-restart process. This process is detached from the fork tree of 
> the restarted application.  
> 
> when the restart is finished, an mcr-wait command can be called to reap 
> this exitcode. This make it possible to distinguish an exit of the 
> application process from an exit of the mcr-restart process.
> 
> This is a must-have for batch managers in an HPC environment. 
> 
> Cheers,
> 
> C.
>

Patch

diff --git a/arch/powerpc/include/asm/checkpoint_hdr.h b/arch/powerpc/include/asm/checkpoint_hdr.h
new file mode 100644
index 0000000..752c53f
--- /dev/null
+++ b/arch/powerpc/include/asm/checkpoint_hdr.h
@@ -0,0 +1,40 @@ 
+#ifndef __ASM_PPC_CKPT_HDR_H
+#define __ASM_PPC_CKPT_HDR_H
+/*
+ *  Checkpoint/restart - architecture specific headers ppc
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/processor.h>
+
+struct cr_hdr_head_arch {
+	__u32 unimplemented;
+};
+
+struct cr_hdr_thread {
+	__u32 unimplemented;
+};
+
+struct cr_hdr_cpu {
+	struct pt_regs pt_regs;
+	/* relevant fields from thread_struct */
+	double fpr[32][TS_FPRWIDTH];
+	unsigned int fpscr;
+	int fpexc_mode;
+	/* unsigned int align_ctl; this is never updated? */
+	unsigned long dabr;
+};
+
+struct cr_hdr_mm_context {
+	__u32 unimplemented;
+};
+
+#endif /* __ASM_PPC_CKPT_HDR__H */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index e7392b4..8a523a0 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -24,3 +24,4 @@  obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
+obj-$(CONFIG_CHECKPOINT_RESTART) += checkpoint.o
diff --git a/arch/powerpc/mm/checkpoint.c b/arch/powerpc/mm/checkpoint.c
new file mode 100644
index 0000000..8cdff24
--- /dev/null
+++ b/arch/powerpc/mm/checkpoint.c
@@ -0,0 +1,261 @@ 
+/*
+ *  Checkpoint/restart - architecture specific support for powerpc.
+ *  Based on x86 implementation.
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *  Copyright 2009 IBM Corp.
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define DEBUG 1 /* for pr_debug */
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/kernel.h>
+#include <asm/processor.h>
+
+static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
+{
+	hdr->type = type;
+	hdr->len = len;
+	hdr->parent = parent;
+}
+
+/* dump the thread_struct of a given task */
+int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr_thread *thread_hdr;
+	struct cr_hdr cr_hdr;
+	u32 parent;
+	int ret;
+
+	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
+	if (!thread_hdr)
+		return -ENOMEM;
+
+	parent = task_pid_vnr(t);
+
+	cr_hdr_init(&cr_hdr, CR_HDR_THREAD, sizeof(*thread_hdr), parent);
+
+	thread_hdr->unimplemented = 0xdeadbeef;
+
+	ret = cr_write_obj(ctx, &cr_hdr, thread_hdr);
+	cr_hbuf_put(ctx, sizeof(*thread_hdr));
+
+	return ret;
+}
+
+/* dump the cpu state and registers of a given task */
+int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr_cpu *cpu_hdr;
+	struct pt_regs *pt_regs;
+	struct cr_hdr cr_hdr;
+	u32 parent;
+	int ret;
+
+	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
+	if (!cpu_hdr)
+		return -ENOMEM;
+
+	parent = task_pid_vnr(t);
+
+	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
+
+	/* pt_regs: GPRs, MSR, etc */
+	pt_regs = task_pt_regs(t);
+	cpu_hdr->pt_regs = *pt_regs;
+
+	/* FP state */
+	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));
+	cpu_hdr->fpscr = t->thread.fpscr.val;
+	cpu_hdr->fpexc_mode = t->thread.fpexc_mode;
+
+	/* Handle DABR for now, dbcr[01] later */
+	cpu_hdr->dabr = t->thread.dabr;
+
+	/* ToDo: Altivec/VSX/SPE state */
+
+	ret = cr_write_obj(ctx, &cr_hdr, cpu_hdr);
+	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
+
+	return ret;
+}
+
+int cr_write_head_arch(struct cr_ctx *ctx)
+{
+	struct cr_hdr_head_arch *arch_hdr;
+	struct cr_hdr cr_hdr;
+	int ret;
+
+	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
+	if (!arch_hdr)
+		return -ENOMEM;
+
+	cr_hdr_init(&cr_hdr, CR_HDR_HEAD_ARCH, sizeof(*arch_hdr), 0);
+
+	arch_hdr->unimplemented = 0xdeadbeef;
+
+	ret = cr_write_obj(ctx, &cr_hdr, arch_hdr);
+	cr_hbuf_put(ctx, sizeof(*arch_hdr));
+
+	return ret;
+}
+
+/* dump the mm->context state */
+int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int parent)
+{
+	struct cr_hdr_mm_context *mm_hdr;
+	struct cr_hdr cr_hdr;
+	size_t size;
+	int ret;
+
+	size = sizeof(*mm_hdr);
+
+	mm_hdr = cr_hbuf_get(ctx, size);
+	if (!mm_hdr)
+		return -ENOMEM;
+
+	cr_hdr_init(&cr_hdr, CR_HDR_MM_CONTEXT, size, parent);
+
+	mm_hdr->unimplemented = 0xdeadbeef;
+
+	ret = cr_write_obj(ctx, &cr_hdr, mm_hdr);
+	cr_hbuf_put(ctx, size);
+
+	return ret;
+}
+
+/* restart APIs */
+
+/* read the thread_struct into the current task */
+int cr_read_thread(struct cr_ctx *ctx)
+{
+	struct cr_hdr_thread *thread_hdr;
+	int ret;
+
+	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
+	if (!thread_hdr)
+		return -ENOMEM;
+
+	ret = cr_read_obj_type(ctx, thread_hdr, sizeof(*thread_hdr),
+				  CR_HDR_THREAD);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+
+	if (thread_hdr->unimplemented != 0xdeadbeef) {
+		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
+			 __func__, (unsigned long)thread_hdr->unimplemented);
+		ret = -EINVAL;
+	}
+out:
+	cr_hbuf_put(ctx, sizeof(*thread_hdr));
+	return ret;
+}
+
+/* Based on the MSR value from a checkpoint image, produce an MSR
+ * value that is appropriate for the restored task.  Right now we only
+ * check for MSR_SF (64-bit) for PPC64.
+ */
+static unsigned long sanitize_msr(unsigned long msr_ckpt)
+{
+#ifdef CONFIG_PPC32
+	return MSR_USER;
+#else
+	if (msr_ckpt & MSR_SF)
+		return MSR_USER64;
+	return MSR_USER32;
+#endif
+}
+
+int cr_read_cpu(struct cr_ctx *ctx)
+{
+	struct cr_hdr_cpu *cpu_hdr;
+	struct pt_regs *regs;
+	int ret;
+
+	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
+	if (!cpu_hdr)
+		return -ENOMEM;
+
+	ret = cr_read_obj_type(ctx, cpu_hdr, sizeof(*cpu_hdr),
+				  CR_HDR_CPU);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+
+	regs = task_pt_regs(current);
+	*regs = cpu_hdr->pt_regs;
+
+	regs->msr = sanitize_msr(regs->msr);
+
+	/* FP state */
+	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
+	current->thread.fpscr.val = cpu_hdr->fpscr;
+	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
+
+	/* debug registers */
+	current->thread.dabr = cpu_hdr->dabr;
+out:
+	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
+	return ret;
+}
+
+int cr_read_head_arch(struct cr_ctx *ctx)
+{
+	struct cr_hdr_head_arch *arch_hdr;
+	int ret;
+
+	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
+	if (!arch_hdr)
+		return -ENOMEM;
+
+	ret = cr_read_obj_type(ctx, arch_hdr, sizeof(*arch_hdr),
+				  CR_HDR_HEAD_ARCH);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+
+	if (arch_hdr->unimplemented != 0xdeadbeef) {
+		pr_debug("%s: unexpected arch_hdr contents: 0x%lx\n",
+			 __func__, (unsigned long)arch_hdr->unimplemented);
+		ret = -EINVAL;
+	}
+out:
+	cr_hbuf_put(ctx, sizeof(*arch_hdr));
+	return ret;
+}
+
+int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
+{
+	struct cr_hdr_mm_context *mm_hdr;
+	int ret;
+
+	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
+	if (!mm_hdr)
+		return -ENOMEM;
+
+	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
+				  CR_HDR_MM_CONTEXT);
+	if (ret != rparent)
+		goto out;
+
+	ret = 0;
+
+	if (mm_hdr->unimplemented != 0xdeadbeef) {
+		pr_debug("%s: unexpected mm_hdr contents: 0x%lx\n",
+			 __func__, (unsigned long)mm_hdr->unimplemented);
+		ret = -EINVAL;
+	}
+
+out:
+	cr_hbuf_put(ctx, sizeof(*mm_hdr));
+	return ret;
+}