diff mbox

[U-Boot,v3] Fix board init code to use a valid C runtime environment

Message ID 1447425803-24281-1-git-send-email-albert.u.boot@aribaud.net
State Superseded
Delegated to: Tom Rini
Headers show

Commit Message

Albert ARIBAUD Nov. 13, 2015, 2:43 p.m. UTC
board_init_f_mem() alters the C runtime environment's
stack it ls actually already using. This is not a valid
C runtime environment.

Split board_init_f_mem into C functions which do not
alter their own stack and therefore run in a valid C
runtime environment.

Signed-off-by: Albert ARIBAUD <albert.u.boot@aribaud.net>
---
For NIOS2, this patch contains all manual v1 and v2
fixes by Thomas.

For x86, this patch contains all manual v2 fixes by Bin.

Changes in v3:
- Rebase after commit 9ac4fc82
- Simplify malloc conditional as per commit 9ac4fc82
- Fix NIOS2 return value register (r2, not r4)
- Fix x86 subl argument inversion
- Group allocations in a single function
- Group initializations in a single function

Changes in v2:
- Fix all checkpatch issues
- Fix board_init_f_malloc prototype mismatch
- Fix board_init_[f_]xxx typo in NIOS2
- Fix aarch64 asm 'sub' syntax error

 arch/arc/lib/start.S            | 12 +++++---
 arch/arm/lib/crt0.S             |  6 ++--
 arch/arm/lib/crt0_64.S          |  6 ++--
 arch/microblaze/cpu/start.S     |  4 +--
 arch/nios2/cpu/start.S          | 13 +++++---
 arch/powerpc/cpu/ppc4xx/start.S | 14 ++++++---
 arch/x86/cpu/start.S            |  7 +++--
 arch/x86/lib/fsp/fsp_common.c   |  4 +--
 common/init/board_init.c        | 67 +++++++++++++++++++++++++++++++++--------
 include/common.h                | 33 ++++++++------------
 10 files changed, 108 insertions(+), 58 deletions(-)

Comments

Simon Glass Nov. 14, 2015, 2:29 a.m. UTC | #1
Hi Albert,

On 13 November 2015 at 07:43, Albert ARIBAUD <albert.u.boot@aribaud.net> wrote:
> board_init_f_mem() alters the C runtime environment's
> stack it ls actually already using. This is not a valid
> C runtime environment.

I think it is doing something like alloca() and reserving space on the
stack. It does not actually change the stack pointer in C code. It
changes data that is below the stack pointer, and therefore by
definition not part of the stack, I think.

What actually goes wrong here? I read your info in the other thread
but I'm afraid I still don't understand it.

>
> Split board_init_f_mem into C functions which do not
> alter their own stack and therefore run in a valid C
> runtime environment.
>
> Signed-off-by: Albert ARIBAUD <albert.u.boot@aribaud.net>
> ---
> For NIOS2, this patch contains all manual v1 and v2
> fixes by Thomas.
>
> For x86, this patch contains all manual v2 fixes by Bin.
>
> Changes in v3:
> - Rebase after commit 9ac4fc82
> - Simplify malloc conditional as per commit 9ac4fc82
> - Fix NIOS2 return value register (r2, not r4)
> - Fix x86 subl argument inversion
> - Group allocations in a single function
> - Group initializations in a single function
>
> Changes in v2:
> - Fix all checkpatch issues
> - Fix board_init_f_malloc prototype mismatch
> - Fix board_init_[f_]xxx typo in NIOS2
> - Fix aarch64 asm 'sub' syntax error
>
>  arch/arc/lib/start.S            | 12 +++++---
>  arch/arm/lib/crt0.S             |  6 ++--
>  arch/arm/lib/crt0_64.S          |  6 ++--
>  arch/microblaze/cpu/start.S     |  4 +--
>  arch/nios2/cpu/start.S          | 13 +++++---
>  arch/powerpc/cpu/ppc4xx/start.S | 14 ++++++---
>  arch/x86/cpu/start.S            |  7 +++--
>  arch/x86/lib/fsp/fsp_common.c   |  4 +--
>  common/init/board_init.c        | 67 +++++++++++++++++++++++++++++++++--------
>  include/common.h                | 33 ++++++++------------
>  10 files changed, 108 insertions(+), 58 deletions(-)
>
> diff --git a/arch/arc/lib/start.S b/arch/arc/lib/start.S
> index 26a5934..2a99193 100644
> --- a/arch/arc/lib/start.S
> +++ b/arch/arc/lib/start.S
> @@ -54,14 +54,16 @@ ENTRY(_start)
>         mov     %sp, CONFIG_SYS_INIT_SP_ADDR
>         mov     %fp, %sp
>
> -       /* Allocate and zero GD, update SP */
> -       mov     %r0, %sp
> -       bl      board_init_f_mem
> -
> +       /* Get reserved area size, update SP and FP */
> +       bl      board_init_f_get_reserve_size
>         /* Update stack- and frame-pointers */
> -       mov     %sp, %r0
> +       sub     %sp, %sp, %r0
>         mov     %fp, %sp
>
> +       /* Initialize reserved area */
> +       mov     %r0, %sp
> +       bl      board_init_f_init_reserve
> +
>         /* Zero the one and only argument of "board_init_f" */
>         mov_s   %r0, 0
>         j       board_init_f
> diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S
> index 80548eb..4ec89a4 100644
> --- a/arch/arm/lib/crt0.S
> +++ b/arch/arm/lib/crt0.S
> @@ -82,9 +82,11 @@ ENTRY(_main)
>  #else
>         bic     sp, sp, #7      /* 8-byte alignment for ABI compliance */
>  #endif
> +       bl      board_init_f_get_reserve_size
> +       sub     sp, sp, r0
> +       mov     r9, sp
>         mov     r0, sp
> -       bl      board_init_f_mem
> -       mov     sp, r0
> +       bl      board_init_f_init_reserve
>
>         mov     r0, #0
>         bl      board_init_f
> diff --git a/arch/arm/lib/crt0_64.S b/arch/arm/lib/crt0_64.S
> index cef1c71..d0f8db8 100644
> --- a/arch/arm/lib/crt0_64.S
> +++ b/arch/arm/lib/crt0_64.S
> @@ -75,8 +75,10 @@ ENTRY(_main)
>         ldr     x0, =(CONFIG_SYS_INIT_SP_ADDR)
>  #endif
>         bic     sp, x0, #0xf    /* 16-byte alignment for ABI compliance */
> -       bl      board_init_f_mem
> -       mov     sp, x0
> +       bl      board_init_f_reserve_size
> +       sub     sp, sp, x0
> +       mov     x0, sp
> +       bl      board_init_f_init_reserve
>
>         mov     x0, #0
>         bl      board_init_f
> diff --git a/arch/microblaze/cpu/start.S b/arch/microblaze/cpu/start.S
> index 14f46a8..206be3e 100644
> --- a/arch/microblaze/cpu/start.S
> +++ b/arch/microblaze/cpu/start.S
> @@ -25,7 +25,7 @@ _start:
>
>         addi    r8, r0, __end
>         mts     rslr, r8
> -       /* TODO: Redo this code to call board_init_f_mem() */
> +       /* TODO: Redo this code to call board_init_f_*() */
>  #if defined(CONFIG_SPL_BUILD)
>         addi    r1, r0, CONFIG_SPL_STACK_ADDR
>         mts     rshr, r1
> @@ -142,7 +142,7 @@ _start:
>         ori     r12, r12, 0x1a0
>         mts     rmsr, r12
>
> -       /* TODO: Redo this code to call board_init_f_mem() */
> +       /* TODO: Redo this code to call board_init_f_*() */
>  clear_bss:
>         /* clear BSS segments */
>         addi    r5, r0, __bss_start
> diff --git a/arch/nios2/cpu/start.S b/arch/nios2/cpu/start.S
> index 54787c5..c95f1f4 100644
> --- a/arch/nios2/cpu/start.S
> +++ b/arch/nios2/cpu/start.S
> @@ -106,14 +106,17 @@ _reloc:
>         stw     r0, 4(sp)
>         mov     fp, sp
>
> -       /* Allocate and zero GD, update SP */
> +       /* Allocate and initialize reserved area, update SP */
> +       movhi   r2, %hi(board_init_f_reserve_size@h)
> +       ori     r2, r2, %lo(board_init_f_reserve_size@h)
> +       callr   r2
> +       sub     sp, sp, r2
>         mov     r4, sp
> -       movhi   r2, %hi(board_init_f_mem@h)
> -       ori     r2, r2, %lo(board_init_f_mem@h)
> +       movhi   r2, %hi(board_init_f_init_reserve@h)
> +       ori     r2, r2, %lo(board_init_f_init_reserve@h)
>         callr   r2
>
> -       /* Update stack- and frame-pointers */
> -       mov     sp, r2
> +       /* Update frame-pointer */
>         mov     fp, sp
>
>         /* Call board_init_f -- never returns */
> diff --git a/arch/powerpc/cpu/ppc4xx/start.S b/arch/powerpc/cpu/ppc4xx/start.S
> index 77d4040..cc6e4ec 100644
> --- a/arch/powerpc/cpu/ppc4xx/start.S
> +++ b/arch/powerpc/cpu/ppc4xx/start.S
> @@ -761,9 +761,10 @@ _start:
>
>         bl      cpu_init_f      /* run low-level CPU init code     (from Flash) */
>  #ifdef CONFIG_SYS_GENERIC_BOARD
> +       bl      board_init_f_reserve_size
> +       sub     r1, r1, r3
>         mr      r3, r1
> -       bl      board_init_f_mem
> -       mr      r1, r3
> +       bl      board_init_f_init_reserve
>         li      r0,0
>         stwu    r0, -4(r1)
>         stwu    r0, -4(r1)
> @@ -1037,9 +1038,14 @@ _start:
>
>         bl      cpu_init_f      /* run low-level CPU init code     (from Flash) */
>  #ifdef CONFIG_SYS_GENERIC_BOARD
> +       bl      board_init_f_gd_size
> +       sub     r1, r1, r3
>         mr      r3, r1
> -       bl      board_init_f_mem
> -       mr      r1, r3
> +       bl      board_init_f_gd
> +       bl      board_init_f_malloc_size
> +       sub     r1, r1, r3
> +       mr      r3, r1
> +       bl      board_init_f_malloc
>         stwu    r0, -4(r1)
>         stwu    r0, -4(r1)
>  #endif
> diff --git a/arch/x86/cpu/start.S b/arch/x86/cpu/start.S
> index 5b4ee79..b30383f 100644
> --- a/arch/x86/cpu/start.S
> +++ b/arch/x86/cpu/start.S
> @@ -121,10 +121,11 @@ car_init_ret:
>          * with esi holding the HOB list address returned by the FSP.
>          */
>  #endif
> -       /* Set up global data */
> +       /* Reserve 'globals' on the stack */
> +       call    board_init_f_reserve_size
> +       subl    %eax, %esp
>         mov     %esp, %eax
> -       call    board_init_f_mem
> -       mov     %eax, %esp
> +       call    board_init_f_init_reserve
>
>  #ifdef CONFIG_DEBUG_UART
>         call    debug_uart_init
> diff --git a/arch/x86/lib/fsp/fsp_common.c b/arch/x86/lib/fsp/fsp_common.c
> index c78df94..5249455 100644
> --- a/arch/x86/lib/fsp/fsp_common.c
> +++ b/arch/x86/lib/fsp/fsp_common.c
> @@ -95,8 +95,8 @@ int x86_fsp_init(void)
>                 /*
>                  * The second time we enter here, adjust the size of malloc()
>                  * pool before relocation. Given gd->malloc_base was adjusted
> -                * after the call to board_init_f_mem() in arch/x86/cpu/start.S,
> -                * we should fix up gd->malloc_limit here.
> +                * after the call to board_init_f_malloc() in arch/x86/cpu/
> +                * start.S, we should fix up gd->malloc_limit here.
>                  */
>                 gd->malloc_limit += CONFIG_FSP_SYS_MALLOC_F_LEN;
>         }
> diff --git a/common/init/board_init.c b/common/init/board_init.c
> index 1c6126d..e033253 100644
> --- a/common/init/board_init.c
> +++ b/common/init/board_init.c
> @@ -21,39 +21,80 @@ DECLARE_GLOBAL_DATA_PTR;
>  #define _USE_MEMCPY
>  #endif
>
> -/* Unfortunately x86 can't compile this code as gd cannot be assigned */
> -#ifndef CONFIG_X86
> +/* Unfortunately x86 or ARM can't compile this code as gd cannot be assigned */
> +#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD)
>  __weak void arch_setup_gd(struct global_data *gd_ptr)
>  {
>         gd = gd_ptr;
>  }
> -#endif /* !CONFIG_X86 */
> +#endif /* !CONFIG_X86 && !CONFIG_SYS_THUMB_BUILD */
>
> -ulong board_init_f_mem(ulong top)
> +/*
> + * Compute size of space to reserve on stack for use as 'globals'
> + * and return size request for reserved area.
> + *
> + * Notes:
> + *
> + * Reservation cannot be actually done from within a C function
> + * as it implies modifying the C stack.
> + *
> + */
> +
> +ulong board_init_f_get_reserve_size(void)
> +{
> +       ulong size;
> +
> +       /* Reserve GD (rounded up to a multiple of 16 bytes) */
> +       size = roundup(sizeof(struct global_data), 16);
> +
> +       /* Reserve malloc arena */
> +#if defined(CONFIG_SYS_MALLOC_F)
> +       size += CONFIG_SYS_MALLOC_F_LEN;
> +#endif
> +
> +       return size;
> +}
> +
> +/*
> + * Initialize reserved space (which has been safely allocated on the C
> + * stack from the C runtime environment handling code).
> + *
> + * Do not use 'gd->' until arch_setup_gd() has been called!
> + */
> +
> +void board_init_f_init_reserve(ulong base)
>  {
>         struct global_data *gd_ptr;
>  #ifndef _USE_MEMCPY
>         int *ptr;
>  #endif
>
> -       /* Leave space for the stack we are running with now */
> -       top -= 0x40;
> +       /*
> +        * clear GD entirely
> +        */
>
> -       top -= sizeof(struct global_data);
> -       top = ALIGN(top, 16);
> -       gd_ptr = (struct global_data *)top;
> +       gd_ptr = (struct global_data *)base;
> +       /* go down one GD plus 16-byte alignment */
> +       base += roundup(sizeof(struct global_data), 16);

But base is passed in, and on ARM sp and r9 are already set to this
address. So I don't think you can potentially adjust it here.

> +       /* zero the area */
>  #ifdef _USE_MEMCPY
>         memset(gd_ptr, '\0', sizeof(*gd));
>  #else
>         for (ptr = (int *)gd_ptr; ptr < (int *)(gd_ptr + 1); )
>                 *ptr++ = 0;
>  #endif
> +       /* set GD unless architecture did it already */
> +#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD)
>         arch_setup_gd(gd_ptr);
> +#endif
> +
> +       /*
> +        * record malloc arena start
> +        */
>
>  #if defined(CONFIG_SYS_MALLOC_F)
> -       top -= CONFIG_SYS_MALLOC_F_LEN;
> -       gd->malloc_base = top;
> +       /* go down one malloc arena */
> +       gd->malloc_base = base;
> +       base += CONFIG_SYS_MALLOC_F_LEN;
>  #endif
> -

Somewhat subtle, but you are allocating the memory in
board_init_f_get_reserve_size() in this order, from top to bottom:

global_data
CONIG_SYS_MALLOC_F_LEN

and in board_init_f_init_reserve(), from bottom to top:

global_data
CONIG_SYS_MALLOC_F_LEN

Now we put malloc() above global_data. That's fine I think. We now
have two pieces of code to keep in sync, which is probably
unavoidable.

> -       return top;
>  }
> diff --git a/include/common.h b/include/common.h
> index 09a131d..988d67a 100644
> --- a/include/common.h
> +++ b/include/common.h
> @@ -225,32 +225,25 @@ void board_init_f(ulong);
>  void board_init_r(gd_t *, ulong) __attribute__ ((noreturn));
>
>  /**
> - * board_init_f_mem() - Allocate global data and set stack position
> + * ulong board_init_f_get_reserve_size - return size of reserved area
>   *
>   * This function is called by each architecture very early in the start-up
> - * code to set up the environment for board_init_f(). It allocates space for
> - * global_data (see include/asm-generic/global_data.h) and places the stack
> - * below this.
> + * code to allow the C runtime to reserve space on the stack for writable
> + * 'globals' such as GD and the malloc arena.
>   *
> - * This function requires a stack[1] Normally this is at @top. The function
> - * starts allocating space from 64 bytes below @top. First it creates space
> - * for global_data. Then it calls arch_setup_gd() which sets gd to point to
> - * the global_data space and can reserve additional bytes of space if
> - * required). Finally it allocates early malloc() memory
> - * (CONFIG_SYS_MALLOC_F_LEN). The new top of the stack is just below this,
> - * and it returned by this function.
> + * @return:    size of reserved area
> + */
> +ulong board_init_f_get_reserve_size(void);
> +
> +/**
> + * board_init_f_init_reserve - initialize the reserved area(s)
>   *
> - * [1] Strictly speaking it would be possible to implement this function
> - * in C on many archs such that it does not require a stack. However this
> - * does not seem hugely important as only 64 byte are wasted. The 64 bytes
> - * are used to handle the calling standard which generally requires pushing
> - * addresses or registers onto the stack. We should be able to get away with
> - * less if this becomes important.
> + * This function is called once the C runtime has allocated the reserved
> + * area on the stack. It must initialize the GD at the base of that area.
>   *
> - * @top:       Top of available memory, also normally the top of the stack
> - * @return:    New stack location
> + * @base:      top from which reservation was done
>   */
> -ulong board_init_f_mem(ulong top);
> +void board_init_f_init_reserve(ulong base);
>
>  /**
>   * arch_setup_gd() - Set up the global_data pointer
> --
> 2.5.0

Regards,
Simon
Thomas Chou Nov. 14, 2015, 6:23 a.m. UTC | #2
Hi Albert,

On 2015年11月13日 22:43, Albert ARIBAUD wrote:
> +/*
> + * Initialize reserved space (which has been safely allocated on the C
> + * stack from the C runtime environment handling code).
> + *
> + * Do not use 'gd->' until arch_setup_gd() has been called!
> + */
> +
> +void board_init_f_init_reserve(ulong base)
>   {
>   	struct global_data *gd_ptr;
>   #ifndef _USE_MEMCPY
>   	int *ptr;
>   #endif
>
> -	/* Leave space for the stack we are running with now */
> -	top -= 0x40;
> +	/*
> +	 * clear GD entirely
> +	 */
>
> -	top -= sizeof(struct global_data);
> -	top = ALIGN(top, 16);
> -	gd_ptr = (struct global_data *)top;
> +	gd_ptr = (struct global_data *)base;
> +	/* go down one GD plus 16-byte alignment */
> +	base += roundup(sizeof(struct global_data), 16);

Maybe it can keep the original order, top--gd--malloc--base.

#if defined(CONFIG_SYS_MALLOC_F)
	base += CONFIG_SYS_MALLOC_F_LEN;
#endif
	gd_ptr = (struct global_data *)base;

> +	/* zero the area */
>   #ifdef _USE_MEMCPY
>   	memset(gd_ptr, '\0', sizeof(*gd));
>   #else
>   	for (ptr = (int *)gd_ptr; ptr < (int *)(gd_ptr + 1); )
>   		*ptr++ = 0;
>   #endif
> +	/* set GD unless architecture did it already */
> +#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD)
>   	arch_setup_gd(gd_ptr);
> +#endif
> +
> +	/*
> +	 * record malloc arena start
> +	 */
>
>   #if defined(CONFIG_SYS_MALLOC_F)
> -	top -= CONFIG_SYS_MALLOC_F_LEN;
> -	gd->malloc_base = top;
> +	/* go down one malloc arena */
> +	gd->malloc_base = base;

#if defined(CONFIG_SYS_MALLOC_F)
	base -= CONFIG_SYS_MALLOC_F_LEN;
	gd->malloc_base = base;
#endif

> +	base += CONFIG_SYS_MALLOC_F_LEN;

Useless statement.

Best regards,
Thomas
Albert ARIBAUD Nov. 14, 2015, 2:36 p.m. UTC | #3
Hello Simon,

On Fri, 13 Nov 2015 19:29:21 -0700, Simon Glass <sjg@chromium.org>
wrote:
> Hi Albert,
> 
> On 13 November 2015 at 07:43, Albert ARIBAUD <albert.u.boot@aribaud.net> wrote:
> > board_init_f_mem() alters the C runtime environment's
> > stack it ls actually already using. This is not a valid
> > C runtime environment.
> 
> I think it is doing something like alloca() and reserving space on the
> stack.

AFAICT, board_init_f_mem does *not* "do something like alloca()",
it does something that alloca() precisely does *not* do.

What alloca() does is, it creates a local variable on the stack, only
the size of that local is determined at run time; but that is still a
*local* variable, that will be freed like all locals when the current
function returns. The location of the alloca reservation is determined
by the compiler, which knows how which way the stack works and how much
of it is being used by the calling function already; and the resulting
pointer cannot be accessed beyond the size that was allocated.

What board_init_f_mem does is *completely* different: it wild-guesses
how much of the stack it is using, it assumes which way the stack grows,
and it purposefully accesses space that it was not given access to, and
it expects that allocation to survive upon returning to the calling
context.

Now, if you look at alloca's definition in glibc, you'll see it is
defined to be a gcc built-in, not a C function. There is a reason for
this: stack allocation within a C function requires the compiler's help,
because the compiler must be made aware of the alloca use in order to
properly free the alloca()ed space on every exit point in the function.
Only a built-in can get that level of compiler support.

board_init_f_mem's case is worse yet, as, contrary to alloca() which
only expects /local/ effects, board_init_f_mem expects the allocated
space to survive the return to the calling context. Doing this would
require *more* compiler help than alloca does, because it is not more
a matter of managing the stack pointer; it is a matter of moving
things around so that the allocated space ends up *above* the call
frame for board_init_f.

> It does not actually change the stack pointer in C code.

Non, it does not indeed; it could not, in fact, do this at all without
resorting to the same technique that is resorted to for implementing
alloca(), that is, it would have to be a built-in of gcc, as alloca()
is -- and, as I have just explained, it would require extensive help
from the compiler, as all compiler-generated epilog code would have to
account for that potential allocation.

> It changes data that is below the stack pointer, and therefore by
> definition not part of the stack, I think.

It changes data that it /assumes/ is below the stack pointer and that it
/assumes/ it can write to; but these two assumptions are unfounded, see
below.

> What actually goes wrong here? I read your info in the other thread
> but I'm afraid I still don't understand it.

At least three things are formally wrong here with board_init_f_mem:

- it assumes the stack grows downward.

- it assumes it never uses more than 0x40 bytes of the stack.

- it assumes it can write to the stack beyond what it was allocated for
  local variables, temporaries and arguments.

Each of these assumptions is unfounded in code which is supposed to be
architecture-agnostic. Not all stacks grow downward. There is no way to
ensure that only 0x40 bytes are used by board_init_f_mem. There is no
guarantee that a C function can write to its own stack beyond what it
was allocated -- in fact, there is no guarantee that a C function can
write to its stack anywhere else than its declared or alloca()ed
locals. Such writes are undefined behavior.

These assumptions are just as bad as assuming any pointer-to-long value
is valid when we could ensure it is; and we would fix such a situation,
would we not?

Same here. This patch ensures that the code in board_init_f_mem becomes
compliant with C runtime environment constraints. This requires
splitting the C code in two functions so that the first one does not
access its stack (except the allowed accesses generated by the
compiler) and the second accesses an area which is in fact not the stack
any more, thus is safe to access.

> Regards,
> Simon

Amicalement,
Albert ARIBAUD Nov. 14, 2015, 3:06 p.m. UTC | #4
Hello Thomas,

On Sat, 14 Nov 2015 14:23:31 +0800, Thomas Chou <thomas@wytron.com.tw>
wrote:
> Hi Albert,
> 
> On 2015年11月13日 22:43, Albert ARIBAUD wrote:
> > +/*
> > + * Initialize reserved space (which has been safely allocated on the C
> > + * stack from the C runtime environment handling code).
> > + *
> > + * Do not use 'gd->' until arch_setup_gd() has been called!
> > + */
> > +
> > +void board_init_f_init_reserve(ulong base)
> >   {
> >   	struct global_data *gd_ptr;
> >   #ifndef _USE_MEMCPY
> >   	int *ptr;
> >   #endif
> >
> > -	/* Leave space for the stack we are running with now */
> > -	top -= 0x40;
> > +	/*
> > +	 * clear GD entirely
> > +	 */
> >
> > -	top -= sizeof(struct global_data);
> > -	top = ALIGN(top, 16);
> > -	gd_ptr = (struct global_data *)top;
> > +	gd_ptr = (struct global_data *)base;
> > +	/* go down one GD plus 16-byte alignment */
> > +	base += roundup(sizeof(struct global_data), 16);
> 
> Maybe it can keep the original order, top--gd--malloc--base.

Actually, I inverted the two between v2 and v3 for a reason, but I
should have made it more explicit.

This reason is, for architectures which may not be able to write to
gd from arch_setup_gd(), the C runtime environment handling code (crt0.S
for ARM, etc) needs a way to easily find our where gd_ptr was allocated.

So, it was either:

- keeping *gd_ptr above the malloc arena and having to pass gd_ptr back
  to board_init_f_reserve_size's caller; but board_init_f_reserve_size
  already returns a value back to its caller, and returning two values
  from a C function takes resources (a pointer argument and a memory
  location at least);

or:
 
- putting *gd_ptr at the base of the allocated space, below the malloc
  arena, so that the C runtime environment handling code knows where to
  point gd to without incurring the cost of passing additional results
  back from board_init_f_reserve_init.

This is the reason why I placed the global data below the malloc arena.

[besides, considering that our malloc implementation starts from the
base of the arena and allocates upward, it is (admittedly very slightly)
safer to have GD placed below it than above.]

I will add a note in v4 about the reason for this (re)ordering of
allocations.

> > +	base += CONFIG_SYS_MALLOC_F_LEN;
> 
> Useless statement.

Right now it is indeed useless. However, if and when some other space
gets reserved and initialized besides GD and the malloc arena, this
statement will be needed before allocating / accessing the space
allocated above the malloc arena.

Note that this statement will not result in any useless *code*, as the
compiler's optimizer detects that base remains unused after it, and
thus removes it from the generated object code.

So I'll insist on keeping this statement, although I will add a comment
next to it explaining why it matters despite not having any visible
effect.

> Best regards,
> Thomas

Amicalement,
Simon Glass Nov. 14, 2015, 3:41 p.m. UTC | #5
Hi Albert,

On 14 November 2015 at 07:36, Albert ARIBAUD <albert.u.boot@aribaud.net> wrote:
> Hello Simon,
>
> On Fri, 13 Nov 2015 19:29:21 -0700, Simon Glass <sjg@chromium.org>
> wrote:
>> Hi Albert,
>>
>> On 13 November 2015 at 07:43, Albert ARIBAUD <albert.u.boot@aribaud.net> wrote:
>> > board_init_f_mem() alters the C runtime environment's
>> > stack it ls actually already using. This is not a valid
>> > C runtime environment.
>>
>> I think it is doing something like alloca() and reserving space on the
>> stack.
>
> AFAICT, board_init_f_mem does *not* "do something like alloca()",
> it does something that alloca() precisely does *not* do.
>
> What alloca() does is, it creates a local variable on the stack, only
> the size of that local is determined at run time; but that is still a
> *local* variable, that will be freed like all locals when the current
> function returns. The location of the alloca reservation is determined
> by the compiler, which knows how which way the stack works and how much
> of it is being used by the calling function already; and the resulting
> pointer cannot be accessed beyond the size that was allocated.
>
> What board_init_f_mem does is *completely* different: it wild-guesses
> how much of the stack it is using, it assumes which way the stack grows,
> and it purposefully accesses space that it was not given access to, and
> it expects that allocation to survive upon returning to the calling
> context.
>
> Now, if you look at alloca's definition in glibc, you'll see it is
> defined to be a gcc built-in, not a C function. There is a reason for
> this: stack allocation within a C function requires the compiler's help,
> because the compiler must be made aware of the alloca use in order to
> properly free the alloca()ed space on every exit point in the function.
> Only a built-in can get that level of compiler support.
>
> board_init_f_mem's case is worse yet, as, contrary to alloca() which
> only expects /local/ effects, board_init_f_mem expects the allocated
> space to survive the return to the calling context. Doing this would
> require *more* compiler help than alloca does, because it is not more
> a matter of managing the stack pointer; it is a matter of moving
> things around so that the allocated space ends up *above* the call
> frame for board_init_f.

OK. It doesn't matter much - I think we both understand what is going
on in the function.

>
>> It does not actually change the stack pointer in C code.
>
> Non, it does not indeed; it could not, in fact, do this at all without
> resorting to the same technique that is resorted to for implementing
> alloca(), that is, it would have to be a built-in of gcc, as alloca()
> is -- and, as I have just explained, it would require extensive help
> from the compiler, as all compiler-generated epilog code would have to
> account for that potential allocation.
>
>> It changes data that is below the stack pointer, and therefore by
>> definition not part of the stack, I think.
>
> It changes data that it /assumes/ is below the stack pointer and that it
> /assumes/ it can write to; but these two assumptions are unfounded, see
> below.
>
>> What actually goes wrong here? I read your info in the other thread
>> but I'm afraid I still don't understand it.
>
> At least three things are formally wrong here with board_init_f_mem:
>
> - it assumes the stack grows downward.
>
> - it assumes it never uses more than 0x40 bytes of the stack.
>
> - it assumes it can write to the stack beyond what it was allocated for
>   local variables, temporaries and arguments.
>
> Each of these assumptions is unfounded in code which is supposed to be
> architecture-agnostic. Not all stacks grow downward. There is no way to
> ensure that only 0x40 bytes are used by board_init_f_mem. There is no
> guarantee that a C function can write to its own stack beyond what it
> was allocated -- in fact, there is no guarantee that a C function can
> write to its stack anywhere else than its declared or alloca()ed
> locals. Such writes are undefined behavior.
>
> These assumptions are just as bad as assuming any pointer-to-long value
> is valid when we could ensure it is; and we would fix such a situation,
> would we not?
>
> Same here. This patch ensures that the code in board_init_f_mem becomes
> compliant with C runtime environment constraints. This requires
> splitting the C code in two functions so that the first one does not
> access its stack (except the allowed accesses generated by the
> compiler) and the second accesses an area which is in fact not the stack
> any more, thus is safe to access.

Reading between the lines, there isn't actually a problem with this in
practice, but it is doing things that C should not really be doing. Is
that right? To me it seems OK provided we can assume that the stack
needed to call the function is less than 64 bytes. But I'm sure there
is a spec somewhere that the current function violates. Your idea of
what constitutes the stack seems slightly different to mine - I think
of it as the data already written, whereas you think of it as data yet
to write, There is some associated uncertainty since the compiler is
free to fiddle with the stack pointer. In practice in a boot loader we
always make some architectural assumptions based on current operation.
But I agree we should minimise this.

Anyway, I think your solution only adds one more function and still
has most of the code in C rather than assembler, so that it is common
to all archs. That was my main goal with the change.

I prefer to set up gd in the C function if we can, to avoid the
oddness that gd has to be at the bottom and the associated 16-byte
alignment issue. But let's see how that goes. If we need to do it in
assembler, it might be better to require the caller to do it before
calling board_init_f_reserve().

Regards,
Simon
Thomas Chou Nov. 15, 2015, 2:11 a.m. UTC | #6
Hi Albert,

On 2015年11月14日 23:06, Albert ARIBAUD wrote:
> Hello Thomas,
>
> On Sat, 14 Nov 2015 14:23:31 +0800, Thomas Chou <thomas@wytron.com.tw>
> wrote:
>> Hi Albert,
>>
>> On 2015年11月13日 22:43, Albert ARIBAUD wrote:
>>> +/*
>>> + * Initialize reserved space (which has been safely allocated on the C
>>> + * stack from the C runtime environment handling code).
>>> + *
>>> + * Do not use 'gd->' until arch_setup_gd() has been called!
>>> + */
>>> +
>>> +void board_init_f_init_reserve(ulong base)
>>>    {
>>>    	struct global_data *gd_ptr;
>>>    #ifndef _USE_MEMCPY
>>>    	int *ptr;
>>>    #endif
>>>
>>> -	/* Leave space for the stack we are running with now */
>>> -	top -= 0x40;
>>> +	/*
>>> +	 * clear GD entirely
>>> +	 */
>>>
>>> -	top -= sizeof(struct global_data);
>>> -	top = ALIGN(top, 16);
>>> -	gd_ptr = (struct global_data *)top;
>>> +	gd_ptr = (struct global_data *)base;
>>> +	/* go down one GD plus 16-byte alignment */
>>> +	base += roundup(sizeof(struct global_data), 16);
>>
>> Maybe it can keep the original order, top--gd--malloc--base.
>
> Actually, I inverted the two between v2 and v3 for a reason, but I
> should have made it more explicit.
>
> This reason is, for architectures which may not be able to write to
> gd from arch_setup_gd(), the C runtime environment handling code (crt0.S
> for ARM, etc) needs a way to easily find our where gd_ptr was allocated.
>
> So, it was either:
>
> - keeping *gd_ptr above the malloc arena and having to pass gd_ptr back
>    to board_init_f_reserve_size's caller; but board_init_f_reserve_size
>    already returns a value back to its caller, and returning two values
>    from a C function takes resources (a pointer argument and a memory
>    location at least);
>
> or:
>
> - putting *gd_ptr at the base of the allocated space, below the malloc
>    arena, so that the C runtime environment handling code knows where to
>    point gd to without incurring the cost of passing additional results
>    back from board_init_f_reserve_init.
>
> This is the reason why I placed the global data below the malloc arena.
>
> [besides, considering that our malloc implementation starts from the
> base of the arena and allocates upward, it is (admittedly very slightly)
> safer to have GD placed below it than above.]

Agree.

>
> I will add a note in v4 about the reason for this (re)ordering of
> allocations.
>
>>> +	base += CONFIG_SYS_MALLOC_F_LEN;
>>
>> Useless statement.
>
> Right now it is indeed useless. However, if and when some other space
> gets reserved and initialized besides GD and the malloc arena, this
> statement will be needed before allocating / accessing the space
> allocated above the malloc arena.
>
> Note that this statement will not result in any useless *code*, as the
> compiler's optimizer detects that base remains unused after it, and
> thus removes it from the generated object code.
>
> So I'll insist on keeping this statement, although I will add a comment
> next to it explaining why it matters despite not having any visible
> effect.
>

OK.

Best regards,
Thomas
Albert ARIBAUD Nov. 15, 2015, 11:44 a.m. UTC | #7
Hello Simon,

On Sat, 14 Nov 2015 08:41:01 -0700, Simon Glass <sjg@chromium.org>
wrote:

> Reading between the lines, there isn't actually a problem with this in
> practice, but it is doing things that C should not really be doing. Is
> that right?

You did not even need to read between the lines :) as the very first
comment to the v1 of this patch was an explicit question whether I had
seen actual failure cause by board_init_f_mem() -- and my explicit
answer was that no, I hadn't, but as an analogy, I hadn't personally
seen any car crash caused by worn tires either, and that did not deter
me from changing worn tires as soon as I noticed them rather than wait
for the tires to fail.

> To me it seems OK provided we can assume that the stack
> needed to call the function is less than 64 bytes. But I'm sure there
> is a spec somewhere that the current function violates. Your idea of
> what constitutes the stack seems slightly different to mine - I think
> of it as the data already written, whereas you think of it as data yet
> to write, There is some associated uncertainty since the compiler is
> free to fiddle with the stack pointer. In practice in a boot loader we
> always make some architectural assumptions based on current operation.
> But I agree we should minimise this.

Ok.

> Anyway, I think your solution only adds one more function and still
> has most of the code in C rather than assembler, so that it is common
> to all archs. That was my main goal with the change.

Just to clarify, keeping as much C as possible is my goal too.
Especially, the patch keeps the size computation in a C function rather
than reverting back to computing it in crt0.S as it was 'in olden
days'; that is because computing the size in a C function is much more
flexible than it would be in assembly language, and accessible to many
more developers.

My general view of assembly vs C language is that assembly language
should only be used to perform things that cannot be done in C language
without violating the C runtime environment, or for which a C runtime
environment is not available, or which may fail depending on the
compiler implementation. This essentially covers:

- the entry point: when it is directly pointed at by the reset vector,
  it obviously does not have a C runtime environment, and it usually
  does not have one either even when it it entered from ROM code. In
  fact, the general goal of the entry point is to set up the C runtime
  environment(s).

- exception vectors: except for rare architectures for which the
  compiler provides a way to directly write interrupt routines in C,
  exception vectors have to be written in assembly language; again,
  their role *is* to set up the C runtime environment for the C
  language exception handler.

- exceptional cases where any implementation of some design in C
  conflicts with the compiler implementation. Of course, these case are
  specific to an architecture and frequently even a compiler version
  (see below for the r9-as-GD issue) and must only resort to assembly
  language for said architecture, and use as little of it as possible.
  Typically, such designs are related to an architecture's system ABI,
  which a compiler does not necessarily handle -- again, r9 as a system
  wide constant register comes to the mind of the ARM custodian -- or
  the need for specific instructions for which the compiler does not
  provide support in C (mcr and mrc are ARM examples).

Also, while the patch does add one C function, it does not change the C
code functionally overall, and adds very little assembly code, thereby
resulting in a negligible global code increase (actually, on ARM, I see
more boards experiencing a slight global code size *decrease* than boards
experiencing an *increase*).

> I prefer to set up gd in the C function if we can, to avoid the
> oddness that gd has to be at the bottom and the associated 16-byte
> alignment issue. But let's see how that goes. If we need to do it in
> assembler, it might be better to require the caller to do it before
> calling board_init_f_reserve().

Right now I only see two architectures which cannot set GD in
arch_setup_gd(): x86 and ARM; my patch takes care not to affect other
arches, nor do I have an intent to do so. They can set GD from C, and
that's fine.

Note about the ARM case:

Regarding ARM, I do change the behaviour for all of ARM, whereas the
issue of arch_setup_gd() failing to set r9 only happens for Thumb-1 code
generation and only with gcc 5.2.1 (and possibly a few other versions).

This is because, as I said elsewhere, I fear that the gcc -ffixed-reg
behaviour change is compliant with a strict interpretation of the
option (all accesses to r9 from user C code will return the same
value) and therefore I expect later gcc versions to not change the
Thumb-1 behaviour; it might even happen that this behaviour change be
'ported over' to Thumb-2 or even ARM code generation.

As the ARM custodian, I do not view tracking gcc behaviour changes as
a rewarding task, nor do I consider it a sane solution to pepper
gcc-version conditionals around some C code because its success happens
to depend on which implementation of gcc it was compiled with.

Plus, I do not know right now how LLVM handles r9 but I do remember
some hours of... fun... with it, and I don't want to discover in later
times that LLVM suddenly changed its mind about r9 handling too.

This is why, for ARM, I prefer cutting the problem short; taking the
strictest option and only trusting -ffixed-reg to provide a constant r9
for reading, not writing, from within C code; therefore, setting r9 to
GD from outside any C context; and doing all of this regardless of ARM
instruction set or compiler version or even compiler choice.

> Regards,
> Simon

Amicalement,
diff mbox

Patch

diff --git a/arch/arc/lib/start.S b/arch/arc/lib/start.S
index 26a5934..2a99193 100644
--- a/arch/arc/lib/start.S
+++ b/arch/arc/lib/start.S
@@ -54,14 +54,16 @@  ENTRY(_start)
 	mov	%sp, CONFIG_SYS_INIT_SP_ADDR
 	mov	%fp, %sp
 
-	/* Allocate and zero GD, update SP */
-	mov	%r0, %sp
-	bl	board_init_f_mem
-
+	/* Get reserved area size, update SP and FP */
+	bl	board_init_f_get_reserve_size
 	/* Update stack- and frame-pointers */
-	mov	%sp, %r0
+	sub	%sp, %sp, %r0
 	mov	%fp, %sp
 
+	/* Initialize reserved area */
+	mov	%r0, %sp
+	bl	board_init_f_init_reserve
+
 	/* Zero the one and only argument of "board_init_f" */
 	mov_s	%r0, 0
 	j	board_init_f
diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S
index 80548eb..4ec89a4 100644
--- a/arch/arm/lib/crt0.S
+++ b/arch/arm/lib/crt0.S
@@ -82,9 +82,11 @@  ENTRY(_main)
 #else
 	bic	sp, sp, #7	/* 8-byte alignment for ABI compliance */
 #endif
+	bl	board_init_f_get_reserve_size
+	sub	sp, sp, r0
+	mov	r9, sp
 	mov	r0, sp
-	bl	board_init_f_mem
-	mov	sp, r0
+	bl	board_init_f_init_reserve
 
 	mov	r0, #0
 	bl	board_init_f
diff --git a/arch/arm/lib/crt0_64.S b/arch/arm/lib/crt0_64.S
index cef1c71..d0f8db8 100644
--- a/arch/arm/lib/crt0_64.S
+++ b/arch/arm/lib/crt0_64.S
@@ -75,8 +75,10 @@  ENTRY(_main)
 	ldr	x0, =(CONFIG_SYS_INIT_SP_ADDR)
 #endif
 	bic	sp, x0, #0xf	/* 16-byte alignment for ABI compliance */
-	bl	board_init_f_mem
-	mov	sp, x0
+	bl	board_init_f_reserve_size
+	sub	sp, sp, x0
+	mov	x0, sp
+	bl	board_init_f_init_reserve
 
 	mov	x0, #0
 	bl	board_init_f
diff --git a/arch/microblaze/cpu/start.S b/arch/microblaze/cpu/start.S
index 14f46a8..206be3e 100644
--- a/arch/microblaze/cpu/start.S
+++ b/arch/microblaze/cpu/start.S
@@ -25,7 +25,7 @@  _start:
 
 	addi	r8, r0, __end
 	mts	rslr, r8
-	/* TODO: Redo this code to call board_init_f_mem() */
+	/* TODO: Redo this code to call board_init_f_*() */
 #if defined(CONFIG_SPL_BUILD)
 	addi	r1, r0, CONFIG_SPL_STACK_ADDR
 	mts	rshr, r1
@@ -142,7 +142,7 @@  _start:
 	ori	r12, r12, 0x1a0
 	mts	rmsr, r12
 
-	/* TODO: Redo this code to call board_init_f_mem() */
+	/* TODO: Redo this code to call board_init_f_*() */
 clear_bss:
 	/* clear BSS segments */
 	addi	r5, r0, __bss_start
diff --git a/arch/nios2/cpu/start.S b/arch/nios2/cpu/start.S
index 54787c5..c95f1f4 100644
--- a/arch/nios2/cpu/start.S
+++ b/arch/nios2/cpu/start.S
@@ -106,14 +106,17 @@  _reloc:
 	stw	r0, 4(sp)
 	mov	fp, sp
 
-	/* Allocate and zero GD, update SP */
+	/* Allocate and initialize reserved area, update SP */
+	movhi	r2, %hi(board_init_f_reserve_size@h)
+	ori	r2, r2, %lo(board_init_f_reserve_size@h)
+	callr	r2
+	sub	sp, sp, r2
 	mov	r4, sp
-	movhi	r2, %hi(board_init_f_mem@h)
-	ori	r2, r2, %lo(board_init_f_mem@h)
+	movhi	r2, %hi(board_init_f_init_reserve@h)
+	ori	r2, r2, %lo(board_init_f_init_reserve@h)
 	callr	r2
 
-	/* Update stack- and frame-pointers */
-	mov	sp, r2
+	/* Update frame-pointer */
 	mov	fp, sp
 
 	/* Call board_init_f -- never returns */
diff --git a/arch/powerpc/cpu/ppc4xx/start.S b/arch/powerpc/cpu/ppc4xx/start.S
index 77d4040..cc6e4ec 100644
--- a/arch/powerpc/cpu/ppc4xx/start.S
+++ b/arch/powerpc/cpu/ppc4xx/start.S
@@ -761,9 +761,10 @@  _start:
 
 	bl	cpu_init_f	/* run low-level CPU init code	   (from Flash) */
 #ifdef CONFIG_SYS_GENERIC_BOARD
+	bl	board_init_f_reserve_size
+	sub	r1, r1, r3
 	mr	r3, r1
-	bl	board_init_f_mem
-	mr	r1, r3
+	bl	board_init_f_init_reserve
 	li	r0,0
 	stwu	r0, -4(r1)
 	stwu	r0, -4(r1)
@@ -1037,9 +1038,14 @@  _start:
 
 	bl	cpu_init_f	/* run low-level CPU init code	   (from Flash) */
 #ifdef CONFIG_SYS_GENERIC_BOARD
+	bl	board_init_f_gd_size
+	sub	r1, r1, r3
 	mr	r3, r1
-	bl	board_init_f_mem
-	mr	r1, r3
+	bl	board_init_f_gd
+	bl	board_init_f_malloc_size
+	sub	r1, r1, r3
+	mr	r3, r1
+	bl	board_init_f_malloc
 	stwu	r0, -4(r1)
 	stwu	r0, -4(r1)
 #endif
diff --git a/arch/x86/cpu/start.S b/arch/x86/cpu/start.S
index 5b4ee79..b30383f 100644
--- a/arch/x86/cpu/start.S
+++ b/arch/x86/cpu/start.S
@@ -121,10 +121,11 @@  car_init_ret:
 	 * with esi holding the HOB list address returned by the FSP.
 	 */
 #endif
-	/* Set up global data */
+	/* Reserve 'globals' on the stack */
+	call	board_init_f_reserve_size
+	subl	%eax, %esp
 	mov	%esp, %eax
-	call	board_init_f_mem
-	mov	%eax, %esp
+	call	board_init_f_init_reserve
 
 #ifdef CONFIG_DEBUG_UART
 	call	debug_uart_init
diff --git a/arch/x86/lib/fsp/fsp_common.c b/arch/x86/lib/fsp/fsp_common.c
index c78df94..5249455 100644
--- a/arch/x86/lib/fsp/fsp_common.c
+++ b/arch/x86/lib/fsp/fsp_common.c
@@ -95,8 +95,8 @@  int x86_fsp_init(void)
 		/*
 		 * The second time we enter here, adjust the size of malloc()
 		 * pool before relocation. Given gd->malloc_base was adjusted
-		 * after the call to board_init_f_mem() in arch/x86/cpu/start.S,
-		 * we should fix up gd->malloc_limit here.
+		 * after the call to board_init_f_malloc() in arch/x86/cpu/
+		 * start.S, we should fix up gd->malloc_limit here.
 		 */
 		gd->malloc_limit += CONFIG_FSP_SYS_MALLOC_F_LEN;
 	}
diff --git a/common/init/board_init.c b/common/init/board_init.c
index 1c6126d..e033253 100644
--- a/common/init/board_init.c
+++ b/common/init/board_init.c
@@ -21,39 +21,80 @@  DECLARE_GLOBAL_DATA_PTR;
 #define _USE_MEMCPY
 #endif
 
-/* Unfortunately x86 can't compile this code as gd cannot be assigned */
-#ifndef CONFIG_X86
+/* Unfortunately x86 or ARM can't compile this code as gd cannot be assigned */
+#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD)
 __weak void arch_setup_gd(struct global_data *gd_ptr)
 {
 	gd = gd_ptr;
 }
-#endif /* !CONFIG_X86 */
+#endif /* !CONFIG_X86 && !CONFIG_SYS_THUMB_BUILD */
 
-ulong board_init_f_mem(ulong top)
+/*
+ * Compute size of space to reserve on stack for use as 'globals'
+ * and return size request for reserved area.
+ *
+ * Notes:
+ *
+ * Reservation cannot be actually done from within a C function
+ * as it implies modifying the C stack.
+ *
+ */
+
+ulong board_init_f_get_reserve_size(void)
+{
+	ulong size;
+
+	/* Reserve GD (rounded up to a multiple of 16 bytes) */
+	size = roundup(sizeof(struct global_data), 16);
+
+	/* Reserve malloc arena */
+#if defined(CONFIG_SYS_MALLOC_F)
+	size += CONFIG_SYS_MALLOC_F_LEN;
+#endif
+
+	return size;
+}
+
+/*
+ * Initialize reserved space (which has been safely allocated on the C
+ * stack from the C runtime environment handling code).
+ *
+ * Do not use 'gd->' until arch_setup_gd() has been called!
+ */
+
+void board_init_f_init_reserve(ulong base)
 {
 	struct global_data *gd_ptr;
 #ifndef _USE_MEMCPY
 	int *ptr;
 #endif
 
-	/* Leave space for the stack we are running with now */
-	top -= 0x40;
+	/*
+	 * clear GD entirely
+	 */
 
-	top -= sizeof(struct global_data);
-	top = ALIGN(top, 16);
-	gd_ptr = (struct global_data *)top;
+	gd_ptr = (struct global_data *)base;
+	/* go down one GD plus 16-byte alignment */
+	base += roundup(sizeof(struct global_data), 16);
+	/* zero the area */
 #ifdef _USE_MEMCPY
 	memset(gd_ptr, '\0', sizeof(*gd));
 #else
 	for (ptr = (int *)gd_ptr; ptr < (int *)(gd_ptr + 1); )
 		*ptr++ = 0;
 #endif
+	/* set GD unless architecture did it already */
+#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD)
 	arch_setup_gd(gd_ptr);
+#endif
+
+	/*
+	 * record malloc arena start
+	 */
 
 #if defined(CONFIG_SYS_MALLOC_F)
-	top -= CONFIG_SYS_MALLOC_F_LEN;
-	gd->malloc_base = top;
+	/* go down one malloc arena */
+	gd->malloc_base = base;
+	base += CONFIG_SYS_MALLOC_F_LEN;
 #endif
-
-	return top;
 }
diff --git a/include/common.h b/include/common.h
index 09a131d..988d67a 100644
--- a/include/common.h
+++ b/include/common.h
@@ -225,32 +225,25 @@  void board_init_f(ulong);
 void board_init_r(gd_t *, ulong) __attribute__ ((noreturn));
 
 /**
- * board_init_f_mem() - Allocate global data and set stack position
+ * ulong board_init_f_get_reserve_size - return size of reserved area
  *
  * This function is called by each architecture very early in the start-up
- * code to set up the environment for board_init_f(). It allocates space for
- * global_data (see include/asm-generic/global_data.h) and places the stack
- * below this.
+ * code to allow the C runtime to reserve space on the stack for writable
+ * 'globals' such as GD and the malloc arena.
  *
- * This function requires a stack[1] Normally this is at @top. The function
- * starts allocating space from 64 bytes below @top. First it creates space
- * for global_data. Then it calls arch_setup_gd() which sets gd to point to
- * the global_data space and can reserve additional bytes of space if
- * required). Finally it allocates early malloc() memory
- * (CONFIG_SYS_MALLOC_F_LEN). The new top of the stack is just below this,
- * and it returned by this function.
+ * @return:	size of reserved area
+ */
+ulong board_init_f_get_reserve_size(void);
+
+/**
+ * board_init_f_init_reserve - initialize the reserved area(s)
  *
- * [1] Strictly speaking it would be possible to implement this function
- * in C on many archs such that it does not require a stack. However this
- * does not seem hugely important as only 64 byte are wasted. The 64 bytes
- * are used to handle the calling standard which generally requires pushing
- * addresses or registers onto the stack. We should be able to get away with
- * less if this becomes important.
+ * This function is called once the C runtime has allocated the reserved
+ * area on the stack. It must initialize the GD at the base of that area.
  *
- * @top:	Top of available memory, also normally the top of the stack
- * @return:	New stack location
+ * @base:	top from which reservation was done
  */
-ulong board_init_f_mem(ulong top);
+void board_init_f_init_reserve(ulong base);
 
 /**
  * arch_setup_gd() - Set up the global_data pointer