diff mbox

[U-Boot,RFC,4/9] x86: quark: Add utility codes needed for MRC

Message ID 1422963927-32421-5-git-send-email-bmeng.cn@gmail.com
State Superseded
Delegated to: Simon Glass
Headers show

Commit Message

Bin Meng Feb. 3, 2015, 11:45 a.m. UTC
Add various utility codes needed for Quark MRC.

Signed-off-by: Bin Meng <bmeng.cn@gmail.com>

---
There are 12 checkpatch warnings in this patch, which are:

warning: arch/x86/cpu/quark/mrc_util.c,1446: Too many leading tabs - consider code refactoring
warning: arch/x86/cpu/quark/mrc_util.c,1450: line over 80 characters
...

Fixing 'Too many leading tabs ...' will be very dangerous, as I don't have
all the details on how Intel's MRC codes are actually written to play with
the hardware. Trying to refactor them may lead to a non-working MRC codes.
For the 'line over 80 characters' issue, we have to leave them as is now
due to the 'Too many leading tabs ...', sigh.

 arch/x86/cpu/quark/hte.c      |  398 +++++++++++
 arch/x86/cpu/quark/hte.h      |   44 ++
 arch/x86/cpu/quark/mrc_util.c | 1499 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/cpu/quark/mrc_util.h |  153 +++++
 4 files changed, 2094 insertions(+)
 create mode 100644 arch/x86/cpu/quark/hte.c
 create mode 100644 arch/x86/cpu/quark/hte.h
 create mode 100644 arch/x86/cpu/quark/mrc_util.c
 create mode 100644 arch/x86/cpu/quark/mrc_util.h

Comments

Simon Glass Feb. 4, 2015, 4:24 p.m. UTC | #1
Hi Bin,

On 3 February 2015 at 04:45, Bin Meng <bmeng.cn@gmail.com> wrote:
> Add various utility codes needed for Quark MRC.
>
> Signed-off-by: Bin Meng <bmeng.cn@gmail.com>
>
> ---
> There are 12 checkpatch warnings in this patch, which are:
>
> warning: arch/x86/cpu/quark/mrc_util.c,1446: Too many leading tabs - consider code refactoring
> warning: arch/x86/cpu/quark/mrc_util.c,1450: line over 80 characters
> ...
>
> Fixing 'Too many leading tabs ...' will be very dangerous, as I don't have
> all the details on how Intel's MRC codes are actually written to play with
> the hardware. Trying to refactor them may lead to a non-working MRC codes.
> For the 'line over 80 characters' issue, we have to leave them as is now
> due to the 'Too many leading tabs ...', sigh.

The code looks fine for the most part - I only have nits.

I'm not keen on BIT though. See my comments and what improvements you
can make. It would be great to drop BIT.

Re the debug macros, I suppose they are OK to keep. U-Boot doesn't
have the concept of debug() for different categories or levels of
verbosity.

>
>  arch/x86/cpu/quark/hte.c      |  398 +++++++++++
>  arch/x86/cpu/quark/hte.h      |   44 ++
>  arch/x86/cpu/quark/mrc_util.c | 1499 +++++++++++++++++++++++++++++++++++++++++
>  arch/x86/cpu/quark/mrc_util.h |  153 +++++
>  4 files changed, 2094 insertions(+)
>  create mode 100644 arch/x86/cpu/quark/hte.c
>  create mode 100644 arch/x86/cpu/quark/hte.h
>  create mode 100644 arch/x86/cpu/quark/mrc_util.c
>  create mode 100644 arch/x86/cpu/quark/mrc_util.h
>
> diff --git a/arch/x86/cpu/quark/hte.c b/arch/x86/cpu/quark/hte.c
> new file mode 100644
> index 0000000..d813c9c
> --- /dev/null
> +++ b/arch/x86/cpu/quark/hte.c
> @@ -0,0 +1,398 @@
> +/*
> + * Copyright (C) 2013, Intel Corporation
> + * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
> + *
> + * Ported from Intel released Quark UEFI BIOS
> + * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/

Remove training slash?

> + *
> + * SPDX-License-Identifier:    Intel
> + */
> +
> +#include <common.h>
> +#include <asm/arch/mrc.h>
> +#include <asm/arch/msg_port.h>
> +#include "mrc_util.h"
> +#include "hte.h"
> +
> +/**
> + * This function enables HTE to detect all possible errors for

s/This function// globally

I'd suggest present tense, like "enable HTE to detect all possible errors for...

> + * the given training parameters (per-bit or full byte lane).
> + */
> +static void hte_enable_all_errors(void)
> +{
> +       msg_port_write(HTE, 0x000200A2, 0xFFFFFFFF);
> +       msg_port_write(HTE, 0x000200A3, 0x000000FF);
> +       msg_port_write(HTE, 0x000200A4, 0x00000000);

Lower case hex again.

> +}
> +
> +/**
> + * This function goes and reads the HTE register in order to find any error
> + *
> + * @return: The errors detected in the HTE status register
> + */
> +static u32 hte_check_errors(void)
> +{
> +       return msg_port_read(HTE, 0x000200A7);
> +}
> +
> +/**
> + * This function waits until HTE finishes
> + */
> +static void hte_wait_for_complete(void)
> +{
> +       u32 tmp;
> +
> +       ENTERFN();
> +
> +       do {} while ((msg_port_read(HTE, 0x00020012) & BIT30) != 0);
> +
> +       tmp = msg_port_read(HTE, 0x00020011);
> +       tmp |= BIT9;
> +       tmp &= ~(BIT12 | BIT13);
> +       msg_port_write(HTE, 0x00020011, tmp);
> +
> +       LEAVEFN();
> +}
> +
> +/**
> + * This function clears registers related with errors in the HTE
> + */
> +static void hte_clear_error_regs(void)
> +{
> +       u32 tmp;
> +
> +       /*
> +        * Clear all HTE errors and enable error checking
> +        * for burst and chunk.
> +        */
> +       tmp = msg_port_read(HTE, 0x000200A1);
> +       tmp |= BIT8;
> +       msg_port_write(HTE, 0x000200A1, tmp);
> +}
> +
> +/**
> + * This function executes basic single cache line memory write/read/verify
> + * test using simple constant pattern, different for READ_RAIN and

REAS_TRAIN?

> + * WRITE_TRAIN modes.
> + *
> + * See hte_basic_write_read() which is external visible wrapper.

the external (fix below also)

> + *
> + * @mrc_params: host struture for all MRC global data
> + * @addr: memory adress being tested (must hit specific channel/rank)
> + * @first_run: if set then hte registers are configured, otherwise it is

the hte?

> + *             assumed configuration is done and just re-run the test

assumed configuration is done the we just re-run the test
,
(fix below also)

> + * @mode: READ_TRAIN or WRITE_TRAIN (the difference is in the pattern)
> + *
> + * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
> + */
> +static u16 hte_basic_data_cmp(struct mrc_params *mrc_params, u32 addr,
> +                             u8 first_run, u8 mode)
> +{
> +       u32 pattern;
> +       u32 offset;
> +
> +       if (first_run) {
> +               msg_port_write(HTE, 0x00020020, 0x01B10021);
> +               msg_port_write(HTE, 0x00020021, 0x06000000);
> +               msg_port_write(HTE, 0x00020022, addr >> 6);
> +               msg_port_write(HTE, 0x00020062, 0x00800015);
> +               msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
> +               msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
> +               msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
> +               msg_port_write(HTE, 0x00020061, 0x00030008);
> +
> +               if (mode == WRITE_TRAIN)
> +                       pattern = 0xC33C0000;
> +               else /* READ_TRAIN */
> +                       pattern = 0xAA5555AA;
> +
> +               for (offset = 0x80; offset <= 0x8F; offset++)
> +                       msg_port_write(HTE, offset, pattern);
> +       }
> +
> +       msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
> +       msg_port_write(HTE, 0x00020011, 0x00011000);
> +       msg_port_write(HTE, 0x00020011, 0x00011100);
> +
> +       hte_wait_for_complete();
> +
> +       /*
> +        * Return bits 15:8 of HTE_CH0_ERR_XSTAT to check for
> +        * any bytelane errors.
> +        */
> +       return (hte_check_errors() >> 8) & 0xFF;
> +}
> +
> +/**
> + * This function examines single cache line memory with write/read/verify
> + * test using multiple data patterns (victim-aggressor algorithm).
> + *
> + * See hte_write_stress_bit_lanes() which is external visible wrapper.
> + *
> + * @mrc_params: host struture for all MRC global data

structure

> + * @addr: memory adress being tested (must hit specific channel/rank)
> + * @loop_cnt: number of test iterations
> + * @seed_victim: victim data pattern seed
> + * @seed_aggressor: aggressor data pattern seed
> + * @victim_bit: should be 0 as auto rotate feature is in use

auto-rotate

> + * @first_run: if set then hte registers are configured, otherwise it is

Actually I wonder if HTE would be better than hte, which looks like a
'the' typo, particularly if you leave out 'the'. Also can you please
comment at the top of the file (first function) what HTE stands for)?

> + *             assumed configuration is done and just re-run the test
> + *
> + * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
> + */
> +static u16 hte_rw_data_cmp(struct mrc_params *mrc_params, u32 addr,
> +                          u8 loop_cnt, u32 seed_victim, u32 seed_aggressor,
> +                          u8 victim_bit, u8 first_run)
> +{
> +       u32 offset;
> +       u32 tmp;
> +
> +       if (first_run) {
> +               msg_port_write(HTE, 0x00020020, 0x00910024);
> +               msg_port_write(HTE, 0x00020023, 0x00810024);
> +               msg_port_write(HTE, 0x00020021, 0x06070000);
> +               msg_port_write(HTE, 0x00020024, 0x06070000);
> +               msg_port_write(HTE, 0x00020022, addr >> 6);
> +               msg_port_write(HTE, 0x00020025, addr >> 6);
> +               msg_port_write(HTE, 0x00020062, 0x0000002A);
> +               msg_port_write(HTE, 0x00020063, seed_victim);
> +               msg_port_write(HTE, 0x00020064, seed_aggressor);
> +               msg_port_write(HTE, 0x00020065, seed_victim);
> +
> +               /*
> +                * Write the pattern buffers to select the victim bit
> +                *
> +                * Start with bit0
> +                */
> +               for (offset = 0x80; offset <= 0x8F; offset++) {
> +                       if ((offset % 8) == victim_bit)
> +                               msg_port_write(HTE, offset, 0x55555555);
> +                       else
> +                               msg_port_write(HTE, offset, 0xCCCCCCCC);
> +               }
> +
> +               msg_port_write(HTE, 0x00020061, 0x00000000);
> +               msg_port_write(HTE, 0x00020066, 0x03440000);
> +               msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
> +       }
> +
> +       tmp = 0x10001000 | (loop_cnt << 16);
> +       msg_port_write(HTE, 0x00020011, tmp);
> +       msg_port_write(HTE, 0x00020011, tmp | BIT8);
> +
> +       hte_wait_for_complete();
> +
> +       /*
> +        * Return bits 15:8 of HTE_CH0_ERR_XSTAT to check for
> +        * any bytelane errors.
> +        */
> +       return (hte_check_errors() >> 8) & 0xFF;
> +}
> +
> +/**
> + * This function uses HW HTE engine to initialize or test all memory attached
> + * to a given DUNIT. If flag is MRC_MEM_INIT, this routine writes 0s to all
> + * memory locations to initialize ECC. If flag is MRC_MEM_TEST, this routine
> + * will send an 5AA55AA5 pattern to all memory locations on the RankMask and
> + * then read it back. Then it sends an A55AA55A pattern to all memory locations
> + * on the RankMask and reads it back.
> + *
> + * @mrc_params: host struture for all MRC global data
> + * @flag: MRC_MEM_INIT or MRC_MEM_TEST
> + *
> + * @return: errors register showing HTE failures. Also prints out which rank
> + *          failed the HTE test if failure occurs. For rank detection to work,
> + *          the address map must be left in its default state. If MRC changes
> + *          the address map, this function must be modified to change it back
> + *          to default at the beginning, then restore it at the end.
> + */
> +u32 hte_mem_init(struct mrc_params *mrc_params, u8 flag)
> +{
> +       u32 offset;
> +       int test_num;
> +       int i;
> +
> +       /*
> +        * Clear out the error registers at the start of each memory
> +        * init or memory test run.
> +        */
> +       hte_clear_error_regs();
> +
> +       msg_port_write(HTE, 0x00020062, 0x00000015);
> +
> +       for (offset = 0x80; offset <= 0x8F; offset++)
> +               msg_port_write(HTE, offset, ((offset & 1) ? 0xA55A : 0x5AA5));
> +
> +       msg_port_write(HTE, 0x00020021, 0x00000000);
> +       msg_port_write(HTE, 0x00020022, (mrc_params->mem_size >> 6) - 1);
> +       msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
> +       msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
> +       msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
> +       msg_port_write(HTE, 0x00020066, 0x03000000);
> +
> +       switch (flag) {
> +       case MRC_MEM_INIT:
> +               /*
> +                * Only 1 write pass through memory is needed
> +                * to initialize ECC
> +                */
> +               test_num = 1;
> +               break;
> +       case MRC_MEM_TEST:
> +               /* Write/read then write/read with inverted pattern */
> +               test_num = 4;
> +               break;
> +       default:
> +               DPF(D_INFO, "Unknown parameter for flag: %d\n", flag);
> +               return 0xFFFFFFFF;
> +       }
> +
> +       DPF(D_INFO, "hte_mem_init");

debug()

> +
> +       for (i = 0; i < test_num; i++) {
> +               DPF(D_INFO, ".");
> +
> +               if (i == 0) {
> +                       msg_port_write(HTE, 0x00020061, 0x00000000);
> +                       msg_port_write(HTE, 0x00020020, 0x00110010);
> +               } else if (i == 1) {
> +                       msg_port_write(HTE, 0x00020061, 0x00000000);
> +                       msg_port_write(HTE, 0x00020020, 0x00010010);
> +               } else if (i == 2) {
> +                       msg_port_write(HTE, 0x00020061, 0x00010100);
> +                       msg_port_write(HTE, 0x00020020, 0x00110010);
> +               } else {
> +                       msg_port_write(HTE, 0x00020061, 0x00010100);
> +                       msg_port_write(HTE, 0x00020020, 0x00010010);
> +               }
> +
> +               msg_port_write(HTE, 0x00020011, 0x00111000);
> +               msg_port_write(HTE, 0x00020011, 0x00111100);
> +
> +               hte_wait_for_complete();
> +
> +               /* If this is a READ pass, check for errors at the end */
> +               if ((i % 2) == 1) {
> +                       /* Return immediately if error */
> +                       if (hte_check_errors())
> +                               break;
> +               }
> +       }
> +
> +       DPF(D_INFO, "done\n");
> +
> +       return hte_check_errors();
> +}
> +
> +/**
> + * This function executes basic single cache line memory write/read/verify

'executes a basic'

> + * test using simple constant pattern, different for READ_RAIN and
> + * WRITE_TRAIN modes.
> + *
> + * @mrc_params: host struture for all MRC global data

structure, please fix globally

> + * @addr: memory adress being tested (must hit specific channel/rank)
> + * @first_run: if set then hte registers are configured, otherwise it is
> + *             assumed configuration is done and just re-run the test
> + * @mode: READ_TRAIN or WRITE_TRAIN (the difference is in the pattern)
> + *
> + * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
> + */
> +u16 hte_basic_write_read(struct mrc_params *mrc_params, u32 addr,
> +                        u8 first_run, u8 mode)

Why do we use u8 for these? Would uint be good enough? Just a suggestion.

> +{
> +       u16 errors;
> +
> +       ENTERFN();
> +
> +       /* Enable all error reporting in preparation for HTE test */
> +       hte_enable_all_errors();
> +       hte_clear_error_regs();
> +
> +       errors = hte_basic_data_cmp(mrc_params, addr, first_run, mode);
> +
> +       LEAVEFN();
> +
> +       return errors;
> +}
> +
> +/**
> + * This function examines single cache line memory with write/read/verify

examines a single-cache-line memory

(at least I think this is what it is saying)

> + * test using multiple data patterns (victim-aggressor algorithm).
> + *
> + * @mrc_params: host struture for all MRC global data
> + * @addr: memory adress being tested (must hit specific channel/rank)
> + * @first_run: if set then hte registers are configured, otherwise it is
> + *             assumed configuration is done and just re-run the test
> + *
> + * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
> + */
> +u16 hte_write_stress_bit_lanes(struct mrc_params *mrc_params,
> +                              u32 addr, u8 first_run)
> +{
> +       u16 errors;
> +       u8 victim_bit = 0;
> +
> +       ENTERFN();
> +
> +       /* Enable all error reporting in preparation for HTE test */
> +       hte_enable_all_errors();
> +       hte_clear_error_regs();
> +
> +       /*
> +        * Loop through each bit in the bytelane.
> +        *
> +        * Each pass creates a victim bit while keeping all other bits the same
> +        * as aggressors. AVN HTE adds an auto-rotate feature which allows us
> +        * to program the entire victim/aggressor sequence in 1 step.

What is AVN?

> +        *
> +        * The victim bit rotates on each pass so no need to have software
> +        * implement a victim bit loop like on VLV.

VLV? I think it is sometimes better to write these out and put the
abbreviation in brackets after it, at least once in the file.

> +        */
> +       errors = hte_rw_data_cmp(mrc_params, addr, HTE_LOOP_CNT,
> +                                HTE_LFSR_VICTIM_SEED, HTE_LFSR_AGRESSOR_SEED,
> +                                victim_bit, first_run);
> +
> +       LEAVEFN();
> +
> +       return errors;
> +}
> +
> +/**
> + * This function execute basic single cache line memory write or read.

as above

> + * This is just for receive enable / fine write levelling purpose.

write-levelling (I think that's what you mean)

> + *
> + * @addr: memory adress being tested (must hit specific channel/rank)
> + * @first_run: if set then hte registers are configured, otherwise it is
> + *             assumed configuration is done and just re-run the test
> + * @is_write: when non-zero memory write operation executed, otherwise read
> + */
> +void hte_mem_op(u32 addr, u8 first_run, u8 is_write)
> +{
> +       u32 offset;
> +       u32 tmp;
> +
> +       hte_enable_all_errors();
> +       hte_clear_error_regs();
> +
> +       if (first_run) {
> +               tmp = is_write ? 0x01110021 : 0x01010021;
> +               msg_port_write(HTE, 0x00020020, tmp);
> +
> +               msg_port_write(HTE, 0x00020021, 0x06000000);
> +               msg_port_write(HTE, 0x00020022, addr >> 6);
> +               msg_port_write(HTE, 0x00020062, 0x00800015);
> +               msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
> +               msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
> +               msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
> +               msg_port_write(HTE, 0x00020061, 0x00030008);
> +
> +               for (offset = 0x80; offset <= 0x8F; offset++)
> +                       msg_port_write(HTE, offset, 0xC33C0000);
> +       }
> +
> +       msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
> +       msg_port_write(HTE, 0x00020011, 0x00011000);
> +       msg_port_write(HTE, 0x00020011, 0x00011100);
> +
> +       hte_wait_for_complete();
> +}
> diff --git a/arch/x86/cpu/quark/hte.h b/arch/x86/cpu/quark/hte.h
> new file mode 100644
> index 0000000..3a173ea
> --- /dev/null
> +++ b/arch/x86/cpu/quark/hte.h
> @@ -0,0 +1,44 @@
> +/*
> + * Copyright (C) 2013, Intel Corporation
> + * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
> + *
> + * Ported from Intel released Quark UEFI BIOS
> + * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
> + *
> + * SPDX-License-Identifier:    Intel
> + */
> +
> +#ifndef _HTE_H_
> +#define _HTE_H_
> +
> +enum {
> +       MRC_MEM_INIT,
> +       MRC_MEM_TEST
> +};
> +
> +enum {
> +       READ_TRAIN,
> +       WRITE_TRAIN
> +};
> +
> +/*
> + * EXP_LOOP_CNT field of HTE_CMD_CTL
> + *
> + * This CANNOT be less than 4!
> + */
> +#define HTE_LOOP_CNT           5
> +
> +/* random seed for victim */
> +#define HTE_LFSR_VICTIM_SEED   0xF294BA21
> +
> +/* random seed for aggressor */
> +#define HTE_LFSR_AGRESSOR_SEED 0xEBA7492D
> +
> +u32 hte_mem_init(struct mrc_params *mrc_params, u8 flag);
> +u16 hte_basic_write_read(struct mrc_params *mrc_params, u32 addr,
> +                        u8 first_run, u8 mode);
> +u16 hte_write_stress_bit_lanes(struct mrc_params *mrc_params,
> +                              u32 addr, u8 first_run);
> +void hte_mem_op(u32 addr, u8 first_run, u8 is_write);

Can you move the comments from the .c to the .h for these exported functions?


> +
> +#endif /* _HTE_H_ */
> diff --git a/arch/x86/cpu/quark/mrc_util.c b/arch/x86/cpu/quark/mrc_util.c
> new file mode 100644
> index 0000000..1ae42d6
> --- /dev/null
> +++ b/arch/x86/cpu/quark/mrc_util.c
> @@ -0,0 +1,1499 @@
> +/*
> + * Copyright (C) 2013, Intel Corporation
> + * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
> + *
> + * Ported from Intel released Quark UEFI BIOS
> + * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
> + *
> + * SPDX-License-Identifier:    Intel
> + */
> +
> +#include <common.h>
> +#include <asm/arch/device.h>
> +#include <asm/arch/mrc.h>
> +#include <asm/arch/msg_port.h>
> +#include "mrc_util.h"
> +#include "hte.h"
> +#include "smc.h"
> +
> +static const uint8_t vref_codes[64] = {
> +       /* lowest to highest */
> +       0x3F, 0x3E, 0x3D, 0x3C, 0x3B, 0x3A, 0x39, 0x38,
> +       0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30,
> +       0x2F, 0x2E, 0x2D, 0x2C, 0x2B, 0x2A, 0x29, 0x28,
> +       0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20,
> +       0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> +       0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
> +       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
> +       0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
> +};
> +
> +void mrc_write_mask(u32 unit, u32 addr, u32 data, u32 mask)
> +{
> +       msg_port_write(unit, addr,
> +                      (msg_port_read(unit, addr) & ~(mask)) |
> +                      ((data) & (mask)));
> +}
> +
> +void mrc_alt_write_mask(u32 unit, u32 addr, u32 data, u32 mask)
> +{
> +       msg_port_alt_write(unit, addr,
> +                          (msg_port_alt_read(unit, addr) & ~(mask)) |
> +                          ((data) & (mask)));
> +}
> +
> +void mrc_post_code(uint8_t major, uint8_t minor)
> +{
> +       /* send message to UART */
> +       DPF(D_INFO, "POST: 0x%01x%02x\n", major, minor);
> +
> +       /* error check */
> +       if (major == 0xEE)
> +               hang();
> +}
> +
> +/* Delay number of nanoseconds */
> +void delay_n(uint32_t ns)
> +{
> +       /* 1000 MHz clock has 1ns period --> no conversion required */
> +       uint64_t final_tsc = rdtsc();

blank line here after declarations end

> +       final_tsc += ((get_tbclk_mhz() * ns) / 1000);
> +
> +       while (rdtsc() < final_tsc)
> +               ;
> +}
> +
> +/* Delay number of microseconds */
> +void delay_u(uint32_t ms)
> +{
> +       /* 64-bit math is not an option, just use loops */
> +       while (ms--)
> +               delay_n(1000);
> +}

Some day I suspect these could be pulled out into general x86
functions. Let's see if anything else needs them first.

> +
> +/* Select Memory Manager as the source for PRI interface */
> +void select_mem_mgr(void)
> +{
> +       u32 dco;
> +
> +       ENTERFN();
> +
> +       dco = msg_port_read(MEM_CTLR, DCO);
> +       dco &= ~BIT28;

~(1 << 28)

Ah but I see you are using this everywhere.

U-Boot tries to avoid defining this sort of thing. See some comments
below about this.

> +       msg_port_write(MEM_CTLR, DCO, dco);
> +
> +       LEAVEFN();
> +}
> +
> +/* Select HTE as the source for PRI interface */
> +void select_hte(void)
> +{
> +       u32 dco;
> +
> +       ENTERFN();
> +
> +       dco = msg_port_read(MEM_CTLR, DCO);
> +       dco |= BIT28;
> +       msg_port_write(MEM_CTLR, DCO, dco);
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * Send DRAM command
> + * data should be formated using DCMD_Xxxx macro or emrsXCommand structure
> + */
> +void dram_init_command(uint32_t data)
> +{
> +       pci_write_config_dword(QUARK_HOST_BRIDGE, MSG_DATA_REG, data);
> +       pci_write_config_dword(QUARK_HOST_BRIDGE, MSG_CTRL_EXT_REG, 0);
> +       msg_port_setup(MSG_OP_DRAM_INIT, MEM_CTLR, 0);
> +
> +       DPF(D_REGWR, "WR32 %03X %08X %08X\n", MEM_CTLR, 0, data);
> +}
> +
> +/* Send DRAM wake command using special MCU side-band WAKE opcode */
> +void dram_wake_command(void)
> +{
> +       ENTERFN();
> +
> +       msg_port_setup(MSG_OP_DRAM_WAKE, MEM_CTLR, 0);
> +
> +       LEAVEFN();
> +}
> +
> +void training_message(uint8_t channel, uint8_t rank, uint8_t byte_lane)
> +{
> +       /* send message to UART */
> +       DPF(D_INFO, "CH%01X RK%01X BL%01X\n", channel, rank, byte_lane);
> +}
> +
> +/*
> + * This function will program the RCVEN delays
> + *
> + * (currently doesn't comprehend rank)
> + */
> +void set_rcvn(uint8_t channel, uint8_t rank,
> +             uint8_t byte_lane, uint32_t pi_count)

reformat to 80cols. Should this or any other function in this file be static?

> +{
> +       uint32_t reg;
> +       uint32_t msk;
> +       uint32_t temp;
> +
> +       ENTERFN();
> +
> +       DPF(D_TRN, "Rcvn ch%d rnk%d ln%d : pi=%03X\n",
> +           channel, rank, byte_lane, pi_count);
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * BL0 -> B01PTRCTL0[11:08] (0x0-0xF)
> +        * BL1 -> B01PTRCTL0[23:20] (0x0-0xF)
> +        */
> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       msk = (byte_lane & BIT0) ? (BIT23 | BIT22 | BIT21 | BIT20) :
> +               (BIT11 | BIT10 | BIT9 | BIT8);

Would this be better as:

(0xf << 20) | (0xf << 8)

It might be more meaningful also.

I really don't think these long strings of | are nice.

> +       temp = (byte_lane & BIT0) ? ((pi_count / HALF_CLK) << 20) :
> +               ((pi_count / HALF_CLK) << 8);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* Adjust PI_COUNT */
> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * BL0 -> B0DLLPICODER0[29:24] (0x00-0x3F)
> +        * BL1 -> B1DLLPICODER0[29:24] (0x00-0x3F)

lower case hex again

> +        */
> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET));
> +       msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
> +       temp = pi_count << 24;
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /*
> +        * DEADBAND
> +        * BL0/1 -> B01DBCTL1[08/11] (+1 select)
> +        * BL0/1 -> B01DBCTL1[02/05] (enable)
> +        */
> +       reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       msk = 0x00;
> +       temp = 0x00;
> +
> +       /* enable */
> +       msk |= (byte_lane & BIT0) ? (BIT5) : (BIT2);

Remove () around BIT5

> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
> +               temp |= msk;
> +
> +       /* select */
> +       msk |= (byte_lane & BIT0) ? (BIT11) : (BIT8);
> +       if (pi_count < EARLY_DB)
> +               temp |= msk;

These uses of BIT seem more useful to me.

Still it would be better to have #defines for the bits which actually
describe their meaning.

Maybe you don't know the meaning though...

> +
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* error check */
> +       if (pi_count > 0x3F) {
> +               training_message(channel, rank, byte_lane);
> +               mrc_post_code(0xEE, 0xE0);
> +       }
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return the current RCVEN delay on the given
> + * channel, rank, byte_lane as an absolute PI count.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +uint32_t get_rcvn(uint8_t channel, uint8_t rank, uint8_t byte_lane)
> +{
> +       uint32_t reg;
> +       uint32_t temp;
> +       uint32_t pi_count;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * BL0 -> B01PTRCTL0[11:08] (0x0-0xF)
> +        * BL1 -> B01PTRCTL0[23:20] (0x0-0xF)
> +        */
> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= (byte_lane & BIT0) ? (20) : (8);
> +       temp &= 0xF;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count = temp * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * BL0 -> B0DLLPICODER0[29:24] (0x00-0x3F)
> +        * BL1 -> B1DLLPICODER0[29:24] (0x00-0x3F)
> +        */
> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);

Please avoid () around simple constants. Put them in the #define/enum if needed.

> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET));
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= 24;
> +       temp &= 0x3F;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count += temp;
> +
> +       LEAVEFN();
> +
> +       return pi_count;
> +}
> +
> +/*
> + * This function will program the RDQS delays based on an absolute
> + * amount of PIs.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +void set_rdqs(uint8_t channel, uint8_t rank,
> +             uint8_t byte_lane, uint32_t pi_count)
> +{
> +       uint32_t reg;
> +       uint32_t msk;
> +       uint32_t temp;
> +
> +       ENTERFN();
> +       DPF(D_TRN, "Rdqs ch%d rnk%d ln%d : pi=%03X\n",
> +           channel, rank, byte_lane, pi_count);
> +
> +       /*
> +        * PI (1/128 MCLK)
> +        * BL0 -> B0RXDQSPICODE[06:00] (0x00-0x47)
> +        * BL1 -> B1RXDQSPICODE[06:00] (0x00-0x47)
> +        */
> +       reg = (byte_lane & BIT0) ? (B1RXDQSPICODE) : (B0RXDQSPICODE);
> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET));
> +       msk = (BIT6 | BIT5 | BIT4 | BIT3 | BIT2 | BIT1 | BIT0);
> +       temp = pi_count << 0;
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* error check (shouldn't go above 0x3F) */
> +       if (pi_count > 0x47) {
> +               training_message(channel, rank, byte_lane);
> +               mrc_post_code(0xEE, 0xE1);
> +       }
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return the current RDQS delay on the given
> + * channel, rank, byte_lane as an absolute PI count.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +uint32_t get_rdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane)
> +{
> +       uint32_t reg;
> +       uint32_t temp;
> +       uint32_t pi_count;
> +
> +       ENTERFN();
> +
> +       /*
> +        * PI (1/128 MCLK)
> +        * BL0 -> B0RXDQSPICODE[06:00] (0x00-0x47)
> +        * BL1 -> B1RXDQSPICODE[06:00] (0x00-0x47)
> +        */
> +       reg = (byte_lane & BIT0) ? (B1RXDQSPICODE) : (B0RXDQSPICODE);
> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET));
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +
> +       /* Adjust PI_COUNT */
> +       pi_count = temp & 0x7F;
> +
> +       LEAVEFN();
> +
> +       return pi_count;
> +}
> +
> +/*
> + * This function will program the WDQS delays based on an absolute
> + * amount of PIs.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +void set_wdqs(uint8_t channel, uint8_t rank,
> +             uint8_t byte_lane, uint32_t pi_count)
> +{
> +       uint32_t reg;
> +       uint32_t msk;
> +       uint32_t temp;
> +
> +       ENTERFN();
> +
> +       DPF(D_TRN, "Wdqs ch%d rnk%d ln%d : pi=%03X\n",
> +           channel, rank, byte_lane, pi_count);
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * BL0 -> B01PTRCTL0[07:04] (0x0-0xF)
> +        * BL1 -> B01PTRCTL0[19:16] (0x0-0xF)
> +        */
> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       msk = (byte_lane & BIT0) ? (BIT19 | BIT18 | BIT17 | BIT16) :
> +               (BIT7 | BIT6 | BIT5 | BIT4);
> +       temp = pi_count / HALF_CLK;
> +       temp <<= (byte_lane & BIT0) ? (16) : (4);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* Adjust PI_COUNT */
> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * BL0 -> B0DLLPICODER0[21:16] (0x00-0x3F)
> +        * BL1 -> B1DLLPICODER0[21:16] (0x00-0x3F)
> +        */
> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET));
> +       msk = (BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16);
> +       temp = pi_count << 16;
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /*
> +        * DEADBAND
> +        * BL0/1 -> B01DBCTL1[07/10] (+1 select)
> +        * BL0/1 -> B01DBCTL1[01/04] (enable)
> +        */
> +       reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       msk = 0x00;
> +       temp = 0x00;
> +
> +       /* enable */
> +       msk |= (byte_lane & BIT0) ? (BIT4) : (BIT1);
> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
> +               temp |= msk;
> +
> +       /* select */
> +       msk |= (byte_lane & BIT0) ? (BIT10) : (BIT7);
> +       if (pi_count < EARLY_DB)
> +               temp |= msk;
> +
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* error check */
> +       if (pi_count > 0x3F) {
> +               training_message(channel, rank, byte_lane);
> +               mrc_post_code(0xEE, 0xE2);
> +       }
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return the amount of WDQS delay on the given
> + * channel, rank, byte_lane as an absolute PI count.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +uint32_t get_wdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane)
> +{
> +       uint32_t reg;
> +       uint32_t temp;
> +       uint32_t pi_count;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * BL0 -> B01PTRCTL0[07:04] (0x0-0xF)
> +        * BL1 -> B01PTRCTL0[19:16] (0x0-0xF)
> +        */
> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= (byte_lane & BIT0) ? (16) : (4);
> +       temp &= 0xF;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count = (temp * HALF_CLK);
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * BL0 -> B0DLLPICODER0[21:16] (0x00-0x3F)
> +        * BL1 -> B1DLLPICODER0[21:16] (0x00-0x3F)
> +        */
> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET));
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= 16;
> +       temp &= 0x3F;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count += temp;
> +
> +       LEAVEFN();
> +
> +       return pi_count;
> +}
> +
> +/*
> + * This function will program the WDQ delays based on an absolute
> + * number of PIs.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +void set_wdq(uint8_t channel, uint8_t rank,
> +            uint8_t byte_lane, uint32_t pi_count)
> +{
> +       uint32_t reg;
> +       uint32_t msk;
> +       uint32_t temp;
> +
> +       ENTERFN();
> +
> +       DPF(D_TRN, "Wdq ch%d rnk%d ln%d : pi=%03X\n",
> +           channel, rank, byte_lane, pi_count);
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * BL0 -> B01PTRCTL0[03:00] (0x0-0xF)
> +        * BL1 -> B01PTRCTL0[15:12] (0x0-0xF)
> +        */
> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       msk = (byte_lane & BIT0) ? (BIT15 | BIT14 | BIT13 | BIT12) :
> +               (BIT3 | BIT2 | BIT1 | BIT0);
> +       temp = pi_count / HALF_CLK;
> +       temp <<= (byte_lane & BIT0) ? (12) : (0);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* Adjust PI_COUNT */
> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * BL0 -> B0DLLPICODER0[13:08] (0x00-0x3F)
> +        * BL1 -> B1DLLPICODER0[13:08] (0x00-0x3F)
> +        */
> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET));
> +       msk = (BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);
> +       temp = pi_count << 8;
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /*
> +        * DEADBAND
> +        * BL0/1 -> B01DBCTL1[06/09] (+1 select)
> +        * BL0/1 -> B01DBCTL1[00/03] (enable)
> +        */
> +       reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       msk = 0x00;
> +       temp = 0x00;
> +
> +       /* enable */
> +       msk |= (byte_lane & BIT0) ? (BIT3) : (BIT0);
> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
> +               temp |= msk;
> +
> +       /* select */
> +       msk |= (byte_lane & BIT0) ? (BIT9) : (BIT6);
> +       if (pi_count < EARLY_DB)
> +               temp |= msk;
> +
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* error check */
> +       if (pi_count > 0x3F) {
> +               training_message(channel, rank, byte_lane);
> +               mrc_post_code(0xEE, 0xE3);
> +       }
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return the amount of WDQ delay on the given
> + * channel, rank, byte_lane as an absolute PI count.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +uint32_t get_wdq(uint8_t channel, uint8_t rank, uint8_t byte_lane)
> +{
> +       uint32_t reg;
> +       uint32_t temp;
> +       uint32_t pi_count;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * BL0 -> B01PTRCTL0[03:00] (0x0-0xF)
> +        * BL1 -> B01PTRCTL0[15:12] (0x0-0xF)
> +        */
> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= (byte_lane & BIT0) ? (12) : (0);
> +       temp &= 0xF;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count = (temp * HALF_CLK);
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * BL0 -> B0DLLPICODER0[13:08] (0x00-0x3F)
> +        * BL1 -> B1DLLPICODER0[13:08] (0x00-0x3F)
> +        */
> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
> +               (channel * DDRIODQ_CH_OFFSET));
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= 8;
> +       temp &= 0x3F;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count += temp;
> +
> +       LEAVEFN();
> +
> +       return pi_count;
> +}
> +
> +/*
> + * This function will program the WCMD delays based on an absolute
> + * number of PIs.
> + */
> +void set_wcmd(uint8_t channel, uint32_t pi_count)
> +{
> +       uint32_t reg;
> +       uint32_t msk;
> +       uint32_t temp;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * CMDPTRREG[11:08] (0x0-0xF)
> +        */
> +       reg = CMDPTRREG + (channel * DDRIOCCC_CH_OFFSET);
> +       msk = (BIT11 | BIT10 | BIT9 | BIT8);
> +       temp = pi_count / HALF_CLK;
> +       temp <<= 8;
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* Adjust PI_COUNT */
> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * CMDDLLPICODER0[29:24] -> CMDSLICE R3 (unused)
> +        * CMDDLLPICODER0[21:16] -> CMDSLICE L3 (unused)
> +        * CMDDLLPICODER0[13:08] -> CMDSLICE R2 (unused)
> +        * CMDDLLPICODER0[05:00] -> CMDSLICE L2 (unused)
> +        * CMDDLLPICODER1[29:24] -> CMDSLICE R1 (unused)
> +        * CMDDLLPICODER1[21:16] -> CMDSLICE L1 (0x00-0x3F)
> +        * CMDDLLPICODER1[13:08] -> CMDSLICE R0 (unused)
> +        * CMDDLLPICODER1[05:00] -> CMDSLICE L0 (unused)
> +        */
> +       reg = CMDDLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
> +
> +       msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24 |
> +               BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16 |
> +               BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8 |
> +               BIT5 | BIT4 | BIT3 | BIT2 | BIT1 | BIT0);
> +
> +       temp = (pi_count << 24) | (pi_count << 16) |
> +               (pi_count << 8) | (pi_count << 0);
> +
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +       reg = CMDDLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);  /* PO */
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /*
> +        * DEADBAND
> +        * CMDCFGREG0[17] (+1 select)
> +        * CMDCFGREG0[16] (enable)
> +        */
> +       reg = CMDCFGREG0 + (channel * DDRIOCCC_CH_OFFSET);
> +       msk = 0x00;
> +       temp = 0x00;
> +
> +       /* enable */
> +       msk |= BIT16;
> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
> +               temp |= msk;
> +
> +       /* select */
> +       msk |= BIT17;
> +       if (pi_count < EARLY_DB)
> +               temp |= msk;
> +
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* error check */
> +       if (pi_count > 0x3F)
> +               mrc_post_code(0xEE, 0xE4);
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return the amount of WCMD delay on the given
> + * channel as an absolute PI count.
> + */
> +uint32_t get_wcmd(uint8_t channel)
> +{
> +       uint32_t reg;
> +       uint32_t temp;
> +       uint32_t pi_count;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * CMDPTRREG[11:08] (0x0-0xF)
> +        */
> +       reg = CMDPTRREG + (channel * DDRIOCCC_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= 8;
> +       temp &= 0xF;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count = temp * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * CMDDLLPICODER0[29:24] -> CMDSLICE R3 (unused)
> +        * CMDDLLPICODER0[21:16] -> CMDSLICE L3 (unused)
> +        * CMDDLLPICODER0[13:08] -> CMDSLICE R2 (unused)
> +        * CMDDLLPICODER0[05:00] -> CMDSLICE L2 (unused)
> +        * CMDDLLPICODER1[29:24] -> CMDSLICE R1 (unused)
> +        * CMDDLLPICODER1[21:16] -> CMDSLICE L1 (0x00-0x3F)
> +        * CMDDLLPICODER1[13:08] -> CMDSLICE R0 (unused)
> +        * CMDDLLPICODER1[05:00] -> CMDSLICE L0 (unused)
> +        */
> +       reg = CMDDLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= 16;
> +       temp &= 0x3F;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count += temp;
> +
> +       LEAVEFN();
> +
> +       return pi_count;
> +}
> +
> +/*
> + * This function will program the WCLK delays based on an absolute
> + * number of PIs.
> + */
> +void set_wclk(uint8_t channel, uint8_t rank, uint32_t pi_count)
> +{
> +       uint32_t reg;
> +       uint32_t msk;
> +       uint32_t temp;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * CCPTRREG[15:12] -> CLK1 (0x0-0xF)
> +        * CCPTRREG[11:08] -> CLK0 (0x0-0xF)
> +        */
> +       reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
> +       msk = (BIT15 | BIT14 | BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);

mask = 0xff00 is much better, isn't it?

> +       temp = ((pi_count / HALF_CLK) << 12) | ((pi_count / HALF_CLK) << 8);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* Adjust PI_COUNT */
> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * ECCB1DLLPICODER0[13:08] -> CLK0 (0x00-0x3F)
> +        * ECCB1DLLPICODER0[21:16] -> CLK1 (0x00-0x3F)
> +        */
> +       reg = (rank) ? (ECCB1DLLPICODER0) : (ECCB1DLLPICODER0);
> +       reg += (channel * DDRIOCCC_CH_OFFSET);
> +       msk = (BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16 |
> +               BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);

Ick!

> +       temp = (pi_count << 16) | (pi_count << 8);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +       reg = (rank) ? (ECCB1DLLPICODER1) : (ECCB1DLLPICODER1);

Remove all (), and below. Please fix globally.

> +       reg += (channel * DDRIOCCC_CH_OFFSET);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +       reg = (rank) ? (ECCB1DLLPICODER2) : (ECCB1DLLPICODER2);
> +       reg += (channel * DDRIOCCC_CH_OFFSET);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +       reg = (rank) ? (ECCB1DLLPICODER3) : (ECCB1DLLPICODER3);
> +       reg += (channel * DDRIOCCC_CH_OFFSET);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /*
> +        * DEADBAND
> +        * CCCFGREG1[11:08] (+1 select)
> +        * CCCFGREG1[03:00] (enable)
> +        */
> +       reg = CCCFGREG1 + (channel * DDRIOCCC_CH_OFFSET);
> +       msk = 0x00;
> +       temp = 0x00;
> +
> +       /* enable */
> +       msk |= (BIT3 | BIT2 | BIT1 | BIT0);
> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
> +               temp |= msk;
> +
> +       /* select */
> +       msk |= (BIT11 | BIT10 | BIT9 | BIT8);
> +       if (pi_count < EARLY_DB)
> +               temp |= msk;
> +
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* error check */
> +       if (pi_count > 0x3F)
> +               mrc_post_code(0xEE, 0xE5);
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return the amout of WCLK delay on the given
> + * channel, rank as an absolute PI count.
> + */
> +uint32_t get_wclk(uint8_t channel, uint8_t rank)
> +{
> +       uint32_t reg;
> +       uint32_t temp;
> +       uint32_t pi_count;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * CCPTRREG[15:12] -> CLK1 (0x0-0xF)
> +        * CCPTRREG[11:08] -> CLK0 (0x0-0xF)
> +        */
> +       reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= (rank) ? (12) : (8);
> +       temp &= 0xF;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count = temp * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * ECCB1DLLPICODER0[13:08] -> CLK0 (0x00-0x3F)
> +        * ECCB1DLLPICODER0[21:16] -> CLK1 (0x00-0x3F)
> +        */
> +       reg = (rank) ? (ECCB1DLLPICODER0) : (ECCB1DLLPICODER0);
> +       reg += (channel * DDRIOCCC_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= (rank) ? (16) : (8);
> +       temp &= 0x3F;
> +
> +       pi_count += temp;
> +
> +       LEAVEFN();
> +
> +       return pi_count;
> +}
> +
> +/*
> + * This function will program the WCTL delays based on an absolute
> + * number of PIs.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +void set_wctl(uint8_t channel, uint8_t rank, uint32_t pi_count)
> +{
> +       uint32_t reg;
> +       uint32_t msk;
> +       uint32_t temp;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * CCPTRREG[31:28] (0x0-0xF)
> +        * CCPTRREG[27:24] (0x0-0xF)
> +        */
> +       reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
> +       msk = (BIT31 | BIT30 | BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
> +       temp = ((pi_count / HALF_CLK) << 28) | ((pi_count / HALF_CLK) << 24);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* Adjust PI_COUNT */
> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
> +        * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
> +        */
> +       reg = ECCB1DLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);
> +       msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
> +       temp = (pi_count << 24);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +       reg = ECCB1DLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +       reg = ECCB1DLLPICODER2 + (channel * DDRIOCCC_CH_OFFSET);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +       reg = ECCB1DLLPICODER3 + (channel * DDRIOCCC_CH_OFFSET);
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /*
> +        * DEADBAND
> +        * CCCFGREG1[13:12] (+1 select)
> +        * CCCFGREG1[05:04] (enable)
> +        */
> +       reg = CCCFGREG1 + (channel * DDRIOCCC_CH_OFFSET);
> +       msk = 0x00;
> +       temp = 0x00;
> +
> +       /* enable */
> +       msk |= (BIT5 | BIT4);
> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
> +               temp |= msk;
> +
> +       /* select */
> +       msk |= (BIT13 | BIT12);
> +       if (pi_count < EARLY_DB)
> +               temp |= msk;
> +
> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
> +
> +       /* error check */
> +       if (pi_count > 0x3F)
> +               mrc_post_code(0xEE, 0xE6);
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return the amount of WCTL delay on the given
> + * channel, rank as an absolute PI count.
> + *
> + * (currently doesn't comprehend rank)
> + */
> +uint32_t get_wctl(uint8_t channel, uint8_t rank)
> +{
> +       uint32_t reg;
> +       uint32_t temp;
> +       uint32_t pi_count;
> +
> +       ENTERFN();
> +
> +       /*
> +        * RDPTR (1/2 MCLK, 64 PIs)
> +        * CCPTRREG[31:28] (0x0-0xF)
> +        * CCPTRREG[27:24] (0x0-0xF)
> +        */
> +       reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= 24;
> +       temp &= 0xF;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count = temp * HALF_CLK;
> +
> +       /*
> +        * PI (1/64 MCLK, 1 PIs)
> +        * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
> +        * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
> +        */
> +       reg = ECCB1DLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);
> +       temp = msg_port_alt_read(DDRPHY, reg);
> +       temp >>= 24;
> +       temp &= 0x3F;
> +
> +       /* Adjust PI_COUNT */
> +       pi_count += temp;
> +
> +       LEAVEFN();
> +
> +       return pi_count;
> +}
> +
> +/*
> + * This function will program the internal Vref setting in a given
> + * byte lane in a given channel.
> + */
> +void set_vref(uint8_t channel, uint8_t byte_lane, uint32_t setting)
> +{
> +       uint32_t reg = (byte_lane & 0x1) ? (B1VREFCTL) : (B0VREFCTL);
> +
> +       ENTERFN();
> +
> +       DPF(D_TRN, "Vref ch%d ln%d : val=%03X\n",
> +           channel, byte_lane, setting);
> +
> +       mrc_alt_write_mask(DDRPHY, (reg + (channel * DDRIODQ_CH_OFFSET) +
> +               ((byte_lane >> 1) * DDRIODQ_BL_OFFSET)),
> +               (vref_codes[setting] << 2),
> +               (BIT7 | BIT6 | BIT5 | BIT4 | BIT3 | BIT2));
> +
> +       /*
> +        * need to wait ~300ns for Vref to settle
> +        * (check that this is necessary)
> +        */
> +       delay_n(300);
> +
> +       /* ??? may need to clear pointers ??? */
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return the internal Vref setting for the given
> + * channel, byte_lane.
> + */
> +uint32_t get_vref(uint8_t channel, uint8_t byte_lane)
> +{
> +       uint8_t j;
> +       uint32_t ret_val = sizeof(vref_codes) / 2;
> +       uint32_t reg = (byte_lane & 0x1) ? (B1VREFCTL) : (B0VREFCTL);
> +       uint32_t temp;
> +
> +       ENTERFN();
> +
> +       temp = msg_port_alt_read(DDRPHY, (reg + (channel * DDRIODQ_CH_OFFSET) +
> +               ((byte_lane >> 1) * DDRIODQ_BL_OFFSET)));
> +       temp >>= 2;
> +       temp &= 0x3F;
> +
> +       for (j = 0; j < sizeof(vref_codes); j++) {
> +               if (vref_codes[j] == temp) {
> +                       ret_val = j;
> +                       break;
> +               }
> +       }
> +
> +       LEAVEFN();
> +
> +       return ret_val;
> +}
> +
> +/*
> + * This function will return a 32 bit address in the desired

32-bit

> + * channel and rank.
> + */
> +uint32_t get_addr(uint8_t channel, uint8_t rank)
> +{
> +       uint32_t offset = 0x02000000;   /* 32MB */
> +
> +       /* Begin product specific code */
> +       if (channel > 0) {
> +               DPF(D_ERROR, "ILLEGAL CHANNEL\n");
> +               DEAD_LOOP();
> +       }
> +
> +       if (rank > 1) {
> +               DPF(D_ERROR, "ILLEGAL RANK\n");
> +               DEAD_LOOP();
> +       }
> +
> +       /* use 256MB lowest density as per DRP == 0x0003 */
> +       offset += rank * (256 * 1024 * 1024);
> +
> +       return offset;
> +}
> +
> +/*
> + * This function will sample the DQTRAINSTS registers in the given
> + * channel/rank SAMPLE_SIZE times looking for a valid '0' or '1'.
> + *
> + * It will return an encoded DWORD in which each bit corresponds to

DWORD?

> + * the sampled value on the byte lane.
> + */
> +uint32_t sample_dqs(struct mrc_params *mrc_params, uint8_t channel,
> +                   uint8_t rank, bool rcvn)
> +{
> +       uint8_t j;      /* just a counter */
> +       uint8_t bl;     /* which BL in the module (always 2 per module) */
> +       uint8_t bl_grp; /* which BL module */
> +       /* byte lane divisor */

Maybe rename the variable so you can drop the comment?

> +       uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
> +       uint32_t msk[2];        /* BLx in module */
> +       /* DQTRAINSTS register contents for each sample */
> +       uint32_t sampled_val[SAMPLE_SIZE];
> +       uint32_t num_0s;        /* tracks the number of '0' samples */
> +       uint32_t num_1s;        /* tracks the number of '1' samples */
> +       uint32_t ret_val = 0x00;        /* assume all '0' samples */
> +       uint32_t address = get_addr(channel, rank);
> +
> +       /* initialise msk[] */
> +       msk[0] = (rcvn) ? (BIT1) : (BIT9);      /* BL0 */
> +       msk[1] = (rcvn) ? (BIT0) : (BIT8);      /* BL1 */
> +
> +       /* cycle through each byte lane group */
> +       for (bl_grp = 0; bl_grp < (NUM_BYTE_LANES / bl_divisor) / 2; bl_grp++) {
> +               /* take SAMPLE_SIZE samples */
> +               for (j = 0; j < SAMPLE_SIZE; j++) {
> +                       hte_mem_op(address, mrc_params->first_run,
> +                                  rcvn ? 0 : 1);
> +                       mrc_params->first_run = 0;
> +
> +                       /*
> +                        * record the contents of the proper
> +                        * DQTRAINSTS register
> +                        */
> +                       sampled_val[j] = msg_port_alt_read(DDRPHY,
> +                               (DQTRAINSTS +
> +                               (bl_grp * DDRIODQ_BL_OFFSET) +
> +                               (channel * DDRIODQ_CH_OFFSET)));
> +               }
> +
> +               /*
> +                * look for a majority value (SAMPLE_SIZE / 2) + 1
> +                * on the byte lane and set that value in the corresponding
> +                * ret_val bit
> +                */
> +               for (bl = 0; bl < 2; bl++) {
> +                       num_0s = 0x00;  /* reset '0' tracker for byte lane */
> +                       num_1s = 0x00;  /* reset '1' tracker for byte lane */
> +                       for (j = 0; j < SAMPLE_SIZE; j++) {
> +                               if (sampled_val[j] & msk[bl])
> +                                       num_1s++;
> +                               else
> +                                       num_0s++;
> +                       }
> +               if (num_1s > num_0s)
> +                       ret_val |= (1 << (bl + (bl_grp * 2)));
> +               }
> +       }
> +
> +       /*
> +        * "ret_val.0" contains the status of BL0
> +        * "ret_val.1" contains the status of BL1
> +        * "ret_val.2" contains the status of BL2
> +        * etc.

This comment should go in @return in the function comment.

> +        */
> +       return ret_val;
> +}
> +
> +/* This function will find the rising edge transition on RCVN or WDQS */
> +void find_rising_edge(struct mrc_params *mrc_params, uint32_t delay[],
> +                     uint8_t channel, uint8_t rank, bool rcvn)
> +{
> +       bool all_edges_found;   /* determines stop condition */
> +       bool direction[NUM_BYTE_LANES]; /* direction indicator */
> +       uint8_t sample; /* sample counter */
> +       uint8_t bl;     /* byte lane counter */
> +       /* byte lane divisor */
> +       uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
> +       uint32_t sample_result[SAMPLE_CNT];     /* results of sample_dqs() */
> +       uint32_t temp;
> +       uint32_t transition_pattern;
> +
> +       ENTERFN();
> +
> +       /* select hte and request initial configuration */
> +       select_hte();
> +       mrc_params->first_run = 1;
> +
> +       /* Take 3 sample points (T1,T2,T3) to obtain a transition pattern */
> +       for (sample = 0; sample < SAMPLE_CNT; sample++) {
> +               /* program the desired delays for sample */
> +               for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
> +                       /* increase sample delay by 26 PI (0.2 CLK) */
> +                       if (rcvn) {
> +                               set_rcvn(channel, rank, bl,
> +                                        delay[bl] + (sample * SAMPLE_DLY));
> +                       } else {
> +                               set_wdqs(channel, rank, bl,
> +                                        delay[bl] + (sample * SAMPLE_DLY));
> +                       }
> +               }
> +
> +               /* take samples (Tsample_i) */
> +               sample_result[sample] = sample_dqs(mrc_params,
> +                       channel, rank, rcvn);
> +
> +               DPF(D_TRN,
> +                   "Find rising edge %s ch%d rnk%d: #%d dly=%d dqs=%02X\n",
> +                   (rcvn ? "RCVN" : "WDQS"), channel, rank, sample,
> +                   sample * SAMPLE_DLY, sample_result[sample]);
> +       }
> +
> +       /*
> +        * This pattern will help determine where we landed and ultimately
> +        * how to place RCVEN/WDQS.
> +        */
> +       for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
> +               /* build transition_pattern (MSB is 1st sample) */
> +               transition_pattern = 0;
> +               for (sample = 0; sample < SAMPLE_CNT; sample++) {
> +                       transition_pattern |=
> +                               ((sample_result[sample] & (1 << bl)) >> bl) <<
> +                               (SAMPLE_CNT - 1 - sample);
> +               }
> +
> +               DPF(D_TRN, "=== transition pattern %d\n", transition_pattern);
> +
> +               /*
> +                * set up to look for rising edge based on
> +                * transition_pattern
> +                */
> +               switch (transition_pattern) {
> +               case 0: /* sampled 0->0->0 */
> +                       /* move forward from T3 looking for 0->1 */
> +                       delay[bl] += 2 * SAMPLE_DLY;
> +                       direction[bl] = FORWARD;
> +                       break;
> +               case 1: /* sampled 0->0->1 */
> +               case 5: /* sampled 1->0->1 (bad duty cycle) *HSD#237503* */
> +                       /* move forward from T2 looking for 0->1 */
> +                       delay[bl] += 1 * SAMPLE_DLY;
> +                       direction[bl] = FORWARD;
> +                       break;
> +               case 2: /* sampled 0->1->0 (bad duty cycle) *HSD#237503* */
> +               case 3: /* sampled 0->1->1 */
> +                       /* move forward from T1 looking for 0->1 */
> +                       delay[bl] += 0 * SAMPLE_DLY;
> +                       direction[bl] = FORWARD;
> +                       break;
> +               case 4: /* sampled 1->0->0 (assumes BL8, HSD#234975) */
> +                       /* move forward from T3 looking for 0->1 */
> +                       delay[bl] += 2 * SAMPLE_DLY;
> +                       direction[bl] = FORWARD;
> +                       break;
> +               case 6: /* sampled 1->1->0 */
> +               case 7: /* sampled 1->1->1 */
> +                       /* move backward from T1 looking for 1->0 */
> +                       delay[bl] += 0 * SAMPLE_DLY;
> +                       direction[bl] = BACKWARD;
> +                       break;
> +               default:
> +                       mrc_post_code(0xEE, 0xEE);
> +                       break;
> +               }
> +
> +               /* program delays */
> +               if (rcvn)
> +                       set_rcvn(channel, rank, bl, delay[bl]);
> +               else
> +                       set_wdqs(channel, rank, bl, delay[bl]);
> +       }
> +
> +       /*
> +        * Based on the observed transition pattern on the byte lane,
> +        * begin looking for a rising edge with single PI granularity.
> +        */
> +       do {
> +               all_edges_found = true; /* assume all byte lanes passed */
> +               /* take a sample */
> +               temp = sample_dqs(mrc_params, channel, rank, rcvn);
> +               /* check all each byte lane for proper edge */
> +               for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
> +                       if (temp & (1 << bl)) {
> +                               /* sampled "1" */
> +                               if (direction[bl] == BACKWARD) {
> +                                       /*
> +                                        * keep looking for edge
> +                                        * on this byte lane
> +                                        */
> +                                       all_edges_found = false;
> +                                       delay[bl] -= 1;
> +                                       if (rcvn) {
> +                                               set_rcvn(channel, rank,
> +                                                        bl, delay[bl]);
> +                                       } else {
> +                                               set_wdqs(channel, rank,
> +                                                        bl, delay[bl]);
> +                                       }
> +                               }
> +                       } else {
> +                               /* sampled "0" */
> +                               if (direction[bl] == FORWARD) {
> +                                       /*
> +                                        * keep looking for edge
> +                                        * on this byte lane
> +                                        */
> +                                       all_edges_found = false;
> +                                       delay[bl] += 1;
> +                                       if (rcvn) {
> +                                               set_rcvn(channel, rank,
> +                                                        bl, delay[bl]);
> +                                       } else {
> +                                               set_wdqs(channel, rank,
> +                                                        bl, delay[bl]);
> +                                       }
> +                               }
> +                       }
> +               }
> +       } while (!all_edges_found);
> +
> +       /* restore DDR idle state */
> +       dram_init_command(DCMD_PREA(rank));
> +
> +       DPF(D_TRN, "Delay %03X %03X %03X %03X\n",
> +           delay[0], delay[1], delay[2], delay[3]);
> +
> +       LEAVEFN();
> +}
> +
> +/*
> + * This function will return a 32 bit mask that will be used to
> + * check for byte lane failures.
> + */
> +uint32_t byte_lane_mask(struct mrc_params *mrc_params)
> +{
> +       uint32_t j;
> +       uint32_t ret_val = 0x00;
> +
> +       /*
> +        * set ret_val based on NUM_BYTE_LANES such that you will check
> +        * only BL0 in result
> +        *
> +        * (each bit in result represents a byte lane)
> +        */
> +       for (j = 0; j < MAX_BYTE_LANES; j += NUM_BYTE_LANES)
> +               ret_val |= (1 << ((j / NUM_BYTE_LANES) * NUM_BYTE_LANES));
> +
> +       /*
> +        * HSD#235037
> +        * need to adjust the mask for 16-bit mode
> +        */
> +       if (mrc_params->channel_width == X16)
> +               ret_val |= (ret_val << 2);
> +
> +       return ret_val;
> +}
> +
> +/*
> + * Check memory executing simple write/read/verify at the specified address.
> + *
> + * Bits in the result indicate failure on specific byte lane.
> + */
> +uint32_t check_rw_coarse(struct mrc_params *mrc_params, uint32_t address)
> +{
> +       uint32_t result = 0;
> +       uint8_t first_run = 0;
> +
> +       if (mrc_params->hte_setup) {
> +               mrc_params->hte_setup = 0;
> +               first_run = 1;
> +               select_hte();
> +       }
> +
> +       result = hte_basic_write_read(mrc_params, address,
> +                                     first_run, WRITE_TRAIN);

reformat to 80cols

> +
> +       DPF(D_TRN, "check_rw_coarse result is %x\n", result);
> +
> +       return result;
> +}
> +
> +/*
> + * Check memory executing write/read/verify of many data patterns
> + * at the specified address. Bits in the result indicate failure
> + * on specific byte lane.
> + */
> +uint32_t check_bls_ex(struct mrc_params *mrc_params, uint32_t address)
> +{
> +       uint32_t result;
> +       uint8_t first_run = 0;
> +
> +       if (mrc_params->hte_setup) {
> +               mrc_params->hte_setup = 0;
> +               first_run = 1;
> +               select_hte();
> +       }
> +
> +       result = hte_write_stress_bit_lanes(mrc_params, address, first_run);
> +
> +       DPF(D_TRN, "check_bls_ex result is %x\n", result);
> +
> +       return result;
> +}
> +
> +/*
> + * 32-bit LFSR with characteristic polynomial: X^32 + X^22 +X^2 + X^1
> + *
> + * The function takes pointer to previous 32 bit value and
> + * modifies it to next value.
> + */
> +void lfsr32(uint32_t *lfsr_ptr)
> +{
> +       uint32_t bit;
> +       uint32_t lfsr;
> +       int i;
> +
> +       lfsr = *lfsr_ptr;
> +
> +       for (i = 0; i < 32; i++) {
> +               bit = 1 ^ (lfsr & BIT0);
> +               bit = bit ^ ((lfsr & BIT1) >> 1);
> +               bit = bit ^ ((lfsr & BIT2) >> 2);
> +               bit = bit ^ ((lfsr & BIT22) >> 22);
> +
> +               lfsr = ((lfsr >> 1) | (bit << 31));
> +       }
> +
> +       *lfsr_ptr = lfsr;
> +}
> +
> +/* Clear the pointers in a given byte lane in a given channel */
> +void clear_pointers(void)
> +{
> +       uint8_t channel;
> +       uint8_t bl;
> +
> +       ENTERFN();
> +
> +       for (channel = 0; channel < NUM_CHANNELS; channel++) {
> +               for (bl = 0; bl < NUM_BYTE_LANES; bl++) {
> +                       mrc_alt_write_mask(DDRPHY,
> +                                          (B01PTRCTL1 +
> +                                          (channel * DDRIODQ_CH_OFFSET) +
> +                                          ((bl >> 1) * DDRIODQ_BL_OFFSET)),
> +                                          ~BIT8, BIT8);
> +
> +                       mrc_alt_write_mask(DDRPHY,
> +                                          (B01PTRCTL1 +
> +                                          (channel * DDRIODQ_CH_OFFSET) +
> +                                          ((bl >> 1) * DDRIODQ_BL_OFFSET)),
> +                                          BIT8, BIT8);
> +               }
> +       }
> +
> +       LEAVEFN();
> +}
> +
> +void print_timings(struct mrc_params *mrc_params)
> +{
> +       uint8_t algo;
> +       uint8_t channel;
> +       uint8_t rank;
> +       uint8_t bl;
> +       uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
> +
> +       DPF(D_INFO, "\n---------------------------");
> +       DPF(D_INFO, "\nALGO[CH:RK] BL0 BL1 BL2 BL3");
> +       DPF(D_INFO, "\n===========================");
> +
> +       for (algo = 0; algo < MAX_ALGOS; algo++) {
> +               for (channel = 0; channel < NUM_CHANNELS; channel++) {
> +                       if (mrc_params->channel_enables & (1 << channel)) {
> +                               for (rank = 0; rank < NUM_RANKS; rank++) {

Can we put this block in its own function to fix the over-indenting?

> +                                       if (mrc_params->rank_enables &
> +                                               (1 << rank)) {
> +                                               switch (algo) {
> +                                               case RCVN:
> +                                                       DPF(D_INFO,
> +                                                           "\nRCVN[%02d:%02d]",
> +                                                           channel, rank);
> +                                                       break;
> +                                               case WDQS:
> +                                                       DPF(D_INFO,
> +                                                           "\nWDQS[%02d:%02d]",
> +                                                           channel, rank);
> +                                                       break;
> +                                               case WDQX:
> +                                                       DPF(D_INFO,
> +                                                           "\nWDQx[%02d:%02d]",
> +                                                           channel, rank);
> +                                                       break;
> +                                               case RDQS:
> +                                                       DPF(D_INFO,
> +                                                           "\nRDQS[%02d:%02d]",
> +                                                           channel, rank);
> +                                                       break;
> +                                               case VREF:
> +                                                       DPF(D_INFO,
> +                                                           "\nVREF[%02d:%02d]",
> +                                                           channel, rank);
> +                                                       break;
> +                                               case WCMD:
> +                                                       DPF(D_INFO,
> +                                                           "\nWCMD[%02d:%02d]",
> +                                                           channel, rank);
> +                                                       break;
> +                                               case WCTL:
> +                                                       DPF(D_INFO,
> +                                                           "\nWCTL[%02d:%02d]",
> +                                                           channel, rank);
> +                                                       break;
> +                                               case WCLK:
> +                                                       DPF(D_INFO,
> +                                                           "\nWCLK[%02d:%02d]",
> +                                                           channel, rank);
> +                                                       break;
> +                                               default:
> +                                                       break;
> +                                               }
> +
> +                                               for (bl = 0;
> +                                                    bl < (NUM_BYTE_LANES / bl_divisor);
> +                                                    bl++) {
> +                                                       switch (algo) {
> +                                                       case RCVN:
> +                                                               DPF(D_INFO,
> +                                                                   " %03d",
> +                                                                   get_rcvn(channel, rank, bl));
> +                                                               break;
> +                                                       case WDQS:
> +                                                               DPF(D_INFO,
> +                                                                   " %03d",
> +                                                                   get_wdqs(channel, rank, bl));
> +                                                               break;
> +                                                       case WDQX:
> +                                                               DPF(D_INFO,
> +                                                                   " %03d",
> +                                                                   get_wdq(channel, rank, bl));
> +                                                               break;
> +                                                       case RDQS:
> +                                                               DPF(D_INFO,
> +                                                                   " %03d",
> +                                                                   get_rdqs(channel, rank, bl));
> +                                                               break;
> +                                                       case VREF:
> +                                                               DPF(D_INFO,
> +                                                                   " %03d",
> +                                                                   get_vref(channel, bl));
> +                                                               break;
> +                                                       case WCMD:
> +                                                               DPF(D_INFO,
> +                                                                   " %03d",
> +                                                                   get_wcmd(channel));
> +                                                               break;
> +                                                       case WCTL:
> +                                                               DPF(D_INFO,
> +                                                                   " %03d",
> +                                                                   get_wctl(channel, rank));
> +                                                               break;
> +                                                       case WCLK:
> +                                                               DPF(D_INFO,
> +                                                                   " %03d",
> +                                                                   get_wclk(channel, rank));
> +                                                               break;
> +                                                       default:
> +                                                               break;
> +                                                       }
> +                                               }
> +                                       }
> +                               }
> +                       }
> +               }
> +       }
> +
> +       DPF(D_INFO, "\n---------------------------");
> +       DPF(D_INFO, "\n");
> +}
> diff --git a/arch/x86/cpu/quark/mrc_util.h b/arch/x86/cpu/quark/mrc_util.h
> new file mode 100644
> index 0000000..edbe219
> --- /dev/null
> +++ b/arch/x86/cpu/quark/mrc_util.h
> @@ -0,0 +1,153 @@
> +/*
> + * Copyright (C) 2013, Intel Corporation
> + * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
> + *
> + * Ported from Intel released Quark UEFI BIOS
> + * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
> + *
> + * SPDX-License-Identifier:    Intel
> + */
> +
> +#ifndef _MRC_UTIL_H_
> +#define _MRC_UTIL_H_
> +
> +/* Turn on this macro to enable MRC debugging output */
> +#undef  MRC_DEBUG
> +
> +/* MRC Debug Support */
> +#define DPF            debug_cond
> +
> +/* debug print type */
> +
> +#ifdef MRC_DEBUG
> +#define D_ERROR                0x0001
> +#define D_INFO         0x0002
> +#define D_REGRD                0x0004
> +#define D_REGWR                0x0008
> +#define D_FCALL                0x0010
> +#define D_TRN          0x0020
> +#define D_TIME         0x0040
> +#else
> +#define D_ERROR                0
> +#define D_INFO         0
> +#define D_REGRD                0
> +#define D_REGWR                0
> +#define D_FCALL                0
> +#define D_TRN          0
> +#define D_TIME         0
> +#endif
> +
> +#define ENTERFN(...)   debug_cond(D_FCALL, "<%s>\n", __func__)
> +#define LEAVEFN(...)   debug_cond(D_FCALL, "</%s>\n", __func__)
> +#define REPORTFN(...)  debug_cond(D_FCALL, "<%s/>\n", __func__)
> +
> +/* Generic Register Bits */
> +#define BIT0           0x00000001
> +#define BIT1           0x00000002
> +#define BIT2           0x00000004
> +#define BIT3           0x00000008
> +#define BIT4           0x00000010
> +#define BIT5           0x00000020
> +#define BIT6           0x00000040
> +#define BIT7           0x00000080
> +#define BIT8           0x00000100
> +#define BIT9           0x00000200
> +#define BIT10          0x00000400
> +#define BIT11          0x00000800
> +#define BIT12          0x00001000
> +#define BIT13          0x00002000
> +#define BIT14          0x00004000
> +#define BIT15          0x00008000
> +#define BIT16          0x00010000
> +#define BIT17          0x00020000
> +#define BIT18          0x00040000
> +#define BIT19          0x00080000
> +#define BIT20          0x00100000
> +#define BIT21          0x00200000
> +#define BIT22          0x00400000
> +#define BIT23          0x00800000
> +#define BIT24          0x01000000
> +#define BIT25          0x02000000
> +#define BIT26          0x04000000
> +#define BIT27          0x08000000
> +#define BIT28          0x10000000
> +#define BIT29          0x20000000
> +#define BIT30          0x40000000
> +#define BIT31          0x80000000
> +
> +/* Message Bus Port */
> +#define MEM_CTLR       0x01
> +#define HOST_BRIDGE    0x03
> +#define MEM_MGR                0x05
> +#define HTE            0x11
> +#define DDRPHY         0x12
> +
> +/* number of sample points */
> +#define SAMPLE_CNT     3
> +/* number of PIs to increment per sample */
> +#define SAMPLE_DLY     26
> +
> +enum {
> +       /* indicates to decrease delays when looking for edge */
> +       BACKWARD,
> +       /* indicates to increase delays when looking for edge */
> +       FORWARD
> +};
> +
> +enum {
> +       RCVN,
> +       WDQS,
> +       WDQX,
> +       RDQS,
> +       VREF,
> +       WCMD,
> +       WCTL,
> +       WCLK,
> +       MAX_ALGOS,
> +};
> +
> +void mrc_write_mask(u32 unit, u32 addr, u32 data, u32 mask);
> +void mrc_alt_write_mask(u32 unit, u32 addr, u32 data, u32 mask);
> +void mrc_post_code(uint8_t major, uint8_t minor);
> +void delay_n(uint32_t ns);
> +void delay_u(uint32_t ms);
> +void select_mem_mgr(void);
> +void select_hte(void);
> +void dram_init_command(uint32_t data);
> +void dram_wake_command(void);
> +void training_message(uint8_t channel, uint8_t rank, uint8_t byte_lane);
> +
> +void set_rcvn(uint8_t channel, uint8_t rank,
> +             uint8_t byte_lane, uint32_t pi_count);
> +uint32_t get_rcvn(uint8_t channel, uint8_t rank, uint8_t byte_lane);
> +void set_rdqs(uint8_t channel, uint8_t rank,
> +             uint8_t byte_lane, uint32_t pi_count);
> +uint32_t get_rdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane);
> +void set_wdqs(uint8_t channel, uint8_t rank,
> +             uint8_t byte_lane, uint32_t pi_count);
> +uint32_t get_wdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane);
> +void set_wdq(uint8_t channel, uint8_t rank,
> +            uint8_t byte_lane, uint32_t pi_count);
> +uint32_t get_wdq(uint8_t channel, uint8_t rank, uint8_t byte_lane);
> +void set_wcmd(uint8_t channel, uint32_t pi_count);
> +uint32_t get_wcmd(uint8_t channel);
> +void set_wclk(uint8_t channel, uint8_t rank, uint32_t pi_count);
> +uint32_t get_wclk(uint8_t channel, uint8_t rank);
> +void set_wctl(uint8_t channel, uint8_t rank, uint32_t pi_count);
> +uint32_t get_wctl(uint8_t channel, uint8_t rank);
> +void set_vref(uint8_t channel, uint8_t byte_lane, uint32_t setting);
> +uint32_t get_vref(uint8_t channel, uint8_t byte_lane);
> +
> +uint32_t get_addr(uint8_t channel, uint8_t rank);
> +uint32_t sample_dqs(struct mrc_params *mrc_params, uint8_t channel,
> +                   uint8_t rank, bool rcvn);
> +void find_rising_edge(struct mrc_params *mrc_params, uint32_t delay[],
> +                     uint8_t channel, uint8_t rank, bool rcvn);
> +uint32_t byte_lane_mask(struct mrc_params *mrc_params);
> +uint32_t check_rw_coarse(struct mrc_params *mrc_params, uint32_t address);
> +uint32_t check_bls_ex(struct mrc_params *mrc_params, uint32_t address);
> +void lfsr32(uint32_t *lfsr_ptr);
> +void clear_pointers(void);
> +void print_timings(struct mrc_params *mrc_params);

If these are all truly exported, can we please put the function
comments here in the header file?

> +
> +#endif /* _MRC_UTIL_H_ */
> --
> 1.8.2.1
>

Regards,
Simon
Bin Meng Feb. 5, 2015, 2:25 p.m. UTC | #2
Hi Simon,

On Thu, Feb 5, 2015 at 12:24 AM, Simon Glass <sjg@chromium.org> wrote:
> Hi Bin,
>
> On 3 February 2015 at 04:45, Bin Meng <bmeng.cn@gmail.com> wrote:
>> Add various utility codes needed for Quark MRC.
>>
>> Signed-off-by: Bin Meng <bmeng.cn@gmail.com>
>>
>> ---
>> There are 12 checkpatch warnings in this patch, which are:
>>
>> warning: arch/x86/cpu/quark/mrc_util.c,1446: Too many leading tabs - consider code refactoring
>> warning: arch/x86/cpu/quark/mrc_util.c,1450: line over 80 characters
>> ...
>>
>> Fixing 'Too many leading tabs ...' will be very dangerous, as I don't have
>> all the details on how Intel's MRC codes are actually written to play with
>> the hardware. Trying to refactor them may lead to a non-working MRC codes.
>> For the 'line over 80 characters' issue, we have to leave them as is now
>> due to the 'Too many leading tabs ...', sigh.
>
> The code looks fine for the most part - I only have nits.
>
> I'm not keen on BIT though. See my comments and what improvements you
> can make. It would be great to drop BIT.

I found it is hard to replace BIT to something meaningful, as lots of
registers are undocumented.

> Re the debug macros, I suppose they are OK to keep. U-Boot doesn't
> have the concept of debug() for different categories or levels of
> verbosity.

Yep, maybe we can enhance U-Boot's debug() in the future.

>>
>>  arch/x86/cpu/quark/hte.c      |  398 +++++++++++
>>  arch/x86/cpu/quark/hte.h      |   44 ++
>>  arch/x86/cpu/quark/mrc_util.c | 1499 +++++++++++++++++++++++++++++++++++++++++
>>  arch/x86/cpu/quark/mrc_util.h |  153 +++++
>>  4 files changed, 2094 insertions(+)
>>  create mode 100644 arch/x86/cpu/quark/hte.c
>>  create mode 100644 arch/x86/cpu/quark/hte.h
>>  create mode 100644 arch/x86/cpu/quark/mrc_util.c
>>  create mode 100644 arch/x86/cpu/quark/mrc_util.h
>>
>> diff --git a/arch/x86/cpu/quark/hte.c b/arch/x86/cpu/quark/hte.c
>> new file mode 100644
>> index 0000000..d813c9c
>> --- /dev/null
>> +++ b/arch/x86/cpu/quark/hte.c
>> @@ -0,0 +1,398 @@
>> +/*
>> + * Copyright (C) 2013, Intel Corporation
>> + * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
>> + *
>> + * Ported from Intel released Quark UEFI BIOS
>> + * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
>
> Remove training slash?

Fixed globally.

>> + *
>> + * SPDX-License-Identifier:    Intel
>> + */
>> +
>> +#include <common.h>
>> +#include <asm/arch/mrc.h>
>> +#include <asm/arch/msg_port.h>
>> +#include "mrc_util.h"
>> +#include "hte.h"
>> +
>> +/**
>> + * This function enables HTE to detect all possible errors for
>
> s/This function// globally
>
> I'd suggest present tense, like "enable HTE to detect all possible errors for...

Fixed globally.

>> + * the given training parameters (per-bit or full byte lane).
>> + */
>> +static void hte_enable_all_errors(void)
>> +{
>> +       msg_port_write(HTE, 0x000200A2, 0xFFFFFFFF);
>> +       msg_port_write(HTE, 0x000200A3, 0x000000FF);
>> +       msg_port_write(HTE, 0x000200A4, 0x00000000);
>
> Lower case hex again.

All of Intel's MRC codes are using upper case, to keep it consistent
(or maybe I don't want to replace every place to lower case ..) I
chose not to fix these lower case, and just leave them as is now.

>> +}
>> +
>> +/**
>> + * This function goes and reads the HTE register in order to find any error
>> + *
>> + * @return: The errors detected in the HTE status register
>> + */
>> +static u32 hte_check_errors(void)
>> +{
>> +       return msg_port_read(HTE, 0x000200A7);
>> +}
>> +
>> +/**
>> + * This function waits until HTE finishes
>> + */
>> +static void hte_wait_for_complete(void)
>> +{
>> +       u32 tmp;
>> +
>> +       ENTERFN();
>> +
>> +       do {} while ((msg_port_read(HTE, 0x00020012) & BIT30) != 0);
>> +
>> +       tmp = msg_port_read(HTE, 0x00020011);
>> +       tmp |= BIT9;
>> +       tmp &= ~(BIT12 | BIT13);
>> +       msg_port_write(HTE, 0x00020011, tmp);
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/**
>> + * This function clears registers related with errors in the HTE
>> + */
>> +static void hte_clear_error_regs(void)
>> +{
>> +       u32 tmp;
>> +
>> +       /*
>> +        * Clear all HTE errors and enable error checking
>> +        * for burst and chunk.
>> +        */
>> +       tmp = msg_port_read(HTE, 0x000200A1);
>> +       tmp |= BIT8;
>> +       msg_port_write(HTE, 0x000200A1, tmp);
>> +}
>> +
>> +/**
>> + * This function executes basic single cache line memory write/read/verify
>> + * test using simple constant pattern, different for READ_RAIN and
>
> REAS_TRAIN?

Fixed globally.

>> + * WRITE_TRAIN modes.
>> + *
>> + * See hte_basic_write_read() which is external visible wrapper.
>
> the external (fix below also)
>
>> + *
>> + * @mrc_params: host struture for all MRC global data
>> + * @addr: memory adress being tested (must hit specific channel/rank)
>> + * @first_run: if set then hte registers are configured, otherwise it is
>
> the hte?

Changed to 'the HTE'

>> + *             assumed configuration is done and just re-run the test
>
> assumed configuration is done the we just re-run the test
> ,
> (fix below also)

Fixed.

>> + * @mode: READ_TRAIN or WRITE_TRAIN (the difference is in the pattern)
>> + *
>> + * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
>> + */
>> +static u16 hte_basic_data_cmp(struct mrc_params *mrc_params, u32 addr,
>> +                             u8 first_run, u8 mode)
>> +{
>> +       u32 pattern;
>> +       u32 offset;
>> +
>> +       if (first_run) {
>> +               msg_port_write(HTE, 0x00020020, 0x01B10021);
>> +               msg_port_write(HTE, 0x00020021, 0x06000000);
>> +               msg_port_write(HTE, 0x00020022, addr >> 6);
>> +               msg_port_write(HTE, 0x00020062, 0x00800015);
>> +               msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
>> +               msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
>> +               msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
>> +               msg_port_write(HTE, 0x00020061, 0x00030008);
>> +
>> +               if (mode == WRITE_TRAIN)
>> +                       pattern = 0xC33C0000;
>> +               else /* READ_TRAIN */
>> +                       pattern = 0xAA5555AA;
>> +
>> +               for (offset = 0x80; offset <= 0x8F; offset++)
>> +                       msg_port_write(HTE, offset, pattern);
>> +       }
>> +
>> +       msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
>> +       msg_port_write(HTE, 0x00020011, 0x00011000);
>> +       msg_port_write(HTE, 0x00020011, 0x00011100);
>> +
>> +       hte_wait_for_complete();
>> +
>> +       /*
>> +        * Return bits 15:8 of HTE_CH0_ERR_XSTAT to check for
>> +        * any bytelane errors.
>> +        */
>> +       return (hte_check_errors() >> 8) & 0xFF;
>> +}
>> +
>> +/**
>> + * This function examines single cache line memory with write/read/verify
>> + * test using multiple data patterns (victim-aggressor algorithm).
>> + *
>> + * See hte_write_stress_bit_lanes() which is external visible wrapper.
>> + *
>> + * @mrc_params: host struture for all MRC global data
>
> structure

Fixed globally.

>> + * @addr: memory adress being tested (must hit specific channel/rank)
>> + * @loop_cnt: number of test iterations
>> + * @seed_victim: victim data pattern seed
>> + * @seed_aggressor: aggressor data pattern seed
>> + * @victim_bit: should be 0 as auto rotate feature is in use
>
> auto-rotate

Fixed.

>> + * @first_run: if set then hte registers are configured, otherwise it is
>
> Actually I wonder if HTE would be better than hte, which looks like a
> 'the' typo, particularly if you leave out 'the'. Also can you please
> comment at the top of the file (first function) what HTE stands for)?

Changed to 'the HTE'. Intel doc only mentions MTE (Memory Training
Engine), and its registers are undocumented. I am not sure if this HTE
means MTE in Intel's doc. So I don't add any comment for HTE.

>> + *             assumed configuration is done and just re-run the test
>> + *
>> + * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
>> + */
>> +static u16 hte_rw_data_cmp(struct mrc_params *mrc_params, u32 addr,
>> +                          u8 loop_cnt, u32 seed_victim, u32 seed_aggressor,
>> +                          u8 victim_bit, u8 first_run)
>> +{
>> +       u32 offset;
>> +       u32 tmp;
>> +
>> +       if (first_run) {
>> +               msg_port_write(HTE, 0x00020020, 0x00910024);
>> +               msg_port_write(HTE, 0x00020023, 0x00810024);
>> +               msg_port_write(HTE, 0x00020021, 0x06070000);
>> +               msg_port_write(HTE, 0x00020024, 0x06070000);
>> +               msg_port_write(HTE, 0x00020022, addr >> 6);
>> +               msg_port_write(HTE, 0x00020025, addr >> 6);
>> +               msg_port_write(HTE, 0x00020062, 0x0000002A);
>> +               msg_port_write(HTE, 0x00020063, seed_victim);
>> +               msg_port_write(HTE, 0x00020064, seed_aggressor);
>> +               msg_port_write(HTE, 0x00020065, seed_victim);
>> +
>> +               /*
>> +                * Write the pattern buffers to select the victim bit
>> +                *
>> +                * Start with bit0
>> +                */
>> +               for (offset = 0x80; offset <= 0x8F; offset++) {
>> +                       if ((offset % 8) == victim_bit)
>> +                               msg_port_write(HTE, offset, 0x55555555);
>> +                       else
>> +                               msg_port_write(HTE, offset, 0xCCCCCCCC);
>> +               }
>> +
>> +               msg_port_write(HTE, 0x00020061, 0x00000000);
>> +               msg_port_write(HTE, 0x00020066, 0x03440000);
>> +               msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
>> +       }
>> +
>> +       tmp = 0x10001000 | (loop_cnt << 16);
>> +       msg_port_write(HTE, 0x00020011, tmp);
>> +       msg_port_write(HTE, 0x00020011, tmp | BIT8);
>> +
>> +       hte_wait_for_complete();
>> +
>> +       /*
>> +        * Return bits 15:8 of HTE_CH0_ERR_XSTAT to check for
>> +        * any bytelane errors.
>> +        */
>> +       return (hte_check_errors() >> 8) & 0xFF;
>> +}
>> +
>> +/**
>> + * This function uses HW HTE engine to initialize or test all memory attached
>> + * to a given DUNIT. If flag is MRC_MEM_INIT, this routine writes 0s to all
>> + * memory locations to initialize ECC. If flag is MRC_MEM_TEST, this routine
>> + * will send an 5AA55AA5 pattern to all memory locations on the RankMask and
>> + * then read it back. Then it sends an A55AA55A pattern to all memory locations
>> + * on the RankMask and reads it back.
>> + *
>> + * @mrc_params: host struture for all MRC global data
>> + * @flag: MRC_MEM_INIT or MRC_MEM_TEST
>> + *
>> + * @return: errors register showing HTE failures. Also prints out which rank
>> + *          failed the HTE test if failure occurs. For rank detection to work,
>> + *          the address map must be left in its default state. If MRC changes
>> + *          the address map, this function must be modified to change it back
>> + *          to default at the beginning, then restore it at the end.
>> + */
>> +u32 hte_mem_init(struct mrc_params *mrc_params, u8 flag)
>> +{
>> +       u32 offset;
>> +       int test_num;
>> +       int i;
>> +
>> +       /*
>> +        * Clear out the error registers at the start of each memory
>> +        * init or memory test run.
>> +        */
>> +       hte_clear_error_regs();
>> +
>> +       msg_port_write(HTE, 0x00020062, 0x00000015);
>> +
>> +       for (offset = 0x80; offset <= 0x8F; offset++)
>> +               msg_port_write(HTE, offset, ((offset & 1) ? 0xA55A : 0x5AA5));
>> +
>> +       msg_port_write(HTE, 0x00020021, 0x00000000);
>> +       msg_port_write(HTE, 0x00020022, (mrc_params->mem_size >> 6) - 1);
>> +       msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
>> +       msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
>> +       msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
>> +       msg_port_write(HTE, 0x00020066, 0x03000000);
>> +
>> +       switch (flag) {
>> +       case MRC_MEM_INIT:
>> +               /*
>> +                * Only 1 write pass through memory is needed
>> +                * to initialize ECC
>> +                */
>> +               test_num = 1;
>> +               break;
>> +       case MRC_MEM_TEST:
>> +               /* Write/read then write/read with inverted pattern */
>> +               test_num = 4;
>> +               break;
>> +       default:
>> +               DPF(D_INFO, "Unknown parameter for flag: %d\n", flag);
>> +               return 0xFFFFFFFF;
>> +       }
>> +
>> +       DPF(D_INFO, "hte_mem_init");
>
> debug()

Keep to use DPF().

>> +
>> +       for (i = 0; i < test_num; i++) {
>> +               DPF(D_INFO, ".");
>> +
>> +               if (i == 0) {
>> +                       msg_port_write(HTE, 0x00020061, 0x00000000);
>> +                       msg_port_write(HTE, 0x00020020, 0x00110010);
>> +               } else if (i == 1) {
>> +                       msg_port_write(HTE, 0x00020061, 0x00000000);
>> +                       msg_port_write(HTE, 0x00020020, 0x00010010);
>> +               } else if (i == 2) {
>> +                       msg_port_write(HTE, 0x00020061, 0x00010100);
>> +                       msg_port_write(HTE, 0x00020020, 0x00110010);
>> +               } else {
>> +                       msg_port_write(HTE, 0x00020061, 0x00010100);
>> +                       msg_port_write(HTE, 0x00020020, 0x00010010);
>> +               }
>> +
>> +               msg_port_write(HTE, 0x00020011, 0x00111000);
>> +               msg_port_write(HTE, 0x00020011, 0x00111100);
>> +
>> +               hte_wait_for_complete();
>> +
>> +               /* If this is a READ pass, check for errors at the end */
>> +               if ((i % 2) == 1) {
>> +                       /* Return immediately if error */
>> +                       if (hte_check_errors())
>> +                               break;
>> +               }
>> +       }
>> +
>> +       DPF(D_INFO, "done\n");
>> +
>> +       return hte_check_errors();
>> +}
>> +
>> +/**
>> + * This function executes basic single cache line memory write/read/verify
>
> 'executes a basic'

Fixed.

>> + * test using simple constant pattern, different for READ_RAIN and
>> + * WRITE_TRAIN modes.
>> + *
>> + * @mrc_params: host struture for all MRC global data
>
> structure, please fix globally

Fixed globally.

>> + * @addr: memory adress being tested (must hit specific channel/rank)
>> + * @first_run: if set then hte registers are configured, otherwise it is
>> + *             assumed configuration is done and just re-run the test
>> + * @mode: READ_TRAIN or WRITE_TRAIN (the difference is in the pattern)
>> + *
>> + * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
>> + */
>> +u16 hte_basic_write_read(struct mrc_params *mrc_params, u32 addr,
>> +                        u8 first_run, u8 mode)
>
> Why do we use u8 for these? Would uint be good enough? Just a suggestion.
>
>> +{
>> +       u16 errors;
>> +
>> +       ENTERFN();
>> +
>> +       /* Enable all error reporting in preparation for HTE test */
>> +       hte_enable_all_errors();
>> +       hte_clear_error_regs();
>> +
>> +       errors = hte_basic_data_cmp(mrc_params, addr, first_run, mode);
>> +
>> +       LEAVEFN();
>> +
>> +       return errors;
>> +}
>> +
>> +/**
>> + * This function examines single cache line memory with write/read/verify
>
> examines a single-cache-line memory
>
> (at least I think this is what it is saying)

Fixed globally.

>> + * test using multiple data patterns (victim-aggressor algorithm).
>> + *
>> + * @mrc_params: host struture for all MRC global data
>> + * @addr: memory adress being tested (must hit specific channel/rank)
>> + * @first_run: if set then hte registers are configured, otherwise it is
>> + *             assumed configuration is done and just re-run the test
>> + *
>> + * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
>> + */
>> +u16 hte_write_stress_bit_lanes(struct mrc_params *mrc_params,
>> +                              u32 addr, u8 first_run)
>> +{
>> +       u16 errors;
>> +       u8 victim_bit = 0;
>> +
>> +       ENTERFN();
>> +
>> +       /* Enable all error reporting in preparation for HTE test */
>> +       hte_enable_all_errors();
>> +       hte_clear_error_regs();
>> +
>> +       /*
>> +        * Loop through each bit in the bytelane.
>> +        *
>> +        * Each pass creates a victim bit while keeping all other bits the same
>> +        * as aggressors. AVN HTE adds an auto-rotate feature which allows us
>> +        * to program the entire victim/aggressor sequence in 1 step.
>
> What is AVN?

I don't know. I guess it might be Intel's Avoton core in some Atom
processors. So does not change the comment ..

>> +        *
>> +        * The victim bit rotates on each pass so no need to have software
>> +        * implement a victim bit loop like on VLV.
>
> VLV? I think it is sometimes better to write these out and put the
> abbreviation in brackets after it, at least once in the file.

Like above, I guess it is Valleyview (VLV). Don't know if it is
correct. So keep it unchanged.

>
>> +        */
>> +       errors = hte_rw_data_cmp(mrc_params, addr, HTE_LOOP_CNT,
>> +                                HTE_LFSR_VICTIM_SEED, HTE_LFSR_AGRESSOR_SEED,
>> +                                victim_bit, first_run);
>> +
>> +       LEAVEFN();
>> +
>> +       return errors;
>> +}
>> +
>> +/**
>> + * This function execute basic single cache line memory write or read.
>
> as above
>
>> + * This is just for receive enable / fine write levelling purpose.
>
> write-levelling (I think that's what you mean)

Fixed

>> + *
>> + * @addr: memory adress being tested (must hit specific channel/rank)
>> + * @first_run: if set then hte registers are configured, otherwise it is
>> + *             assumed configuration is done and just re-run the test
>> + * @is_write: when non-zero memory write operation executed, otherwise read
>> + */
>> +void hte_mem_op(u32 addr, u8 first_run, u8 is_write)
>> +{
>> +       u32 offset;
>> +       u32 tmp;
>> +
>> +       hte_enable_all_errors();
>> +       hte_clear_error_regs();
>> +
>> +       if (first_run) {
>> +               tmp = is_write ? 0x01110021 : 0x01010021;
>> +               msg_port_write(HTE, 0x00020020, tmp);
>> +
>> +               msg_port_write(HTE, 0x00020021, 0x06000000);
>> +               msg_port_write(HTE, 0x00020022, addr >> 6);
>> +               msg_port_write(HTE, 0x00020062, 0x00800015);
>> +               msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
>> +               msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
>> +               msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
>> +               msg_port_write(HTE, 0x00020061, 0x00030008);
>> +
>> +               for (offset = 0x80; offset <= 0x8F; offset++)
>> +                       msg_port_write(HTE, offset, 0xC33C0000);
>> +       }
>> +
>> +       msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
>> +       msg_port_write(HTE, 0x00020011, 0x00011000);
>> +       msg_port_write(HTE, 0x00020011, 0x00011100);
>> +
>> +       hte_wait_for_complete();
>> +}
>> diff --git a/arch/x86/cpu/quark/hte.h b/arch/x86/cpu/quark/hte.h
>> new file mode 100644
>> index 0000000..3a173ea
>> --- /dev/null
>> +++ b/arch/x86/cpu/quark/hte.h
>> @@ -0,0 +1,44 @@
>> +/*
>> + * Copyright (C) 2013, Intel Corporation
>> + * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
>> + *
>> + * Ported from Intel released Quark UEFI BIOS
>> + * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
>> + *
>> + * SPDX-License-Identifier:    Intel
>> + */
>> +
>> +#ifndef _HTE_H_
>> +#define _HTE_H_
>> +
>> +enum {
>> +       MRC_MEM_INIT,
>> +       MRC_MEM_TEST
>> +};
>> +
>> +enum {
>> +       READ_TRAIN,
>> +       WRITE_TRAIN
>> +};
>> +
>> +/*
>> + * EXP_LOOP_CNT field of HTE_CMD_CTL
>> + *
>> + * This CANNOT be less than 4!
>> + */
>> +#define HTE_LOOP_CNT           5
>> +
>> +/* random seed for victim */
>> +#define HTE_LFSR_VICTIM_SEED   0xF294BA21
>> +
>> +/* random seed for aggressor */
>> +#define HTE_LFSR_AGRESSOR_SEED 0xEBA7492D
>> +
>> +u32 hte_mem_init(struct mrc_params *mrc_params, u8 flag);
>> +u16 hte_basic_write_read(struct mrc_params *mrc_params, u32 addr,
>> +                        u8 first_run, u8 mode);
>> +u16 hte_write_stress_bit_lanes(struct mrc_params *mrc_params,
>> +                              u32 addr, u8 first_run);
>> +void hte_mem_op(u32 addr, u8 first_run, u8 is_write);
>
> Can you move the comments from the .c to the .h for these exported functions?
>

These routines are only used by MRC internally, and not public APIs.
Thus I don't move the comments to header files.

>> +
>> +#endif /* _HTE_H_ */
>> diff --git a/arch/x86/cpu/quark/mrc_util.c b/arch/x86/cpu/quark/mrc_util.c
>> new file mode 100644
>> index 0000000..1ae42d6
>> --- /dev/null
>> +++ b/arch/x86/cpu/quark/mrc_util.c
>> @@ -0,0 +1,1499 @@
>> +/*
>> + * Copyright (C) 2013, Intel Corporation
>> + * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
>> + *
>> + * Ported from Intel released Quark UEFI BIOS
>> + * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
>> + *
>> + * SPDX-License-Identifier:    Intel
>> + */
>> +
>> +#include <common.h>
>> +#include <asm/arch/device.h>
>> +#include <asm/arch/mrc.h>
>> +#include <asm/arch/msg_port.h>
>> +#include "mrc_util.h"
>> +#include "hte.h"
>> +#include "smc.h"
>> +
>> +static const uint8_t vref_codes[64] = {
>> +       /* lowest to highest */
>> +       0x3F, 0x3E, 0x3D, 0x3C, 0x3B, 0x3A, 0x39, 0x38,
>> +       0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30,
>> +       0x2F, 0x2E, 0x2D, 0x2C, 0x2B, 0x2A, 0x29, 0x28,
>> +       0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20,
>> +       0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
>> +       0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
>> +       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
>> +       0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
>> +};
>> +
>> +void mrc_write_mask(u32 unit, u32 addr, u32 data, u32 mask)
>> +{
>> +       msg_port_write(unit, addr,
>> +                      (msg_port_read(unit, addr) & ~(mask)) |
>> +                      ((data) & (mask)));
>> +}
>> +
>> +void mrc_alt_write_mask(u32 unit, u32 addr, u32 data, u32 mask)
>> +{
>> +       msg_port_alt_write(unit, addr,
>> +                          (msg_port_alt_read(unit, addr) & ~(mask)) |
>> +                          ((data) & (mask)));
>> +}
>> +
>> +void mrc_post_code(uint8_t major, uint8_t minor)
>> +{
>> +       /* send message to UART */
>> +       DPF(D_INFO, "POST: 0x%01x%02x\n", major, minor);
>> +
>> +       /* error check */
>> +       if (major == 0xEE)
>> +               hang();
>> +}
>> +
>> +/* Delay number of nanoseconds */
>> +void delay_n(uint32_t ns)
>> +{
>> +       /* 1000 MHz clock has 1ns period --> no conversion required */
>> +       uint64_t final_tsc = rdtsc();
>
> blank line here after declarations end
>
Fixed.

>> +       final_tsc += ((get_tbclk_mhz() * ns) / 1000);
>> +
>> +       while (rdtsc() < final_tsc)
>> +               ;
>> +}
>> +
>> +/* Delay number of microseconds */
>> +void delay_u(uint32_t ms)
>> +{
>> +       /* 64-bit math is not an option, just use loops */
>> +       while (ms--)
>> +               delay_n(1000);
>> +}
>
> Some day I suspect these could be pulled out into general x86
> functions. Let's see if anything else needs them first.

Agreed.

>> +
>> +/* Select Memory Manager as the source for PRI interface */
>> +void select_mem_mgr(void)
>> +{
>> +       u32 dco;
>> +
>> +       ENTERFN();
>> +
>> +       dco = msg_port_read(MEM_CTLR, DCO);
>> +       dco &= ~BIT28;
>
> ~(1 << 28)
>
> Ah but I see you are using this everywhere.
>
> U-Boot tries to avoid defining this sort of thing. See some comments
> below about this.

Did not fix the BIT stuff.

>> +       msg_port_write(MEM_CTLR, DCO, dco);
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/* Select HTE as the source for PRI interface */
>> +void select_hte(void)
>> +{
>> +       u32 dco;
>> +
>> +       ENTERFN();
>> +
>> +       dco = msg_port_read(MEM_CTLR, DCO);
>> +       dco |= BIT28;
>> +       msg_port_write(MEM_CTLR, DCO, dco);
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * Send DRAM command
>> + * data should be formated using DCMD_Xxxx macro or emrsXCommand structure
>> + */
>> +void dram_init_command(uint32_t data)
>> +{
>> +       pci_write_config_dword(QUARK_HOST_BRIDGE, MSG_DATA_REG, data);
>> +       pci_write_config_dword(QUARK_HOST_BRIDGE, MSG_CTRL_EXT_REG, 0);
>> +       msg_port_setup(MSG_OP_DRAM_INIT, MEM_CTLR, 0);
>> +
>> +       DPF(D_REGWR, "WR32 %03X %08X %08X\n", MEM_CTLR, 0, data);
>> +}
>> +
>> +/* Send DRAM wake command using special MCU side-band WAKE opcode */
>> +void dram_wake_command(void)
>> +{
>> +       ENTERFN();
>> +
>> +       msg_port_setup(MSG_OP_DRAM_WAKE, MEM_CTLR, 0);
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +void training_message(uint8_t channel, uint8_t rank, uint8_t byte_lane)
>> +{
>> +       /* send message to UART */
>> +       DPF(D_INFO, "CH%01X RK%01X BL%01X\n", channel, rank, byte_lane);
>> +}
>> +
>> +/*
>> + * This function will program the RCVEN delays
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +void set_rcvn(uint8_t channel, uint8_t rank,
>> +             uint8_t byte_lane, uint32_t pi_count)
>
> reformat to 80cols. Should this or any other function in this file be static?

These functions are used by MRC internally. Also I did not fix this
'reformat to 80cols) as I think it is fine.

>> +{
>> +       uint32_t reg;
>> +       uint32_t msk;
>> +       uint32_t temp;
>> +
>> +       ENTERFN();
>> +
>> +       DPF(D_TRN, "Rcvn ch%d rnk%d ln%d : pi=%03X\n",
>> +           channel, rank, byte_lane, pi_count);
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * BL0 -> B01PTRCTL0[11:08] (0x0-0xF)
>> +        * BL1 -> B01PTRCTL0[23:20] (0x0-0xF)
>> +        */
>> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       msk = (byte_lane & BIT0) ? (BIT23 | BIT22 | BIT21 | BIT20) :
>> +               (BIT11 | BIT10 | BIT9 | BIT8);
>
> Would this be better as:
>
> (0xf << 20) | (0xf << 8)
>
> It might be more meaningful also.
>
> I really don't think these long strings of | are nice.

I agree, but I did not fix this. Changing those globally is really
error prone and may break the whole MRC (my brain could be bad
converting these bits to hex numbers).

>> +       temp = (byte_lane & BIT0) ? ((pi_count / HALF_CLK) << 20) :
>> +               ((pi_count / HALF_CLK) << 8);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * BL0 -> B0DLLPICODER0[29:24] (0x00-0x3F)
>> +        * BL1 -> B1DLLPICODER0[29:24] (0x00-0x3F)
>
> lower case hex again

Did not fix these for consistency.

>> +        */
>> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
>> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET));
>> +       msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
>> +       temp = pi_count << 24;
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /*
>> +        * DEADBAND
>> +        * BL0/1 -> B01DBCTL1[08/11] (+1 select)
>> +        * BL0/1 -> B01DBCTL1[02/05] (enable)
>> +        */
>> +       reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       msk = 0x00;
>> +       temp = 0x00;
>> +
>> +       /* enable */
>> +       msk |= (byte_lane & BIT0) ? (BIT5) : (BIT2);
>
> Remove () around BIT5

Fixed globally.

>> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
>> +               temp |= msk;
>> +
>> +       /* select */
>> +       msk |= (byte_lane & BIT0) ? (BIT11) : (BIT8);
>> +       if (pi_count < EARLY_DB)
>> +               temp |= msk;
>
> These uses of BIT seem more useful to me.
>
> Still it would be better to have #defines for the bits which actually
> describe their meaning.
>
> Maybe you don't know the meaning though...

Yes!!!

>> +
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* error check */
>> +       if (pi_count > 0x3F) {
>> +               training_message(channel, rank, byte_lane);
>> +               mrc_post_code(0xEE, 0xE0);
>> +       }
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return the current RCVEN delay on the given
>> + * channel, rank, byte_lane as an absolute PI count.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +uint32_t get_rcvn(uint8_t channel, uint8_t rank, uint8_t byte_lane)
>> +{
>> +       uint32_t reg;
>> +       uint32_t temp;
>> +       uint32_t pi_count;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * BL0 -> B01PTRCTL0[11:08] (0x0-0xF)
>> +        * BL1 -> B01PTRCTL0[23:20] (0x0-0xF)
>> +        */
>> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= (byte_lane & BIT0) ? (20) : (8);
>> +       temp &= 0xF;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count = temp * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * BL0 -> B0DLLPICODER0[29:24] (0x00-0x3F)
>> +        * BL1 -> B1DLLPICODER0[29:24] (0x00-0x3F)
>> +        */
>> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
>
> Please avoid () around simple constants. Put them in the #define/enum if needed.

Fixed globally by removing ().

>> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET));
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= 24;
>> +       temp &= 0x3F;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count += temp;
>> +
>> +       LEAVEFN();
>> +
>> +       return pi_count;
>> +}
>> +
>> +/*
>> + * This function will program the RDQS delays based on an absolute
>> + * amount of PIs.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +void set_rdqs(uint8_t channel, uint8_t rank,
>> +             uint8_t byte_lane, uint32_t pi_count)
>> +{
>> +       uint32_t reg;
>> +       uint32_t msk;
>> +       uint32_t temp;
>> +
>> +       ENTERFN();
>> +       DPF(D_TRN, "Rdqs ch%d rnk%d ln%d : pi=%03X\n",
>> +           channel, rank, byte_lane, pi_count);
>> +
>> +       /*
>> +        * PI (1/128 MCLK)
>> +        * BL0 -> B0RXDQSPICODE[06:00] (0x00-0x47)
>> +        * BL1 -> B1RXDQSPICODE[06:00] (0x00-0x47)
>> +        */
>> +       reg = (byte_lane & BIT0) ? (B1RXDQSPICODE) : (B0RXDQSPICODE);
>> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET));
>> +       msk = (BIT6 | BIT5 | BIT4 | BIT3 | BIT2 | BIT1 | BIT0);
>> +       temp = pi_count << 0;
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* error check (shouldn't go above 0x3F) */
>> +       if (pi_count > 0x47) {
>> +               training_message(channel, rank, byte_lane);
>> +               mrc_post_code(0xEE, 0xE1);
>> +       }
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return the current RDQS delay on the given
>> + * channel, rank, byte_lane as an absolute PI count.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +uint32_t get_rdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane)
>> +{
>> +       uint32_t reg;
>> +       uint32_t temp;
>> +       uint32_t pi_count;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * PI (1/128 MCLK)
>> +        * BL0 -> B0RXDQSPICODE[06:00] (0x00-0x47)
>> +        * BL1 -> B1RXDQSPICODE[06:00] (0x00-0x47)
>> +        */
>> +       reg = (byte_lane & BIT0) ? (B1RXDQSPICODE) : (B0RXDQSPICODE);
>> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET));
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count = temp & 0x7F;
>> +
>> +       LEAVEFN();
>> +
>> +       return pi_count;
>> +}
>> +
>> +/*
>> + * This function will program the WDQS delays based on an absolute
>> + * amount of PIs.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +void set_wdqs(uint8_t channel, uint8_t rank,
>> +             uint8_t byte_lane, uint32_t pi_count)
>> +{
>> +       uint32_t reg;
>> +       uint32_t msk;
>> +       uint32_t temp;
>> +
>> +       ENTERFN();
>> +
>> +       DPF(D_TRN, "Wdqs ch%d rnk%d ln%d : pi=%03X\n",
>> +           channel, rank, byte_lane, pi_count);
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * BL0 -> B01PTRCTL0[07:04] (0x0-0xF)
>> +        * BL1 -> B01PTRCTL0[19:16] (0x0-0xF)
>> +        */
>> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       msk = (byte_lane & BIT0) ? (BIT19 | BIT18 | BIT17 | BIT16) :
>> +               (BIT7 | BIT6 | BIT5 | BIT4);
>> +       temp = pi_count / HALF_CLK;
>> +       temp <<= (byte_lane & BIT0) ? (16) : (4);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * BL0 -> B0DLLPICODER0[21:16] (0x00-0x3F)
>> +        * BL1 -> B1DLLPICODER0[21:16] (0x00-0x3F)
>> +        */
>> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
>> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET));
>> +       msk = (BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16);
>> +       temp = pi_count << 16;
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /*
>> +        * DEADBAND
>> +        * BL0/1 -> B01DBCTL1[07/10] (+1 select)
>> +        * BL0/1 -> B01DBCTL1[01/04] (enable)
>> +        */
>> +       reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       msk = 0x00;
>> +       temp = 0x00;
>> +
>> +       /* enable */
>> +       msk |= (byte_lane & BIT0) ? (BIT4) : (BIT1);
>> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
>> +               temp |= msk;
>> +
>> +       /* select */
>> +       msk |= (byte_lane & BIT0) ? (BIT10) : (BIT7);
>> +       if (pi_count < EARLY_DB)
>> +               temp |= msk;
>> +
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* error check */
>> +       if (pi_count > 0x3F) {
>> +               training_message(channel, rank, byte_lane);
>> +               mrc_post_code(0xEE, 0xE2);
>> +       }
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return the amount of WDQS delay on the given
>> + * channel, rank, byte_lane as an absolute PI count.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +uint32_t get_wdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane)
>> +{
>> +       uint32_t reg;
>> +       uint32_t temp;
>> +       uint32_t pi_count;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * BL0 -> B01PTRCTL0[07:04] (0x0-0xF)
>> +        * BL1 -> B01PTRCTL0[19:16] (0x0-0xF)
>> +        */
>> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= (byte_lane & BIT0) ? (16) : (4);
>> +       temp &= 0xF;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count = (temp * HALF_CLK);
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * BL0 -> B0DLLPICODER0[21:16] (0x00-0x3F)
>> +        * BL1 -> B1DLLPICODER0[21:16] (0x00-0x3F)
>> +        */
>> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
>> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET));
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= 16;
>> +       temp &= 0x3F;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count += temp;
>> +
>> +       LEAVEFN();
>> +
>> +       return pi_count;
>> +}
>> +
>> +/*
>> + * This function will program the WDQ delays based on an absolute
>> + * number of PIs.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +void set_wdq(uint8_t channel, uint8_t rank,
>> +            uint8_t byte_lane, uint32_t pi_count)
>> +{
>> +       uint32_t reg;
>> +       uint32_t msk;
>> +       uint32_t temp;
>> +
>> +       ENTERFN();
>> +
>> +       DPF(D_TRN, "Wdq ch%d rnk%d ln%d : pi=%03X\n",
>> +           channel, rank, byte_lane, pi_count);
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * BL0 -> B01PTRCTL0[03:00] (0x0-0xF)
>> +        * BL1 -> B01PTRCTL0[15:12] (0x0-0xF)
>> +        */
>> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       msk = (byte_lane & BIT0) ? (BIT15 | BIT14 | BIT13 | BIT12) :
>> +               (BIT3 | BIT2 | BIT1 | BIT0);
>> +       temp = pi_count / HALF_CLK;
>> +       temp <<= (byte_lane & BIT0) ? (12) : (0);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * BL0 -> B0DLLPICODER0[13:08] (0x00-0x3F)
>> +        * BL1 -> B1DLLPICODER0[13:08] (0x00-0x3F)
>> +        */
>> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
>> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET));
>> +       msk = (BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);
>> +       temp = pi_count << 8;
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /*
>> +        * DEADBAND
>> +        * BL0/1 -> B01DBCTL1[06/09] (+1 select)
>> +        * BL0/1 -> B01DBCTL1[00/03] (enable)
>> +        */
>> +       reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       msk = 0x00;
>> +       temp = 0x00;
>> +
>> +       /* enable */
>> +       msk |= (byte_lane & BIT0) ? (BIT3) : (BIT0);
>> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
>> +               temp |= msk;
>> +
>> +       /* select */
>> +       msk |= (byte_lane & BIT0) ? (BIT9) : (BIT6);
>> +       if (pi_count < EARLY_DB)
>> +               temp |= msk;
>> +
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* error check */
>> +       if (pi_count > 0x3F) {
>> +               training_message(channel, rank, byte_lane);
>> +               mrc_post_code(0xEE, 0xE3);
>> +       }
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return the amount of WDQ delay on the given
>> + * channel, rank, byte_lane as an absolute PI count.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +uint32_t get_wdq(uint8_t channel, uint8_t rank, uint8_t byte_lane)
>> +{
>> +       uint32_t reg;
>> +       uint32_t temp;
>> +       uint32_t pi_count;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * BL0 -> B01PTRCTL0[03:00] (0x0-0xF)
>> +        * BL1 -> B01PTRCTL0[15:12] (0x0-0xF)
>> +        */
>> +       reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= (byte_lane & BIT0) ? (12) : (0);
>> +       temp &= 0xF;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count = (temp * HALF_CLK);
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * BL0 -> B0DLLPICODER0[13:08] (0x00-0x3F)
>> +        * BL1 -> B1DLLPICODER0[13:08] (0x00-0x3F)
>> +        */
>> +       reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
>> +       reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
>> +               (channel * DDRIODQ_CH_OFFSET));
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= 8;
>> +       temp &= 0x3F;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count += temp;
>> +
>> +       LEAVEFN();
>> +
>> +       return pi_count;
>> +}
>> +
>> +/*
>> + * This function will program the WCMD delays based on an absolute
>> + * number of PIs.
>> + */
>> +void set_wcmd(uint8_t channel, uint32_t pi_count)
>> +{
>> +       uint32_t reg;
>> +       uint32_t msk;
>> +       uint32_t temp;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * CMDPTRREG[11:08] (0x0-0xF)
>> +        */
>> +       reg = CMDPTRREG + (channel * DDRIOCCC_CH_OFFSET);
>> +       msk = (BIT11 | BIT10 | BIT9 | BIT8);
>> +       temp = pi_count / HALF_CLK;
>> +       temp <<= 8;
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * CMDDLLPICODER0[29:24] -> CMDSLICE R3 (unused)
>> +        * CMDDLLPICODER0[21:16] -> CMDSLICE L3 (unused)
>> +        * CMDDLLPICODER0[13:08] -> CMDSLICE R2 (unused)
>> +        * CMDDLLPICODER0[05:00] -> CMDSLICE L2 (unused)
>> +        * CMDDLLPICODER1[29:24] -> CMDSLICE R1 (unused)
>> +        * CMDDLLPICODER1[21:16] -> CMDSLICE L1 (0x00-0x3F)
>> +        * CMDDLLPICODER1[13:08] -> CMDSLICE R0 (unused)
>> +        * CMDDLLPICODER1[05:00] -> CMDSLICE L0 (unused)
>> +        */
>> +       reg = CMDDLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
>> +
>> +       msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24 |
>> +               BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16 |
>> +               BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8 |
>> +               BIT5 | BIT4 | BIT3 | BIT2 | BIT1 | BIT0);
>> +
>> +       temp = (pi_count << 24) | (pi_count << 16) |
>> +               (pi_count << 8) | (pi_count << 0);
>> +
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +       reg = CMDDLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);  /* PO */
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /*
>> +        * DEADBAND
>> +        * CMDCFGREG0[17] (+1 select)
>> +        * CMDCFGREG0[16] (enable)
>> +        */
>> +       reg = CMDCFGREG0 + (channel * DDRIOCCC_CH_OFFSET);
>> +       msk = 0x00;
>> +       temp = 0x00;
>> +
>> +       /* enable */
>> +       msk |= BIT16;
>> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
>> +               temp |= msk;
>> +
>> +       /* select */
>> +       msk |= BIT17;
>> +       if (pi_count < EARLY_DB)
>> +               temp |= msk;
>> +
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* error check */
>> +       if (pi_count > 0x3F)
>> +               mrc_post_code(0xEE, 0xE4);
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return the amount of WCMD delay on the given
>> + * channel as an absolute PI count.
>> + */
>> +uint32_t get_wcmd(uint8_t channel)
>> +{
>> +       uint32_t reg;
>> +       uint32_t temp;
>> +       uint32_t pi_count;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * CMDPTRREG[11:08] (0x0-0xF)
>> +        */
>> +       reg = CMDPTRREG + (channel * DDRIOCCC_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= 8;
>> +       temp &= 0xF;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count = temp * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * CMDDLLPICODER0[29:24] -> CMDSLICE R3 (unused)
>> +        * CMDDLLPICODER0[21:16] -> CMDSLICE L3 (unused)
>> +        * CMDDLLPICODER0[13:08] -> CMDSLICE R2 (unused)
>> +        * CMDDLLPICODER0[05:00] -> CMDSLICE L2 (unused)
>> +        * CMDDLLPICODER1[29:24] -> CMDSLICE R1 (unused)
>> +        * CMDDLLPICODER1[21:16] -> CMDSLICE L1 (0x00-0x3F)
>> +        * CMDDLLPICODER1[13:08] -> CMDSLICE R0 (unused)
>> +        * CMDDLLPICODER1[05:00] -> CMDSLICE L0 (unused)
>> +        */
>> +       reg = CMDDLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= 16;
>> +       temp &= 0x3F;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count += temp;
>> +
>> +       LEAVEFN();
>> +
>> +       return pi_count;
>> +}
>> +
>> +/*
>> + * This function will program the WCLK delays based on an absolute
>> + * number of PIs.
>> + */
>> +void set_wclk(uint8_t channel, uint8_t rank, uint32_t pi_count)
>> +{
>> +       uint32_t reg;
>> +       uint32_t msk;
>> +       uint32_t temp;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * CCPTRREG[15:12] -> CLK1 (0x0-0xF)
>> +        * CCPTRREG[11:08] -> CLK0 (0x0-0xF)
>> +        */
>> +       reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
>> +       msk = (BIT15 | BIT14 | BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);
>
> mask = 0xff00 is much better, isn't it?

Agreed, but did not fix it.

>> +       temp = ((pi_count / HALF_CLK) << 12) | ((pi_count / HALF_CLK) << 8);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * ECCB1DLLPICODER0[13:08] -> CLK0 (0x00-0x3F)
>> +        * ECCB1DLLPICODER0[21:16] -> CLK1 (0x00-0x3F)
>> +        */
>> +       reg = (rank) ? (ECCB1DLLPICODER0) : (ECCB1DLLPICODER0);
>> +       reg += (channel * DDRIOCCC_CH_OFFSET);
>> +       msk = (BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16 |
>> +               BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);
>
> Ick!
>

Echo!!

>> +       temp = (pi_count << 16) | (pi_count << 8);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +       reg = (rank) ? (ECCB1DLLPICODER1) : (ECCB1DLLPICODER1);
>
> Remove all (), and below. Please fix globally.

Fixed globally.

>> +       reg += (channel * DDRIOCCC_CH_OFFSET);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +       reg = (rank) ? (ECCB1DLLPICODER2) : (ECCB1DLLPICODER2);
>> +       reg += (channel * DDRIOCCC_CH_OFFSET);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +       reg = (rank) ? (ECCB1DLLPICODER3) : (ECCB1DLLPICODER3);
>> +       reg += (channel * DDRIOCCC_CH_OFFSET);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /*
>> +        * DEADBAND
>> +        * CCCFGREG1[11:08] (+1 select)
>> +        * CCCFGREG1[03:00] (enable)
>> +        */
>> +       reg = CCCFGREG1 + (channel * DDRIOCCC_CH_OFFSET);
>> +       msk = 0x00;
>> +       temp = 0x00;
>> +
>> +       /* enable */
>> +       msk |= (BIT3 | BIT2 | BIT1 | BIT0);
>> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
>> +               temp |= msk;
>> +
>> +       /* select */
>> +       msk |= (BIT11 | BIT10 | BIT9 | BIT8);
>> +       if (pi_count < EARLY_DB)
>> +               temp |= msk;
>> +
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* error check */
>> +       if (pi_count > 0x3F)
>> +               mrc_post_code(0xEE, 0xE5);
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return the amout of WCLK delay on the given
>> + * channel, rank as an absolute PI count.
>> + */
>> +uint32_t get_wclk(uint8_t channel, uint8_t rank)
>> +{
>> +       uint32_t reg;
>> +       uint32_t temp;
>> +       uint32_t pi_count;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * CCPTRREG[15:12] -> CLK1 (0x0-0xF)
>> +        * CCPTRREG[11:08] -> CLK0 (0x0-0xF)
>> +        */
>> +       reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= (rank) ? (12) : (8);
>> +       temp &= 0xF;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count = temp * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * ECCB1DLLPICODER0[13:08] -> CLK0 (0x00-0x3F)
>> +        * ECCB1DLLPICODER0[21:16] -> CLK1 (0x00-0x3F)
>> +        */
>> +       reg = (rank) ? (ECCB1DLLPICODER0) : (ECCB1DLLPICODER0);
>> +       reg += (channel * DDRIOCCC_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= (rank) ? (16) : (8);
>> +       temp &= 0x3F;
>> +
>> +       pi_count += temp;
>> +
>> +       LEAVEFN();
>> +
>> +       return pi_count;
>> +}
>> +
>> +/*
>> + * This function will program the WCTL delays based on an absolute
>> + * number of PIs.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +void set_wctl(uint8_t channel, uint8_t rank, uint32_t pi_count)
>> +{
>> +       uint32_t reg;
>> +       uint32_t msk;
>> +       uint32_t temp;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * CCPTRREG[31:28] (0x0-0xF)
>> +        * CCPTRREG[27:24] (0x0-0xF)
>> +        */
>> +       reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
>> +       msk = (BIT31 | BIT30 | BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
>> +       temp = ((pi_count / HALF_CLK) << 28) | ((pi_count / HALF_CLK) << 24);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
>> +        * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
>> +        */
>> +       reg = ECCB1DLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);
>> +       msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
>> +       temp = (pi_count << 24);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +       reg = ECCB1DLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +       reg = ECCB1DLLPICODER2 + (channel * DDRIOCCC_CH_OFFSET);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +       reg = ECCB1DLLPICODER3 + (channel * DDRIOCCC_CH_OFFSET);
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /*
>> +        * DEADBAND
>> +        * CCCFGREG1[13:12] (+1 select)
>> +        * CCCFGREG1[05:04] (enable)
>> +        */
>> +       reg = CCCFGREG1 + (channel * DDRIOCCC_CH_OFFSET);
>> +       msk = 0x00;
>> +       temp = 0x00;
>> +
>> +       /* enable */
>> +       msk |= (BIT5 | BIT4);
>> +       if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
>> +               temp |= msk;
>> +
>> +       /* select */
>> +       msk |= (BIT13 | BIT12);
>> +       if (pi_count < EARLY_DB)
>> +               temp |= msk;
>> +
>> +       mrc_alt_write_mask(DDRPHY, reg, temp, msk);
>> +
>> +       /* error check */
>> +       if (pi_count > 0x3F)
>> +               mrc_post_code(0xEE, 0xE6);
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return the amount of WCTL delay on the given
>> + * channel, rank as an absolute PI count.
>> + *
>> + * (currently doesn't comprehend rank)
>> + */
>> +uint32_t get_wctl(uint8_t channel, uint8_t rank)
>> +{
>> +       uint32_t reg;
>> +       uint32_t temp;
>> +       uint32_t pi_count;
>> +
>> +       ENTERFN();
>> +
>> +       /*
>> +        * RDPTR (1/2 MCLK, 64 PIs)
>> +        * CCPTRREG[31:28] (0x0-0xF)
>> +        * CCPTRREG[27:24] (0x0-0xF)
>> +        */
>> +       reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= 24;
>> +       temp &= 0xF;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count = temp * HALF_CLK;
>> +
>> +       /*
>> +        * PI (1/64 MCLK, 1 PIs)
>> +        * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
>> +        * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
>> +        */
>> +       reg = ECCB1DLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);
>> +       temp = msg_port_alt_read(DDRPHY, reg);
>> +       temp >>= 24;
>> +       temp &= 0x3F;
>> +
>> +       /* Adjust PI_COUNT */
>> +       pi_count += temp;
>> +
>> +       LEAVEFN();
>> +
>> +       return pi_count;
>> +}
>> +
>> +/*
>> + * This function will program the internal Vref setting in a given
>> + * byte lane in a given channel.
>> + */
>> +void set_vref(uint8_t channel, uint8_t byte_lane, uint32_t setting)
>> +{
>> +       uint32_t reg = (byte_lane & 0x1) ? (B1VREFCTL) : (B0VREFCTL);
>> +
>> +       ENTERFN();
>> +
>> +       DPF(D_TRN, "Vref ch%d ln%d : val=%03X\n",
>> +           channel, byte_lane, setting);
>> +
>> +       mrc_alt_write_mask(DDRPHY, (reg + (channel * DDRIODQ_CH_OFFSET) +
>> +               ((byte_lane >> 1) * DDRIODQ_BL_OFFSET)),
>> +               (vref_codes[setting] << 2),
>> +               (BIT7 | BIT6 | BIT5 | BIT4 | BIT3 | BIT2));
>> +
>> +       /*
>> +        * need to wait ~300ns for Vref to settle
>> +        * (check that this is necessary)
>> +        */
>> +       delay_n(300);
>> +
>> +       /* ??? may need to clear pointers ??? */
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return the internal Vref setting for the given
>> + * channel, byte_lane.
>> + */
>> +uint32_t get_vref(uint8_t channel, uint8_t byte_lane)
>> +{
>> +       uint8_t j;
>> +       uint32_t ret_val = sizeof(vref_codes) / 2;
>> +       uint32_t reg = (byte_lane & 0x1) ? (B1VREFCTL) : (B0VREFCTL);
>> +       uint32_t temp;
>> +
>> +       ENTERFN();
>> +
>> +       temp = msg_port_alt_read(DDRPHY, (reg + (channel * DDRIODQ_CH_OFFSET) +
>> +               ((byte_lane >> 1) * DDRIODQ_BL_OFFSET)));
>> +       temp >>= 2;
>> +       temp &= 0x3F;
>> +
>> +       for (j = 0; j < sizeof(vref_codes); j++) {
>> +               if (vref_codes[j] == temp) {
>> +                       ret_val = j;
>> +                       break;
>> +               }
>> +       }
>> +
>> +       LEAVEFN();
>> +
>> +       return ret_val;
>> +}
>> +
>> +/*
>> + * This function will return a 32 bit address in the desired
>
> 32-bit

Fixed

>> + * channel and rank.
>> + */
>> +uint32_t get_addr(uint8_t channel, uint8_t rank)
>> +{
>> +       uint32_t offset = 0x02000000;   /* 32MB */
>> +
>> +       /* Begin product specific code */
>> +       if (channel > 0) {
>> +               DPF(D_ERROR, "ILLEGAL CHANNEL\n");
>> +               DEAD_LOOP();
>> +       }
>> +
>> +       if (rank > 1) {
>> +               DPF(D_ERROR, "ILLEGAL RANK\n");
>> +               DEAD_LOOP();
>> +       }
>> +
>> +       /* use 256MB lowest density as per DRP == 0x0003 */
>> +       offset += rank * (256 * 1024 * 1024);
>> +
>> +       return offset;
>> +}
>> +
>> +/*
>> + * This function will sample the DQTRAINSTS registers in the given
>> + * channel/rank SAMPLE_SIZE times looking for a valid '0' or '1'.
>> + *
>> + * It will return an encoded DWORD in which each bit corresponds to
>
> DWORD?

Changed to 32-bit data

>> + * the sampled value on the byte lane.
>> + */
>> +uint32_t sample_dqs(struct mrc_params *mrc_params, uint8_t channel,
>> +                   uint8_t rank, bool rcvn)
>> +{
>> +       uint8_t j;      /* just a counter */
>> +       uint8_t bl;     /* which BL in the module (always 2 per module) */
>> +       uint8_t bl_grp; /* which BL module */
>> +       /* byte lane divisor */
>
> Maybe rename the variable so you can drop the comment?

That does not help much. Unchanged.

>> +       uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
>> +       uint32_t msk[2];        /* BLx in module */
>> +       /* DQTRAINSTS register contents for each sample */
>> +       uint32_t sampled_val[SAMPLE_SIZE];
>> +       uint32_t num_0s;        /* tracks the number of '0' samples */
>> +       uint32_t num_1s;        /* tracks the number of '1' samples */
>> +       uint32_t ret_val = 0x00;        /* assume all '0' samples */
>> +       uint32_t address = get_addr(channel, rank);
>> +
>> +       /* initialise msk[] */
>> +       msk[0] = (rcvn) ? (BIT1) : (BIT9);      /* BL0 */
>> +       msk[1] = (rcvn) ? (BIT0) : (BIT8);      /* BL1 */
>> +
>> +       /* cycle through each byte lane group */
>> +       for (bl_grp = 0; bl_grp < (NUM_BYTE_LANES / bl_divisor) / 2; bl_grp++) {
>> +               /* take SAMPLE_SIZE samples */
>> +               for (j = 0; j < SAMPLE_SIZE; j++) {
>> +                       hte_mem_op(address, mrc_params->first_run,
>> +                                  rcvn ? 0 : 1);
>> +                       mrc_params->first_run = 0;
>> +
>> +                       /*
>> +                        * record the contents of the proper
>> +                        * DQTRAINSTS register
>> +                        */
>> +                       sampled_val[j] = msg_port_alt_read(DDRPHY,
>> +                               (DQTRAINSTS +
>> +                               (bl_grp * DDRIODQ_BL_OFFSET) +
>> +                               (channel * DDRIODQ_CH_OFFSET)));
>> +               }
>> +
>> +               /*
>> +                * look for a majority value (SAMPLE_SIZE / 2) + 1
>> +                * on the byte lane and set that value in the corresponding
>> +                * ret_val bit
>> +                */
>> +               for (bl = 0; bl < 2; bl++) {
>> +                       num_0s = 0x00;  /* reset '0' tracker for byte lane */
>> +                       num_1s = 0x00;  /* reset '1' tracker for byte lane */
>> +                       for (j = 0; j < SAMPLE_SIZE; j++) {
>> +                               if (sampled_val[j] & msk[bl])
>> +                                       num_1s++;
>> +                               else
>> +                                       num_0s++;
>> +                       }
>> +               if (num_1s > num_0s)
>> +                       ret_val |= (1 << (bl + (bl_grp * 2)));
>> +               }
>> +       }
>> +
>> +       /*
>> +        * "ret_val.0" contains the status of BL0
>> +        * "ret_val.1" contains the status of BL1
>> +        * "ret_val.2" contains the status of BL2
>> +        * etc.
>
> This comment should go in @return in the function comment.

Actually I failed to understand what it really means, so leave it unchanged :(

>> +        */
>> +       return ret_val;
>> +}
>> +
>> +/* This function will find the rising edge transition on RCVN or WDQS */
>> +void find_rising_edge(struct mrc_params *mrc_params, uint32_t delay[],
>> +                     uint8_t channel, uint8_t rank, bool rcvn)
>> +{
>> +       bool all_edges_found;   /* determines stop condition */
>> +       bool direction[NUM_BYTE_LANES]; /* direction indicator */
>> +       uint8_t sample; /* sample counter */
>> +       uint8_t bl;     /* byte lane counter */
>> +       /* byte lane divisor */
>> +       uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
>> +       uint32_t sample_result[SAMPLE_CNT];     /* results of sample_dqs() */
>> +       uint32_t temp;
>> +       uint32_t transition_pattern;
>> +
>> +       ENTERFN();
>> +
>> +       /* select hte and request initial configuration */
>> +       select_hte();
>> +       mrc_params->first_run = 1;
>> +
>> +       /* Take 3 sample points (T1,T2,T3) to obtain a transition pattern */
>> +       for (sample = 0; sample < SAMPLE_CNT; sample++) {
>> +               /* program the desired delays for sample */
>> +               for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
>> +                       /* increase sample delay by 26 PI (0.2 CLK) */
>> +                       if (rcvn) {
>> +                               set_rcvn(channel, rank, bl,
>> +                                        delay[bl] + (sample * SAMPLE_DLY));
>> +                       } else {
>> +                               set_wdqs(channel, rank, bl,
>> +                                        delay[bl] + (sample * SAMPLE_DLY));
>> +                       }
>> +               }
>> +
>> +               /* take samples (Tsample_i) */
>> +               sample_result[sample] = sample_dqs(mrc_params,
>> +                       channel, rank, rcvn);
>> +
>> +               DPF(D_TRN,
>> +                   "Find rising edge %s ch%d rnk%d: #%d dly=%d dqs=%02X\n",
>> +                   (rcvn ? "RCVN" : "WDQS"), channel, rank, sample,
>> +                   sample * SAMPLE_DLY, sample_result[sample]);
>> +       }
>> +
>> +       /*
>> +        * This pattern will help determine where we landed and ultimately
>> +        * how to place RCVEN/WDQS.
>> +        */
>> +       for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
>> +               /* build transition_pattern (MSB is 1st sample) */
>> +               transition_pattern = 0;
>> +               for (sample = 0; sample < SAMPLE_CNT; sample++) {
>> +                       transition_pattern |=
>> +                               ((sample_result[sample] & (1 << bl)) >> bl) <<
>> +                               (SAMPLE_CNT - 1 - sample);
>> +               }
>> +
>> +               DPF(D_TRN, "=== transition pattern %d\n", transition_pattern);
>> +
>> +               /*
>> +                * set up to look for rising edge based on
>> +                * transition_pattern
>> +                */
>> +               switch (transition_pattern) {
>> +               case 0: /* sampled 0->0->0 */
>> +                       /* move forward from T3 looking for 0->1 */
>> +                       delay[bl] += 2 * SAMPLE_DLY;
>> +                       direction[bl] = FORWARD;
>> +                       break;
>> +               case 1: /* sampled 0->0->1 */
>> +               case 5: /* sampled 1->0->1 (bad duty cycle) *HSD#237503* */
>> +                       /* move forward from T2 looking for 0->1 */
>> +                       delay[bl] += 1 * SAMPLE_DLY;
>> +                       direction[bl] = FORWARD;
>> +                       break;
>> +               case 2: /* sampled 0->1->0 (bad duty cycle) *HSD#237503* */
>> +               case 3: /* sampled 0->1->1 */
>> +                       /* move forward from T1 looking for 0->1 */
>> +                       delay[bl] += 0 * SAMPLE_DLY;
>> +                       direction[bl] = FORWARD;
>> +                       break;
>> +               case 4: /* sampled 1->0->0 (assumes BL8, HSD#234975) */
>> +                       /* move forward from T3 looking for 0->1 */
>> +                       delay[bl] += 2 * SAMPLE_DLY;
>> +                       direction[bl] = FORWARD;
>> +                       break;
>> +               case 6: /* sampled 1->1->0 */
>> +               case 7: /* sampled 1->1->1 */
>> +                       /* move backward from T1 looking for 1->0 */
>> +                       delay[bl] += 0 * SAMPLE_DLY;
>> +                       direction[bl] = BACKWARD;
>> +                       break;
>> +               default:
>> +                       mrc_post_code(0xEE, 0xEE);
>> +                       break;
>> +               }
>> +
>> +               /* program delays */
>> +               if (rcvn)
>> +                       set_rcvn(channel, rank, bl, delay[bl]);
>> +               else
>> +                       set_wdqs(channel, rank, bl, delay[bl]);
>> +       }
>> +
>> +       /*
>> +        * Based on the observed transition pattern on the byte lane,
>> +        * begin looking for a rising edge with single PI granularity.
>> +        */
>> +       do {
>> +               all_edges_found = true; /* assume all byte lanes passed */
>> +               /* take a sample */
>> +               temp = sample_dqs(mrc_params, channel, rank, rcvn);
>> +               /* check all each byte lane for proper edge */
>> +               for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
>> +                       if (temp & (1 << bl)) {
>> +                               /* sampled "1" */
>> +                               if (direction[bl] == BACKWARD) {
>> +                                       /*
>> +                                        * keep looking for edge
>> +                                        * on this byte lane
>> +                                        */
>> +                                       all_edges_found = false;
>> +                                       delay[bl] -= 1;
>> +                                       if (rcvn) {
>> +                                               set_rcvn(channel, rank,
>> +                                                        bl, delay[bl]);
>> +                                       } else {
>> +                                               set_wdqs(channel, rank,
>> +                                                        bl, delay[bl]);
>> +                                       }
>> +                               }
>> +                       } else {
>> +                               /* sampled "0" */
>> +                               if (direction[bl] == FORWARD) {
>> +                                       /*
>> +                                        * keep looking for edge
>> +                                        * on this byte lane
>> +                                        */
>> +                                       all_edges_found = false;
>> +                                       delay[bl] += 1;
>> +                                       if (rcvn) {
>> +                                               set_rcvn(channel, rank,
>> +                                                        bl, delay[bl]);
>> +                                       } else {
>> +                                               set_wdqs(channel, rank,
>> +                                                        bl, delay[bl]);
>> +                                       }
>> +                               }
>> +                       }
>> +               }
>> +       } while (!all_edges_found);
>> +
>> +       /* restore DDR idle state */
>> +       dram_init_command(DCMD_PREA(rank));
>> +
>> +       DPF(D_TRN, "Delay %03X %03X %03X %03X\n",
>> +           delay[0], delay[1], delay[2], delay[3]);
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +/*
>> + * This function will return a 32 bit mask that will be used to
>> + * check for byte lane failures.
>> + */
>> +uint32_t byte_lane_mask(struct mrc_params *mrc_params)
>> +{
>> +       uint32_t j;
>> +       uint32_t ret_val = 0x00;
>> +
>> +       /*
>> +        * set ret_val based on NUM_BYTE_LANES such that you will check
>> +        * only BL0 in result
>> +        *
>> +        * (each bit in result represents a byte lane)
>> +        */
>> +       for (j = 0; j < MAX_BYTE_LANES; j += NUM_BYTE_LANES)
>> +               ret_val |= (1 << ((j / NUM_BYTE_LANES) * NUM_BYTE_LANES));
>> +
>> +       /*
>> +        * HSD#235037
>> +        * need to adjust the mask for 16-bit mode
>> +        */
>> +       if (mrc_params->channel_width == X16)
>> +               ret_val |= (ret_val << 2);
>> +
>> +       return ret_val;
>> +}
>> +
>> +/*
>> + * Check memory executing simple write/read/verify at the specified address.
>> + *
>> + * Bits in the result indicate failure on specific byte lane.
>> + */
>> +uint32_t check_rw_coarse(struct mrc_params *mrc_params, uint32_t address)
>> +{
>> +       uint32_t result = 0;
>> +       uint8_t first_run = 0;
>> +
>> +       if (mrc_params->hte_setup) {
>> +               mrc_params->hte_setup = 0;
>> +               first_run = 1;
>> +               select_hte();
>> +       }
>> +
>> +       result = hte_basic_write_read(mrc_params, address,
>> +                                     first_run, WRITE_TRAIN);
>
> reformat to 80cols

Fixed.

>> +
>> +       DPF(D_TRN, "check_rw_coarse result is %x\n", result);
>> +
>> +       return result;
>> +}
>> +
>> +/*
>> + * Check memory executing write/read/verify of many data patterns
>> + * at the specified address. Bits in the result indicate failure
>> + * on specific byte lane.
>> + */
>> +uint32_t check_bls_ex(struct mrc_params *mrc_params, uint32_t address)
>> +{
>> +       uint32_t result;
>> +       uint8_t first_run = 0;
>> +
>> +       if (mrc_params->hte_setup) {
>> +               mrc_params->hte_setup = 0;
>> +               first_run = 1;
>> +               select_hte();
>> +       }
>> +
>> +       result = hte_write_stress_bit_lanes(mrc_params, address, first_run);
>> +
>> +       DPF(D_TRN, "check_bls_ex result is %x\n", result);
>> +
>> +       return result;
>> +}
>> +
>> +/*
>> + * 32-bit LFSR with characteristic polynomial: X^32 + X^22 +X^2 + X^1
>> + *
>> + * The function takes pointer to previous 32 bit value and
>> + * modifies it to next value.
>> + */
>> +void lfsr32(uint32_t *lfsr_ptr)
>> +{
>> +       uint32_t bit;
>> +       uint32_t lfsr;
>> +       int i;
>> +
>> +       lfsr = *lfsr_ptr;
>> +
>> +       for (i = 0; i < 32; i++) {
>> +               bit = 1 ^ (lfsr & BIT0);
>> +               bit = bit ^ ((lfsr & BIT1) >> 1);
>> +               bit = bit ^ ((lfsr & BIT2) >> 2);
>> +               bit = bit ^ ((lfsr & BIT22) >> 22);
>> +
>> +               lfsr = ((lfsr >> 1) | (bit << 31));
>> +       }
>> +
>> +       *lfsr_ptr = lfsr;
>> +}
>> +
>> +/* Clear the pointers in a given byte lane in a given channel */
>> +void clear_pointers(void)
>> +{
>> +       uint8_t channel;
>> +       uint8_t bl;
>> +
>> +       ENTERFN();
>> +
>> +       for (channel = 0; channel < NUM_CHANNELS; channel++) {
>> +               for (bl = 0; bl < NUM_BYTE_LANES; bl++) {
>> +                       mrc_alt_write_mask(DDRPHY,
>> +                                          (B01PTRCTL1 +
>> +                                          (channel * DDRIODQ_CH_OFFSET) +
>> +                                          ((bl >> 1) * DDRIODQ_BL_OFFSET)),
>> +                                          ~BIT8, BIT8);
>> +
>> +                       mrc_alt_write_mask(DDRPHY,
>> +                                          (B01PTRCTL1 +
>> +                                          (channel * DDRIODQ_CH_OFFSET) +
>> +                                          ((bl >> 1) * DDRIODQ_BL_OFFSET)),
>> +                                          BIT8, BIT8);
>> +               }
>> +       }
>> +
>> +       LEAVEFN();
>> +}
>> +
>> +void print_timings(struct mrc_params *mrc_params)
>> +{
>> +       uint8_t algo;
>> +       uint8_t channel;
>> +       uint8_t rank;
>> +       uint8_t bl;
>> +       uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
>> +
>> +       DPF(D_INFO, "\n---------------------------");
>> +       DPF(D_INFO, "\nALGO[CH:RK] BL0 BL1 BL2 BL3");
>> +       DPF(D_INFO, "\n===========================");
>> +
>> +       for (algo = 0; algo < MAX_ALGOS; algo++) {
>> +               for (channel = 0; channel < NUM_CHANNELS; channel++) {
>> +                       if (mrc_params->channel_enables & (1 << channel)) {
>> +                               for (rank = 0; rank < NUM_RANKS; rank++) {
>
> Can we put this block in its own function to fix the over-indenting?

Fixed.

>> +                                       if (mrc_params->rank_enables &
>> +                                               (1 << rank)) {
>> +                                               switch (algo) {
>> +                                               case RCVN:
>> +                                                       DPF(D_INFO,
>> +                                                           "\nRCVN[%02d:%02d]",
>> +                                                           channel, rank);
>> +                                                       break;
>> +                                               case WDQS:
>> +                                                       DPF(D_INFO,
>> +                                                           "\nWDQS[%02d:%02d]",
>> +                                                           channel, rank);
>> +                                                       break;
>> +                                               case WDQX:
>> +                                                       DPF(D_INFO,
>> +                                                           "\nWDQx[%02d:%02d]",
>> +                                                           channel, rank);
>> +                                                       break;
>> +                                               case RDQS:
>> +                                                       DPF(D_INFO,
>> +                                                           "\nRDQS[%02d:%02d]",
>> +                                                           channel, rank);
>> +                                                       break;
>> +                                               case VREF:
>> +                                                       DPF(D_INFO,
>> +                                                           "\nVREF[%02d:%02d]",
>> +                                                           channel, rank);
>> +                                                       break;
>> +                                               case WCMD:
>> +                                                       DPF(D_INFO,
>> +                                                           "\nWCMD[%02d:%02d]",
>> +                                                           channel, rank);
>> +                                                       break;
>> +                                               case WCTL:
>> +                                                       DPF(D_INFO,
>> +                                                           "\nWCTL[%02d:%02d]",
>> +                                                           channel, rank);
>> +                                                       break;
>> +                                               case WCLK:
>> +                                                       DPF(D_INFO,
>> +                                                           "\nWCLK[%02d:%02d]",
>> +                                                           channel, rank);
>> +                                                       break;
>> +                                               default:
>> +                                                       break;
>> +                                               }
>> +
>> +                                               for (bl = 0;
>> +                                                    bl < (NUM_BYTE_LANES / bl_divisor);
>> +                                                    bl++) {
>> +                                                       switch (algo) {
>> +                                                       case RCVN:
>> +                                                               DPF(D_INFO,
>> +                                                                   " %03d",
>> +                                                                   get_rcvn(channel, rank, bl));
>> +                                                               break;
>> +                                                       case WDQS:
>> +                                                               DPF(D_INFO,
>> +                                                                   " %03d",
>> +                                                                   get_wdqs(channel, rank, bl));
>> +                                                               break;
>> +                                                       case WDQX:
>> +                                                               DPF(D_INFO,
>> +                                                                   " %03d",
>> +                                                                   get_wdq(channel, rank, bl));
>> +                                                               break;
>> +                                                       case RDQS:
>> +                                                               DPF(D_INFO,
>> +                                                                   " %03d",
>> +                                                                   get_rdqs(channel, rank, bl));
>> +                                                               break;
>> +                                                       case VREF:
>> +                                                               DPF(D_INFO,
>> +                                                                   " %03d",
>> +                                                                   get_vref(channel, bl));
>> +                                                               break;
>> +                                                       case WCMD:
>> +                                                               DPF(D_INFO,
>> +                                                                   " %03d",
>> +                                                                   get_wcmd(channel));
>> +                                                               break;
>> +                                                       case WCTL:
>> +                                                               DPF(D_INFO,
>> +                                                                   " %03d",
>> +                                                                   get_wctl(channel, rank));
>> +                                                               break;
>> +                                                       case WCLK:
>> +                                                               DPF(D_INFO,
>> +                                                                   " %03d",
>> +                                                                   get_wclk(channel, rank));
>> +                                                               break;
>> +                                                       default:
>> +                                                               break;
>> +                                                       }
>> +                                               }
>> +                                       }
>> +                               }
>> +                       }
>> +               }
>> +       }
>> +
>> +       DPF(D_INFO, "\n---------------------------");
>> +       DPF(D_INFO, "\n");
>> +}
>> diff --git a/arch/x86/cpu/quark/mrc_util.h b/arch/x86/cpu/quark/mrc_util.h
>> new file mode 100644
>> index 0000000..edbe219
>> --- /dev/null
>> +++ b/arch/x86/cpu/quark/mrc_util.h
>> @@ -0,0 +1,153 @@
>> +/*
>> + * Copyright (C) 2013, Intel Corporation
>> + * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
>> + *
>> + * Ported from Intel released Quark UEFI BIOS
>> + * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
>> + *
>> + * SPDX-License-Identifier:    Intel
>> + */
>> +
>> +#ifndef _MRC_UTIL_H_
>> +#define _MRC_UTIL_H_
>> +
>> +/* Turn on this macro to enable MRC debugging output */
>> +#undef  MRC_DEBUG
>> +
>> +/* MRC Debug Support */
>> +#define DPF            debug_cond
>> +
>> +/* debug print type */
>> +
>> +#ifdef MRC_DEBUG
>> +#define D_ERROR                0x0001
>> +#define D_INFO         0x0002
>> +#define D_REGRD                0x0004
>> +#define D_REGWR                0x0008
>> +#define D_FCALL                0x0010
>> +#define D_TRN          0x0020
>> +#define D_TIME         0x0040
>> +#else
>> +#define D_ERROR                0
>> +#define D_INFO         0
>> +#define D_REGRD                0
>> +#define D_REGWR                0
>> +#define D_FCALL                0
>> +#define D_TRN          0
>> +#define D_TIME         0
>> +#endif
>> +
>> +#define ENTERFN(...)   debug_cond(D_FCALL, "<%s>\n", __func__)
>> +#define LEAVEFN(...)   debug_cond(D_FCALL, "</%s>\n", __func__)
>> +#define REPORTFN(...)  debug_cond(D_FCALL, "<%s/>\n", __func__)
>> +
>> +/* Generic Register Bits */
>> +#define BIT0           0x00000001
>> +#define BIT1           0x00000002
>> +#define BIT2           0x00000004
>> +#define BIT3           0x00000008
>> +#define BIT4           0x00000010
>> +#define BIT5           0x00000020
>> +#define BIT6           0x00000040
>> +#define BIT7           0x00000080
>> +#define BIT8           0x00000100
>> +#define BIT9           0x00000200
>> +#define BIT10          0x00000400
>> +#define BIT11          0x00000800
>> +#define BIT12          0x00001000
>> +#define BIT13          0x00002000
>> +#define BIT14          0x00004000
>> +#define BIT15          0x00008000
>> +#define BIT16          0x00010000
>> +#define BIT17          0x00020000
>> +#define BIT18          0x00040000
>> +#define BIT19          0x00080000
>> +#define BIT20          0x00100000
>> +#define BIT21          0x00200000
>> +#define BIT22          0x00400000
>> +#define BIT23          0x00800000
>> +#define BIT24          0x01000000
>> +#define BIT25          0x02000000
>> +#define BIT26          0x04000000
>> +#define BIT27          0x08000000
>> +#define BIT28          0x10000000
>> +#define BIT29          0x20000000
>> +#define BIT30          0x40000000
>> +#define BIT31          0x80000000
>> +
>> +/* Message Bus Port */
>> +#define MEM_CTLR       0x01
>> +#define HOST_BRIDGE    0x03
>> +#define MEM_MGR                0x05
>> +#define HTE            0x11
>> +#define DDRPHY         0x12
>> +
>> +/* number of sample points */
>> +#define SAMPLE_CNT     3
>> +/* number of PIs to increment per sample */
>> +#define SAMPLE_DLY     26
>> +
>> +enum {
>> +       /* indicates to decrease delays when looking for edge */
>> +       BACKWARD,
>> +       /* indicates to increase delays when looking for edge */
>> +       FORWARD
>> +};
>> +
>> +enum {
>> +       RCVN,
>> +       WDQS,
>> +       WDQX,
>> +       RDQS,
>> +       VREF,
>> +       WCMD,
>> +       WCTL,
>> +       WCLK,
>> +       MAX_ALGOS,
>> +};
>> +
>> +void mrc_write_mask(u32 unit, u32 addr, u32 data, u32 mask);
>> +void mrc_alt_write_mask(u32 unit, u32 addr, u32 data, u32 mask);
>> +void mrc_post_code(uint8_t major, uint8_t minor);
>> +void delay_n(uint32_t ns);
>> +void delay_u(uint32_t ms);
>> +void select_mem_mgr(void);
>> +void select_hte(void);
>> +void dram_init_command(uint32_t data);
>> +void dram_wake_command(void);
>> +void training_message(uint8_t channel, uint8_t rank, uint8_t byte_lane);
>> +
>> +void set_rcvn(uint8_t channel, uint8_t rank,
>> +             uint8_t byte_lane, uint32_t pi_count);
>> +uint32_t get_rcvn(uint8_t channel, uint8_t rank, uint8_t byte_lane);
>> +void set_rdqs(uint8_t channel, uint8_t rank,
>> +             uint8_t byte_lane, uint32_t pi_count);
>> +uint32_t get_rdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane);
>> +void set_wdqs(uint8_t channel, uint8_t rank,
>> +             uint8_t byte_lane, uint32_t pi_count);
>> +uint32_t get_wdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane);
>> +void set_wdq(uint8_t channel, uint8_t rank,
>> +            uint8_t byte_lane, uint32_t pi_count);
>> +uint32_t get_wdq(uint8_t channel, uint8_t rank, uint8_t byte_lane);
>> +void set_wcmd(uint8_t channel, uint32_t pi_count);
>> +uint32_t get_wcmd(uint8_t channel);
>> +void set_wclk(uint8_t channel, uint8_t rank, uint32_t pi_count);
>> +uint32_t get_wclk(uint8_t channel, uint8_t rank);
>> +void set_wctl(uint8_t channel, uint8_t rank, uint32_t pi_count);
>> +uint32_t get_wctl(uint8_t channel, uint8_t rank);
>> +void set_vref(uint8_t channel, uint8_t byte_lane, uint32_t setting);
>> +uint32_t get_vref(uint8_t channel, uint8_t byte_lane);
>> +
>> +uint32_t get_addr(uint8_t channel, uint8_t rank);
>> +uint32_t sample_dqs(struct mrc_params *mrc_params, uint8_t channel,
>> +                   uint8_t rank, bool rcvn);
>> +void find_rising_edge(struct mrc_params *mrc_params, uint32_t delay[],
>> +                     uint8_t channel, uint8_t rank, bool rcvn);
>> +uint32_t byte_lane_mask(struct mrc_params *mrc_params);
>> +uint32_t check_rw_coarse(struct mrc_params *mrc_params, uint32_t address);
>> +uint32_t check_bls_ex(struct mrc_params *mrc_params, uint32_t address);
>> +void lfsr32(uint32_t *lfsr_ptr);
>> +void clear_pointers(void);
>> +void print_timings(struct mrc_params *mrc_params);
>
> If these are all truly exported, can we please put the function
> comments here in the header file?

No, they are only used internally by MRC.

>> +
>> +#endif /* _MRC_UTIL_H_ */
>> --
>> 1.8.2.1
>>
>
> Regards,
> Simon

Regards,
Bin
diff mbox

Patch

diff --git a/arch/x86/cpu/quark/hte.c b/arch/x86/cpu/quark/hte.c
new file mode 100644
index 0000000..d813c9c
--- /dev/null
+++ b/arch/x86/cpu/quark/hte.c
@@ -0,0 +1,398 @@ 
+/*
+ * Copyright (C) 2013, Intel Corporation
+ * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
+ *
+ * Ported from Intel released Quark UEFI BIOS
+ * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
+ *
+ * SPDX-License-Identifier:	Intel
+ */
+
+#include <common.h>
+#include <asm/arch/mrc.h>
+#include <asm/arch/msg_port.h>
+#include "mrc_util.h"
+#include "hte.h"
+
+/**
+ * This function enables HTE to detect all possible errors for
+ * the given training parameters (per-bit or full byte lane).
+ */
+static void hte_enable_all_errors(void)
+{
+	msg_port_write(HTE, 0x000200A2, 0xFFFFFFFF);
+	msg_port_write(HTE, 0x000200A3, 0x000000FF);
+	msg_port_write(HTE, 0x000200A4, 0x00000000);
+}
+
+/**
+ * This function goes and reads the HTE register in order to find any error
+ *
+ * @return: The errors detected in the HTE status register
+ */
+static u32 hte_check_errors(void)
+{
+	return msg_port_read(HTE, 0x000200A7);
+}
+
+/**
+ * This function waits until HTE finishes
+ */
+static void hte_wait_for_complete(void)
+{
+	u32 tmp;
+
+	ENTERFN();
+
+	do {} while ((msg_port_read(HTE, 0x00020012) & BIT30) != 0);
+
+	tmp = msg_port_read(HTE, 0x00020011);
+	tmp |= BIT9;
+	tmp &= ~(BIT12 | BIT13);
+	msg_port_write(HTE, 0x00020011, tmp);
+
+	LEAVEFN();
+}
+
+/**
+ * This function clears registers related with errors in the HTE
+ */
+static void hte_clear_error_regs(void)
+{
+	u32 tmp;
+
+	/*
+	 * Clear all HTE errors and enable error checking
+	 * for burst and chunk.
+	 */
+	tmp = msg_port_read(HTE, 0x000200A1);
+	tmp |= BIT8;
+	msg_port_write(HTE, 0x000200A1, tmp);
+}
+
+/**
+ * This function executes basic single cache line memory write/read/verify
+ * test using simple constant pattern, different for READ_RAIN and
+ * WRITE_TRAIN modes.
+ *
+ * See hte_basic_write_read() which is external visible wrapper.
+ *
+ * @mrc_params: host struture for all MRC global data
+ * @addr: memory adress being tested (must hit specific channel/rank)
+ * @first_run: if set then hte registers are configured, otherwise it is
+ *             assumed configuration is done and just re-run the test
+ * @mode: READ_TRAIN or WRITE_TRAIN (the difference is in the pattern)
+ *
+ * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
+ */
+static u16 hte_basic_data_cmp(struct mrc_params *mrc_params, u32 addr,
+			      u8 first_run, u8 mode)
+{
+	u32 pattern;
+	u32 offset;
+
+	if (first_run) {
+		msg_port_write(HTE, 0x00020020, 0x01B10021);
+		msg_port_write(HTE, 0x00020021, 0x06000000);
+		msg_port_write(HTE, 0x00020022, addr >> 6);
+		msg_port_write(HTE, 0x00020062, 0x00800015);
+		msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
+		msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
+		msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
+		msg_port_write(HTE, 0x00020061, 0x00030008);
+
+		if (mode == WRITE_TRAIN)
+			pattern = 0xC33C0000;
+		else /* READ_TRAIN */
+			pattern = 0xAA5555AA;
+
+		for (offset = 0x80; offset <= 0x8F; offset++)
+			msg_port_write(HTE, offset, pattern);
+	}
+
+	msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
+	msg_port_write(HTE, 0x00020011, 0x00011000);
+	msg_port_write(HTE, 0x00020011, 0x00011100);
+
+	hte_wait_for_complete();
+
+	/*
+	 * Return bits 15:8 of HTE_CH0_ERR_XSTAT to check for
+	 * any bytelane errors.
+	 */
+	return (hte_check_errors() >> 8) & 0xFF;
+}
+
+/**
+ * This function examines single cache line memory with write/read/verify
+ * test using multiple data patterns (victim-aggressor algorithm).
+ *
+ * See hte_write_stress_bit_lanes() which is external visible wrapper.
+ *
+ * @mrc_params: host struture for all MRC global data
+ * @addr: memory adress being tested (must hit specific channel/rank)
+ * @loop_cnt: number of test iterations
+ * @seed_victim: victim data pattern seed
+ * @seed_aggressor: aggressor data pattern seed
+ * @victim_bit: should be 0 as auto rotate feature is in use
+ * @first_run: if set then hte registers are configured, otherwise it is
+ *             assumed configuration is done and just re-run the test
+ *
+ * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
+ */
+static u16 hte_rw_data_cmp(struct mrc_params *mrc_params, u32 addr,
+			   u8 loop_cnt, u32 seed_victim, u32 seed_aggressor,
+			   u8 victim_bit, u8 first_run)
+{
+	u32 offset;
+	u32 tmp;
+
+	if (first_run) {
+		msg_port_write(HTE, 0x00020020, 0x00910024);
+		msg_port_write(HTE, 0x00020023, 0x00810024);
+		msg_port_write(HTE, 0x00020021, 0x06070000);
+		msg_port_write(HTE, 0x00020024, 0x06070000);
+		msg_port_write(HTE, 0x00020022, addr >> 6);
+		msg_port_write(HTE, 0x00020025, addr >> 6);
+		msg_port_write(HTE, 0x00020062, 0x0000002A);
+		msg_port_write(HTE, 0x00020063, seed_victim);
+		msg_port_write(HTE, 0x00020064, seed_aggressor);
+		msg_port_write(HTE, 0x00020065, seed_victim);
+
+		/*
+		 * Write the pattern buffers to select the victim bit
+		 *
+		 * Start with bit0
+		 */
+		for (offset = 0x80; offset <= 0x8F; offset++) {
+			if ((offset % 8) == victim_bit)
+				msg_port_write(HTE, offset, 0x55555555);
+			else
+				msg_port_write(HTE, offset, 0xCCCCCCCC);
+		}
+
+		msg_port_write(HTE, 0x00020061, 0x00000000);
+		msg_port_write(HTE, 0x00020066, 0x03440000);
+		msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
+	}
+
+	tmp = 0x10001000 | (loop_cnt << 16);
+	msg_port_write(HTE, 0x00020011, tmp);
+	msg_port_write(HTE, 0x00020011, tmp | BIT8);
+
+	hte_wait_for_complete();
+
+	/*
+	 * Return bits 15:8 of HTE_CH0_ERR_XSTAT to check for
+	 * any bytelane errors.
+	 */
+	return (hte_check_errors() >> 8) & 0xFF;
+}
+
+/**
+ * This function uses HW HTE engine to initialize or test all memory attached
+ * to a given DUNIT. If flag is MRC_MEM_INIT, this routine writes 0s to all
+ * memory locations to initialize ECC. If flag is MRC_MEM_TEST, this routine
+ * will send an 5AA55AA5 pattern to all memory locations on the RankMask and
+ * then read it back. Then it sends an A55AA55A pattern to all memory locations
+ * on the RankMask and reads it back.
+ *
+ * @mrc_params: host struture for all MRC global data
+ * @flag: MRC_MEM_INIT or MRC_MEM_TEST
+ *
+ * @return: errors register showing HTE failures. Also prints out which rank
+ *          failed the HTE test if failure occurs. For rank detection to work,
+ *          the address map must be left in its default state. If MRC changes
+ *          the address map, this function must be modified to change it back
+ *          to default at the beginning, then restore it at the end.
+ */
+u32 hte_mem_init(struct mrc_params *mrc_params, u8 flag)
+{
+	u32 offset;
+	int test_num;
+	int i;
+
+	/*
+	 * Clear out the error registers at the start of each memory
+	 * init or memory test run.
+	 */
+	hte_clear_error_regs();
+
+	msg_port_write(HTE, 0x00020062, 0x00000015);
+
+	for (offset = 0x80; offset <= 0x8F; offset++)
+		msg_port_write(HTE, offset, ((offset & 1) ? 0xA55A : 0x5AA5));
+
+	msg_port_write(HTE, 0x00020021, 0x00000000);
+	msg_port_write(HTE, 0x00020022, (mrc_params->mem_size >> 6) - 1);
+	msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
+	msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
+	msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
+	msg_port_write(HTE, 0x00020066, 0x03000000);
+
+	switch (flag) {
+	case MRC_MEM_INIT:
+		/*
+		 * Only 1 write pass through memory is needed
+		 * to initialize ECC
+		 */
+		test_num = 1;
+		break;
+	case MRC_MEM_TEST:
+		/* Write/read then write/read with inverted pattern */
+		test_num = 4;
+		break;
+	default:
+		DPF(D_INFO, "Unknown parameter for flag: %d\n", flag);
+		return 0xFFFFFFFF;
+	}
+
+	DPF(D_INFO, "hte_mem_init");
+
+	for (i = 0; i < test_num; i++) {
+		DPF(D_INFO, ".");
+
+		if (i == 0) {
+			msg_port_write(HTE, 0x00020061, 0x00000000);
+			msg_port_write(HTE, 0x00020020, 0x00110010);
+		} else if (i == 1) {
+			msg_port_write(HTE, 0x00020061, 0x00000000);
+			msg_port_write(HTE, 0x00020020, 0x00010010);
+		} else if (i == 2) {
+			msg_port_write(HTE, 0x00020061, 0x00010100);
+			msg_port_write(HTE, 0x00020020, 0x00110010);
+		} else {
+			msg_port_write(HTE, 0x00020061, 0x00010100);
+			msg_port_write(HTE, 0x00020020, 0x00010010);
+		}
+
+		msg_port_write(HTE, 0x00020011, 0x00111000);
+		msg_port_write(HTE, 0x00020011, 0x00111100);
+
+		hte_wait_for_complete();
+
+		/* If this is a READ pass, check for errors at the end */
+		if ((i % 2) == 1) {
+			/* Return immediately if error */
+			if (hte_check_errors())
+				break;
+		}
+	}
+
+	DPF(D_INFO, "done\n");
+
+	return hte_check_errors();
+}
+
+/**
+ * This function executes basic single cache line memory write/read/verify
+ * test using simple constant pattern, different for READ_RAIN and
+ * WRITE_TRAIN modes.
+ *
+ * @mrc_params: host struture for all MRC global data
+ * @addr: memory adress being tested (must hit specific channel/rank)
+ * @first_run: if set then hte registers are configured, otherwise it is
+ *             assumed configuration is done and just re-run the test
+ * @mode: READ_TRAIN or WRITE_TRAIN (the difference is in the pattern)
+ *
+ * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
+ */
+u16 hte_basic_write_read(struct mrc_params *mrc_params, u32 addr,
+			 u8 first_run, u8 mode)
+{
+	u16 errors;
+
+	ENTERFN();
+
+	/* Enable all error reporting in preparation for HTE test */
+	hte_enable_all_errors();
+	hte_clear_error_regs();
+
+	errors = hte_basic_data_cmp(mrc_params, addr, first_run, mode);
+
+	LEAVEFN();
+
+	return errors;
+}
+
+/**
+ * This function examines single cache line memory with write/read/verify
+ * test using multiple data patterns (victim-aggressor algorithm).
+ *
+ * @mrc_params: host struture for all MRC global data
+ * @addr: memory adress being tested (must hit specific channel/rank)
+ * @first_run: if set then hte registers are configured, otherwise it is
+ *             assumed configuration is done and just re-run the test
+ *
+ * @return: byte lane failure on each bit (for Quark only bit0 and bit1)
+ */
+u16 hte_write_stress_bit_lanes(struct mrc_params *mrc_params,
+			       u32 addr, u8 first_run)
+{
+	u16 errors;
+	u8 victim_bit = 0;
+
+	ENTERFN();
+
+	/* Enable all error reporting in preparation for HTE test */
+	hte_enable_all_errors();
+	hte_clear_error_regs();
+
+	/*
+	 * Loop through each bit in the bytelane.
+	 *
+	 * Each pass creates a victim bit while keeping all other bits the same
+	 * as aggressors. AVN HTE adds an auto-rotate feature which allows us
+	 * to program the entire victim/aggressor sequence in 1 step.
+	 *
+	 * The victim bit rotates on each pass so no need to have software
+	 * implement a victim bit loop like on VLV.
+	 */
+	errors = hte_rw_data_cmp(mrc_params, addr, HTE_LOOP_CNT,
+				 HTE_LFSR_VICTIM_SEED, HTE_LFSR_AGRESSOR_SEED,
+				 victim_bit, first_run);
+
+	LEAVEFN();
+
+	return errors;
+}
+
+/**
+ * This function execute basic single cache line memory write or read.
+ * This is just for receive enable / fine write levelling purpose.
+ *
+ * @addr: memory adress being tested (must hit specific channel/rank)
+ * @first_run: if set then hte registers are configured, otherwise it is
+ *             assumed configuration is done and just re-run the test
+ * @is_write: when non-zero memory write operation executed, otherwise read
+ */
+void hte_mem_op(u32 addr, u8 first_run, u8 is_write)
+{
+	u32 offset;
+	u32 tmp;
+
+	hte_enable_all_errors();
+	hte_clear_error_regs();
+
+	if (first_run) {
+		tmp = is_write ? 0x01110021 : 0x01010021;
+		msg_port_write(HTE, 0x00020020, tmp);
+
+		msg_port_write(HTE, 0x00020021, 0x06000000);
+		msg_port_write(HTE, 0x00020022, addr >> 6);
+		msg_port_write(HTE, 0x00020062, 0x00800015);
+		msg_port_write(HTE, 0x00020063, 0xAAAAAAAA);
+		msg_port_write(HTE, 0x00020064, 0xCCCCCCCC);
+		msg_port_write(HTE, 0x00020065, 0xF0F0F0F0);
+		msg_port_write(HTE, 0x00020061, 0x00030008);
+
+		for (offset = 0x80; offset <= 0x8F; offset++)
+			msg_port_write(HTE, offset, 0xC33C0000);
+	}
+
+	msg_port_write(HTE, 0x000200A1, 0xFFFF1000);
+	msg_port_write(HTE, 0x00020011, 0x00011000);
+	msg_port_write(HTE, 0x00020011, 0x00011100);
+
+	hte_wait_for_complete();
+}
diff --git a/arch/x86/cpu/quark/hte.h b/arch/x86/cpu/quark/hte.h
new file mode 100644
index 0000000..3a173ea
--- /dev/null
+++ b/arch/x86/cpu/quark/hte.h
@@ -0,0 +1,44 @@ 
+/*
+ * Copyright (C) 2013, Intel Corporation
+ * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
+ *
+ * Ported from Intel released Quark UEFI BIOS
+ * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
+ *
+ * SPDX-License-Identifier:	Intel
+ */
+
+#ifndef _HTE_H_
+#define _HTE_H_
+
+enum {
+	MRC_MEM_INIT,
+	MRC_MEM_TEST
+};
+
+enum {
+	READ_TRAIN,
+	WRITE_TRAIN
+};
+
+/*
+ * EXP_LOOP_CNT field of HTE_CMD_CTL
+ *
+ * This CANNOT be less than 4!
+ */
+#define HTE_LOOP_CNT		5
+
+/* random seed for victim */
+#define HTE_LFSR_VICTIM_SEED	0xF294BA21
+
+/* random seed for aggressor */
+#define HTE_LFSR_AGRESSOR_SEED	0xEBA7492D
+
+u32 hte_mem_init(struct mrc_params *mrc_params, u8 flag);
+u16 hte_basic_write_read(struct mrc_params *mrc_params, u32 addr,
+			 u8 first_run, u8 mode);
+u16 hte_write_stress_bit_lanes(struct mrc_params *mrc_params,
+			       u32 addr, u8 first_run);
+void hte_mem_op(u32 addr, u8 first_run, u8 is_write);
+
+#endif /* _HTE_H_ */
diff --git a/arch/x86/cpu/quark/mrc_util.c b/arch/x86/cpu/quark/mrc_util.c
new file mode 100644
index 0000000..1ae42d6
--- /dev/null
+++ b/arch/x86/cpu/quark/mrc_util.c
@@ -0,0 +1,1499 @@ 
+/*
+ * Copyright (C) 2013, Intel Corporation
+ * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
+ *
+ * Ported from Intel released Quark UEFI BIOS
+ * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
+ *
+ * SPDX-License-Identifier:	Intel
+ */
+
+#include <common.h>
+#include <asm/arch/device.h>
+#include <asm/arch/mrc.h>
+#include <asm/arch/msg_port.h>
+#include "mrc_util.h"
+#include "hte.h"
+#include "smc.h"
+
+static const uint8_t vref_codes[64] = {
+	/* lowest to highest */
+	0x3F, 0x3E, 0x3D, 0x3C, 0x3B, 0x3A, 0x39, 0x38,
+	0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30,
+	0x2F, 0x2E, 0x2D, 0x2C, 0x2B, 0x2A, 0x29, 0x28,
+	0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20,
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+	0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
+};
+
+void mrc_write_mask(u32 unit, u32 addr, u32 data, u32 mask)
+{
+	msg_port_write(unit, addr,
+		       (msg_port_read(unit, addr) & ~(mask)) |
+		       ((data) & (mask)));
+}
+
+void mrc_alt_write_mask(u32 unit, u32 addr, u32 data, u32 mask)
+{
+	msg_port_alt_write(unit, addr,
+			   (msg_port_alt_read(unit, addr) & ~(mask)) |
+			   ((data) & (mask)));
+}
+
+void mrc_post_code(uint8_t major, uint8_t minor)
+{
+	/* send message to UART */
+	DPF(D_INFO, "POST: 0x%01x%02x\n", major, minor);
+
+	/* error check */
+	if (major == 0xEE)
+		hang();
+}
+
+/* Delay number of nanoseconds */
+void delay_n(uint32_t ns)
+{
+	/* 1000 MHz clock has 1ns period --> no conversion required */
+	uint64_t final_tsc = rdtsc();
+	final_tsc += ((get_tbclk_mhz() * ns) / 1000);
+
+	while (rdtsc() < final_tsc)
+		;
+}
+
+/* Delay number of microseconds */
+void delay_u(uint32_t ms)
+{
+	/* 64-bit math is not an option, just use loops */
+	while (ms--)
+		delay_n(1000);
+}
+
+/* Select Memory Manager as the source for PRI interface */
+void select_mem_mgr(void)
+{
+	u32 dco;
+
+	ENTERFN();
+
+	dco = msg_port_read(MEM_CTLR, DCO);
+	dco &= ~BIT28;
+	msg_port_write(MEM_CTLR, DCO, dco);
+
+	LEAVEFN();
+}
+
+/* Select HTE as the source for PRI interface */
+void select_hte(void)
+{
+	u32 dco;
+
+	ENTERFN();
+
+	dco = msg_port_read(MEM_CTLR, DCO);
+	dco |= BIT28;
+	msg_port_write(MEM_CTLR, DCO, dco);
+
+	LEAVEFN();
+}
+
+/*
+ * Send DRAM command
+ * data should be formated using DCMD_Xxxx macro or emrsXCommand structure
+ */
+void dram_init_command(uint32_t data)
+{
+	pci_write_config_dword(QUARK_HOST_BRIDGE, MSG_DATA_REG, data);
+	pci_write_config_dword(QUARK_HOST_BRIDGE, MSG_CTRL_EXT_REG, 0);
+	msg_port_setup(MSG_OP_DRAM_INIT, MEM_CTLR, 0);
+
+	DPF(D_REGWR, "WR32 %03X %08X %08X\n", MEM_CTLR, 0, data);
+}
+
+/* Send DRAM wake command using special MCU side-band WAKE opcode */
+void dram_wake_command(void)
+{
+	ENTERFN();
+
+	msg_port_setup(MSG_OP_DRAM_WAKE, MEM_CTLR, 0);
+
+	LEAVEFN();
+}
+
+void training_message(uint8_t channel, uint8_t rank, uint8_t byte_lane)
+{
+	/* send message to UART */
+	DPF(D_INFO, "CH%01X RK%01X BL%01X\n", channel, rank, byte_lane);
+}
+
+/*
+ * This function will program the RCVEN delays
+ *
+ * (currently doesn't comprehend rank)
+ */
+void set_rcvn(uint8_t channel, uint8_t rank,
+	      uint8_t byte_lane, uint32_t pi_count)
+{
+	uint32_t reg;
+	uint32_t msk;
+	uint32_t temp;
+
+	ENTERFN();
+
+	DPF(D_TRN, "Rcvn ch%d rnk%d ln%d : pi=%03X\n",
+	    channel, rank, byte_lane, pi_count);
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * BL0 -> B01PTRCTL0[11:08] (0x0-0xF)
+	 * BL1 -> B01PTRCTL0[23:20] (0x0-0xF)
+	 */
+	reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	msk = (byte_lane & BIT0) ? (BIT23 | BIT22 | BIT21 | BIT20) :
+		(BIT11 | BIT10 | BIT9 | BIT8);
+	temp = (byte_lane & BIT0) ? ((pi_count / HALF_CLK) << 20) :
+		((pi_count / HALF_CLK) << 8);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* Adjust PI_COUNT */
+	pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * BL0 -> B0DLLPICODER0[29:24] (0x00-0x3F)
+	 * BL1 -> B1DLLPICODER0[29:24] (0x00-0x3F)
+	 */
+	reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
+	reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET));
+	msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
+	temp = pi_count << 24;
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/*
+	 * DEADBAND
+	 * BL0/1 -> B01DBCTL1[08/11] (+1 select)
+	 * BL0/1 -> B01DBCTL1[02/05] (enable)
+	 */
+	reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	msk = 0x00;
+	temp = 0x00;
+
+	/* enable */
+	msk |= (byte_lane & BIT0) ? (BIT5) : (BIT2);
+	if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
+		temp |= msk;
+
+	/* select */
+	msk |= (byte_lane & BIT0) ? (BIT11) : (BIT8);
+	if (pi_count < EARLY_DB)
+		temp |= msk;
+
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* error check */
+	if (pi_count > 0x3F) {
+		training_message(channel, rank, byte_lane);
+		mrc_post_code(0xEE, 0xE0);
+	}
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return the current RCVEN delay on the given
+ * channel, rank, byte_lane as an absolute PI count.
+ *
+ * (currently doesn't comprehend rank)
+ */
+uint32_t get_rcvn(uint8_t channel, uint8_t rank, uint8_t byte_lane)
+{
+	uint32_t reg;
+	uint32_t temp;
+	uint32_t pi_count;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * BL0 -> B01PTRCTL0[11:08] (0x0-0xF)
+	 * BL1 -> B01PTRCTL0[23:20] (0x0-0xF)
+	 */
+	reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= (byte_lane & BIT0) ? (20) : (8);
+	temp &= 0xF;
+
+	/* Adjust PI_COUNT */
+	pi_count = temp * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * BL0 -> B0DLLPICODER0[29:24] (0x00-0x3F)
+	 * BL1 -> B1DLLPICODER0[29:24] (0x00-0x3F)
+	 */
+	reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
+	reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET));
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= 24;
+	temp &= 0x3F;
+
+	/* Adjust PI_COUNT */
+	pi_count += temp;
+
+	LEAVEFN();
+
+	return pi_count;
+}
+
+/*
+ * This function will program the RDQS delays based on an absolute
+ * amount of PIs.
+ *
+ * (currently doesn't comprehend rank)
+ */
+void set_rdqs(uint8_t channel, uint8_t rank,
+	      uint8_t byte_lane, uint32_t pi_count)
+{
+	uint32_t reg;
+	uint32_t msk;
+	uint32_t temp;
+
+	ENTERFN();
+	DPF(D_TRN, "Rdqs ch%d rnk%d ln%d : pi=%03X\n",
+	    channel, rank, byte_lane, pi_count);
+
+	/*
+	 * PI (1/128 MCLK)
+	 * BL0 -> B0RXDQSPICODE[06:00] (0x00-0x47)
+	 * BL1 -> B1RXDQSPICODE[06:00] (0x00-0x47)
+	 */
+	reg = (byte_lane & BIT0) ? (B1RXDQSPICODE) : (B0RXDQSPICODE);
+	reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET));
+	msk = (BIT6 | BIT5 | BIT4 | BIT3 | BIT2 | BIT1 | BIT0);
+	temp = pi_count << 0;
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* error check (shouldn't go above 0x3F) */
+	if (pi_count > 0x47) {
+		training_message(channel, rank, byte_lane);
+		mrc_post_code(0xEE, 0xE1);
+	}
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return the current RDQS delay on the given
+ * channel, rank, byte_lane as an absolute PI count.
+ *
+ * (currently doesn't comprehend rank)
+ */
+uint32_t get_rdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane)
+{
+	uint32_t reg;
+	uint32_t temp;
+	uint32_t pi_count;
+
+	ENTERFN();
+
+	/*
+	 * PI (1/128 MCLK)
+	 * BL0 -> B0RXDQSPICODE[06:00] (0x00-0x47)
+	 * BL1 -> B1RXDQSPICODE[06:00] (0x00-0x47)
+	 */
+	reg = (byte_lane & BIT0) ? (B1RXDQSPICODE) : (B0RXDQSPICODE);
+	reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET));
+	temp = msg_port_alt_read(DDRPHY, reg);
+
+	/* Adjust PI_COUNT */
+	pi_count = temp & 0x7F;
+
+	LEAVEFN();
+
+	return pi_count;
+}
+
+/*
+ * This function will program the WDQS delays based on an absolute
+ * amount of PIs.
+ *
+ * (currently doesn't comprehend rank)
+ */
+void set_wdqs(uint8_t channel, uint8_t rank,
+	      uint8_t byte_lane, uint32_t pi_count)
+{
+	uint32_t reg;
+	uint32_t msk;
+	uint32_t temp;
+
+	ENTERFN();
+
+	DPF(D_TRN, "Wdqs ch%d rnk%d ln%d : pi=%03X\n",
+	    channel, rank, byte_lane, pi_count);
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * BL0 -> B01PTRCTL0[07:04] (0x0-0xF)
+	 * BL1 -> B01PTRCTL0[19:16] (0x0-0xF)
+	 */
+	reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	msk = (byte_lane & BIT0) ? (BIT19 | BIT18 | BIT17 | BIT16) :
+		(BIT7 | BIT6 | BIT5 | BIT4);
+	temp = pi_count / HALF_CLK;
+	temp <<= (byte_lane & BIT0) ? (16) : (4);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* Adjust PI_COUNT */
+	pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * BL0 -> B0DLLPICODER0[21:16] (0x00-0x3F)
+	 * BL1 -> B1DLLPICODER0[21:16] (0x00-0x3F)
+	 */
+	reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
+	reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET));
+	msk = (BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16);
+	temp = pi_count << 16;
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/*
+	 * DEADBAND
+	 * BL0/1 -> B01DBCTL1[07/10] (+1 select)
+	 * BL0/1 -> B01DBCTL1[01/04] (enable)
+	 */
+	reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	msk = 0x00;
+	temp = 0x00;
+
+	/* enable */
+	msk |= (byte_lane & BIT0) ? (BIT4) : (BIT1);
+	if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
+		temp |= msk;
+
+	/* select */
+	msk |= (byte_lane & BIT0) ? (BIT10) : (BIT7);
+	if (pi_count < EARLY_DB)
+		temp |= msk;
+
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* error check */
+	if (pi_count > 0x3F) {
+		training_message(channel, rank, byte_lane);
+		mrc_post_code(0xEE, 0xE2);
+	}
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return the amount of WDQS delay on the given
+ * channel, rank, byte_lane as an absolute PI count.
+ *
+ * (currently doesn't comprehend rank)
+ */
+uint32_t get_wdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane)
+{
+	uint32_t reg;
+	uint32_t temp;
+	uint32_t pi_count;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * BL0 -> B01PTRCTL0[07:04] (0x0-0xF)
+	 * BL1 -> B01PTRCTL0[19:16] (0x0-0xF)
+	 */
+	reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= (byte_lane & BIT0) ? (16) : (4);
+	temp &= 0xF;
+
+	/* Adjust PI_COUNT */
+	pi_count = (temp * HALF_CLK);
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * BL0 -> B0DLLPICODER0[21:16] (0x00-0x3F)
+	 * BL1 -> B1DLLPICODER0[21:16] (0x00-0x3F)
+	 */
+	reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
+	reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET));
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= 16;
+	temp &= 0x3F;
+
+	/* Adjust PI_COUNT */
+	pi_count += temp;
+
+	LEAVEFN();
+
+	return pi_count;
+}
+
+/*
+ * This function will program the WDQ delays based on an absolute
+ * number of PIs.
+ *
+ * (currently doesn't comprehend rank)
+ */
+void set_wdq(uint8_t channel, uint8_t rank,
+	     uint8_t byte_lane, uint32_t pi_count)
+{
+	uint32_t reg;
+	uint32_t msk;
+	uint32_t temp;
+
+	ENTERFN();
+
+	DPF(D_TRN, "Wdq ch%d rnk%d ln%d : pi=%03X\n",
+	    channel, rank, byte_lane, pi_count);
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * BL0 -> B01PTRCTL0[03:00] (0x0-0xF)
+	 * BL1 -> B01PTRCTL0[15:12] (0x0-0xF)
+	 */
+	reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	msk = (byte_lane & BIT0) ? (BIT15 | BIT14 | BIT13 | BIT12) :
+		(BIT3 | BIT2 | BIT1 | BIT0);
+	temp = pi_count / HALF_CLK;
+	temp <<= (byte_lane & BIT0) ? (12) : (0);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* Adjust PI_COUNT */
+	pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * BL0 -> B0DLLPICODER0[13:08] (0x00-0x3F)
+	 * BL1 -> B1DLLPICODER0[13:08] (0x00-0x3F)
+	 */
+	reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
+	reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET));
+	msk = (BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);
+	temp = pi_count << 8;
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/*
+	 * DEADBAND
+	 * BL0/1 -> B01DBCTL1[06/09] (+1 select)
+	 * BL0/1 -> B01DBCTL1[00/03] (enable)
+	 */
+	reg = B01DBCTL1 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	msk = 0x00;
+	temp = 0x00;
+
+	/* enable */
+	msk |= (byte_lane & BIT0) ? (BIT3) : (BIT0);
+	if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
+		temp |= msk;
+
+	/* select */
+	msk |= (byte_lane & BIT0) ? (BIT9) : (BIT6);
+	if (pi_count < EARLY_DB)
+		temp |= msk;
+
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* error check */
+	if (pi_count > 0x3F) {
+		training_message(channel, rank, byte_lane);
+		mrc_post_code(0xEE, 0xE3);
+	}
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return the amount of WDQ delay on the given
+ * channel, rank, byte_lane as an absolute PI count.
+ *
+ * (currently doesn't comprehend rank)
+ */
+uint32_t get_wdq(uint8_t channel, uint8_t rank, uint8_t byte_lane)
+{
+	uint32_t reg;
+	uint32_t temp;
+	uint32_t pi_count;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * BL0 -> B01PTRCTL0[03:00] (0x0-0xF)
+	 * BL1 -> B01PTRCTL0[15:12] (0x0-0xF)
+	 */
+	reg = B01PTRCTL0 + ((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= (byte_lane & BIT0) ? (12) : (0);
+	temp &= 0xF;
+
+	/* Adjust PI_COUNT */
+	pi_count = (temp * HALF_CLK);
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * BL0 -> B0DLLPICODER0[13:08] (0x00-0x3F)
+	 * BL1 -> B1DLLPICODER0[13:08] (0x00-0x3F)
+	 */
+	reg = (byte_lane & BIT0) ? (B1DLLPICODER0) : (B0DLLPICODER0);
+	reg += (((byte_lane >> 1) * DDRIODQ_BL_OFFSET) +
+		(channel * DDRIODQ_CH_OFFSET));
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= 8;
+	temp &= 0x3F;
+
+	/* Adjust PI_COUNT */
+	pi_count += temp;
+
+	LEAVEFN();
+
+	return pi_count;
+}
+
+/*
+ * This function will program the WCMD delays based on an absolute
+ * number of PIs.
+ */
+void set_wcmd(uint8_t channel, uint32_t pi_count)
+{
+	uint32_t reg;
+	uint32_t msk;
+	uint32_t temp;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * CMDPTRREG[11:08] (0x0-0xF)
+	 */
+	reg = CMDPTRREG + (channel * DDRIOCCC_CH_OFFSET);
+	msk = (BIT11 | BIT10 | BIT9 | BIT8);
+	temp = pi_count / HALF_CLK;
+	temp <<= 8;
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* Adjust PI_COUNT */
+	pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * CMDDLLPICODER0[29:24] -> CMDSLICE R3 (unused)
+	 * CMDDLLPICODER0[21:16] -> CMDSLICE L3 (unused)
+	 * CMDDLLPICODER0[13:08] -> CMDSLICE R2 (unused)
+	 * CMDDLLPICODER0[05:00] -> CMDSLICE L2 (unused)
+	 * CMDDLLPICODER1[29:24] -> CMDSLICE R1 (unused)
+	 * CMDDLLPICODER1[21:16] -> CMDSLICE L1 (0x00-0x3F)
+	 * CMDDLLPICODER1[13:08] -> CMDSLICE R0 (unused)
+	 * CMDDLLPICODER1[05:00] -> CMDSLICE L0 (unused)
+	 */
+	reg = CMDDLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
+
+	msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24 |
+		BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16 |
+		BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8 |
+		BIT5 | BIT4 | BIT3 | BIT2 | BIT1 | BIT0);
+
+	temp = (pi_count << 24) | (pi_count << 16) |
+		(pi_count << 8) | (pi_count << 0);
+
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+	reg = CMDDLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);	/* PO */
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/*
+	 * DEADBAND
+	 * CMDCFGREG0[17] (+1 select)
+	 * CMDCFGREG0[16] (enable)
+	 */
+	reg = CMDCFGREG0 + (channel * DDRIOCCC_CH_OFFSET);
+	msk = 0x00;
+	temp = 0x00;
+
+	/* enable */
+	msk |= BIT16;
+	if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
+		temp |= msk;
+
+	/* select */
+	msk |= BIT17;
+	if (pi_count < EARLY_DB)
+		temp |= msk;
+
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* error check */
+	if (pi_count > 0x3F)
+		mrc_post_code(0xEE, 0xE4);
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return the amount of WCMD delay on the given
+ * channel as an absolute PI count.
+ */
+uint32_t get_wcmd(uint8_t channel)
+{
+	uint32_t reg;
+	uint32_t temp;
+	uint32_t pi_count;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * CMDPTRREG[11:08] (0x0-0xF)
+	 */
+	reg = CMDPTRREG + (channel * DDRIOCCC_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= 8;
+	temp &= 0xF;
+
+	/* Adjust PI_COUNT */
+	pi_count = temp * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * CMDDLLPICODER0[29:24] -> CMDSLICE R3 (unused)
+	 * CMDDLLPICODER0[21:16] -> CMDSLICE L3 (unused)
+	 * CMDDLLPICODER0[13:08] -> CMDSLICE R2 (unused)
+	 * CMDDLLPICODER0[05:00] -> CMDSLICE L2 (unused)
+	 * CMDDLLPICODER1[29:24] -> CMDSLICE R1 (unused)
+	 * CMDDLLPICODER1[21:16] -> CMDSLICE L1 (0x00-0x3F)
+	 * CMDDLLPICODER1[13:08] -> CMDSLICE R0 (unused)
+	 * CMDDLLPICODER1[05:00] -> CMDSLICE L0 (unused)
+	 */
+	reg = CMDDLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= 16;
+	temp &= 0x3F;
+
+	/* Adjust PI_COUNT */
+	pi_count += temp;
+
+	LEAVEFN();
+
+	return pi_count;
+}
+
+/*
+ * This function will program the WCLK delays based on an absolute
+ * number of PIs.
+ */
+void set_wclk(uint8_t channel, uint8_t rank, uint32_t pi_count)
+{
+	uint32_t reg;
+	uint32_t msk;
+	uint32_t temp;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * CCPTRREG[15:12] -> CLK1 (0x0-0xF)
+	 * CCPTRREG[11:08] -> CLK0 (0x0-0xF)
+	 */
+	reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
+	msk = (BIT15 | BIT14 | BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);
+	temp = ((pi_count / HALF_CLK) << 12) | ((pi_count / HALF_CLK) << 8);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* Adjust PI_COUNT */
+	pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * ECCB1DLLPICODER0[13:08] -> CLK0 (0x00-0x3F)
+	 * ECCB1DLLPICODER0[21:16] -> CLK1 (0x00-0x3F)
+	 */
+	reg = (rank) ? (ECCB1DLLPICODER0) : (ECCB1DLLPICODER0);
+	reg += (channel * DDRIOCCC_CH_OFFSET);
+	msk = (BIT21 | BIT20 | BIT19 | BIT18 | BIT17 | BIT16 |
+		BIT13 | BIT12 | BIT11 | BIT10 | BIT9 | BIT8);
+	temp = (pi_count << 16) | (pi_count << 8);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+	reg = (rank) ? (ECCB1DLLPICODER1) : (ECCB1DLLPICODER1);
+	reg += (channel * DDRIOCCC_CH_OFFSET);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+	reg = (rank) ? (ECCB1DLLPICODER2) : (ECCB1DLLPICODER2);
+	reg += (channel * DDRIOCCC_CH_OFFSET);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+	reg = (rank) ? (ECCB1DLLPICODER3) : (ECCB1DLLPICODER3);
+	reg += (channel * DDRIOCCC_CH_OFFSET);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/*
+	 * DEADBAND
+	 * CCCFGREG1[11:08] (+1 select)
+	 * CCCFGREG1[03:00] (enable)
+	 */
+	reg = CCCFGREG1 + (channel * DDRIOCCC_CH_OFFSET);
+	msk = 0x00;
+	temp = 0x00;
+
+	/* enable */
+	msk |= (BIT3 | BIT2 | BIT1 | BIT0);
+	if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
+		temp |= msk;
+
+	/* select */
+	msk |= (BIT11 | BIT10 | BIT9 | BIT8);
+	if (pi_count < EARLY_DB)
+		temp |= msk;
+
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* error check */
+	if (pi_count > 0x3F)
+		mrc_post_code(0xEE, 0xE5);
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return the amout of WCLK delay on the given
+ * channel, rank as an absolute PI count.
+ */
+uint32_t get_wclk(uint8_t channel, uint8_t rank)
+{
+	uint32_t reg;
+	uint32_t temp;
+	uint32_t pi_count;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * CCPTRREG[15:12] -> CLK1 (0x0-0xF)
+	 * CCPTRREG[11:08] -> CLK0 (0x0-0xF)
+	 */
+	reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= (rank) ? (12) : (8);
+	temp &= 0xF;
+
+	/* Adjust PI_COUNT */
+	pi_count = temp * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * ECCB1DLLPICODER0[13:08] -> CLK0 (0x00-0x3F)
+	 * ECCB1DLLPICODER0[21:16] -> CLK1 (0x00-0x3F)
+	 */
+	reg = (rank) ? (ECCB1DLLPICODER0) : (ECCB1DLLPICODER0);
+	reg += (channel * DDRIOCCC_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= (rank) ? (16) : (8);
+	temp &= 0x3F;
+
+	pi_count += temp;
+
+	LEAVEFN();
+
+	return pi_count;
+}
+
+/*
+ * This function will program the WCTL delays based on an absolute
+ * number of PIs.
+ *
+ * (currently doesn't comprehend rank)
+ */
+void set_wctl(uint8_t channel, uint8_t rank, uint32_t pi_count)
+{
+	uint32_t reg;
+	uint32_t msk;
+	uint32_t temp;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * CCPTRREG[31:28] (0x0-0xF)
+	 * CCPTRREG[27:24] (0x0-0xF)
+	 */
+	reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
+	msk = (BIT31 | BIT30 | BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
+	temp = ((pi_count / HALF_CLK) << 28) | ((pi_count / HALF_CLK) << 24);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* Adjust PI_COUNT */
+	pi_count -= ((pi_count / HALF_CLK) & 0xF) * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
+	 * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
+	 */
+	reg = ECCB1DLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);
+	msk = (BIT29 | BIT28 | BIT27 | BIT26 | BIT25 | BIT24);
+	temp = (pi_count << 24);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+	reg = ECCB1DLLPICODER1 + (channel * DDRIOCCC_CH_OFFSET);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+	reg = ECCB1DLLPICODER2 + (channel * DDRIOCCC_CH_OFFSET);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+	reg = ECCB1DLLPICODER3 + (channel * DDRIOCCC_CH_OFFSET);
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/*
+	 * DEADBAND
+	 * CCCFGREG1[13:12] (+1 select)
+	 * CCCFGREG1[05:04] (enable)
+	 */
+	reg = CCCFGREG1 + (channel * DDRIOCCC_CH_OFFSET);
+	msk = 0x00;
+	temp = 0x00;
+
+	/* enable */
+	msk |= (BIT5 | BIT4);
+	if ((pi_count < EARLY_DB) || (pi_count > LATE_DB))
+		temp |= msk;
+
+	/* select */
+	msk |= (BIT13 | BIT12);
+	if (pi_count < EARLY_DB)
+		temp |= msk;
+
+	mrc_alt_write_mask(DDRPHY, reg, temp, msk);
+
+	/* error check */
+	if (pi_count > 0x3F)
+		mrc_post_code(0xEE, 0xE6);
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return the amount of WCTL delay on the given
+ * channel, rank as an absolute PI count.
+ *
+ * (currently doesn't comprehend rank)
+ */
+uint32_t get_wctl(uint8_t channel, uint8_t rank)
+{
+	uint32_t reg;
+	uint32_t temp;
+	uint32_t pi_count;
+
+	ENTERFN();
+
+	/*
+	 * RDPTR (1/2 MCLK, 64 PIs)
+	 * CCPTRREG[31:28] (0x0-0xF)
+	 * CCPTRREG[27:24] (0x0-0xF)
+	 */
+	reg = CCPTRREG + (channel * DDRIOCCC_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= 24;
+	temp &= 0xF;
+
+	/* Adjust PI_COUNT */
+	pi_count = temp * HALF_CLK;
+
+	/*
+	 * PI (1/64 MCLK, 1 PIs)
+	 * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
+	 * ECCB1DLLPICODER?[29:24] (0x00-0x3F)
+	 */
+	reg = ECCB1DLLPICODER0 + (channel * DDRIOCCC_CH_OFFSET);
+	temp = msg_port_alt_read(DDRPHY, reg);
+	temp >>= 24;
+	temp &= 0x3F;
+
+	/* Adjust PI_COUNT */
+	pi_count += temp;
+
+	LEAVEFN();
+
+	return pi_count;
+}
+
+/*
+ * This function will program the internal Vref setting in a given
+ * byte lane in a given channel.
+ */
+void set_vref(uint8_t channel, uint8_t byte_lane, uint32_t setting)
+{
+	uint32_t reg = (byte_lane & 0x1) ? (B1VREFCTL) : (B0VREFCTL);
+
+	ENTERFN();
+
+	DPF(D_TRN, "Vref ch%d ln%d : val=%03X\n",
+	    channel, byte_lane, setting);
+
+	mrc_alt_write_mask(DDRPHY, (reg + (channel * DDRIODQ_CH_OFFSET) +
+		((byte_lane >> 1) * DDRIODQ_BL_OFFSET)),
+		(vref_codes[setting] << 2),
+		(BIT7 | BIT6 | BIT5 | BIT4 | BIT3 | BIT2));
+
+	/*
+	 * need to wait ~300ns for Vref to settle
+	 * (check that this is necessary)
+	 */
+	delay_n(300);
+
+	/* ??? may need to clear pointers ??? */
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return the internal Vref setting for the given
+ * channel, byte_lane.
+ */
+uint32_t get_vref(uint8_t channel, uint8_t byte_lane)
+{
+	uint8_t j;
+	uint32_t ret_val = sizeof(vref_codes) / 2;
+	uint32_t reg = (byte_lane & 0x1) ? (B1VREFCTL) : (B0VREFCTL);
+	uint32_t temp;
+
+	ENTERFN();
+
+	temp = msg_port_alt_read(DDRPHY, (reg + (channel * DDRIODQ_CH_OFFSET) +
+		((byte_lane >> 1) * DDRIODQ_BL_OFFSET)));
+	temp >>= 2;
+	temp &= 0x3F;
+
+	for (j = 0; j < sizeof(vref_codes); j++) {
+		if (vref_codes[j] == temp) {
+			ret_val = j;
+			break;
+		}
+	}
+
+	LEAVEFN();
+
+	return ret_val;
+}
+
+/*
+ * This function will return a 32 bit address in the desired
+ * channel and rank.
+ */
+uint32_t get_addr(uint8_t channel, uint8_t rank)
+{
+	uint32_t offset = 0x02000000;	/* 32MB */
+
+	/* Begin product specific code */
+	if (channel > 0) {
+		DPF(D_ERROR, "ILLEGAL CHANNEL\n");
+		DEAD_LOOP();
+	}
+
+	if (rank > 1) {
+		DPF(D_ERROR, "ILLEGAL RANK\n");
+		DEAD_LOOP();
+	}
+
+	/* use 256MB lowest density as per DRP == 0x0003 */
+	offset += rank * (256 * 1024 * 1024);
+
+	return offset;
+}
+
+/*
+ * This function will sample the DQTRAINSTS registers in the given
+ * channel/rank SAMPLE_SIZE times looking for a valid '0' or '1'.
+ *
+ * It will return an encoded DWORD in which each bit corresponds to
+ * the sampled value on the byte lane.
+ */
+uint32_t sample_dqs(struct mrc_params *mrc_params, uint8_t channel,
+		    uint8_t rank, bool rcvn)
+{
+	uint8_t j;	/* just a counter */
+	uint8_t bl;	/* which BL in the module (always 2 per module) */
+	uint8_t bl_grp;	/* which BL module */
+	/* byte lane divisor */
+	uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
+	uint32_t msk[2];	/* BLx in module */
+	/* DQTRAINSTS register contents for each sample */
+	uint32_t sampled_val[SAMPLE_SIZE];
+	uint32_t num_0s;	/* tracks the number of '0' samples */
+	uint32_t num_1s;	/* tracks the number of '1' samples */
+	uint32_t ret_val = 0x00;	/* assume all '0' samples */
+	uint32_t address = get_addr(channel, rank);
+
+	/* initialise msk[] */
+	msk[0] = (rcvn) ? (BIT1) : (BIT9);	/* BL0 */
+	msk[1] = (rcvn) ? (BIT0) : (BIT8);	/* BL1 */
+
+	/* cycle through each byte lane group */
+	for (bl_grp = 0; bl_grp < (NUM_BYTE_LANES / bl_divisor) / 2; bl_grp++) {
+		/* take SAMPLE_SIZE samples */
+		for (j = 0; j < SAMPLE_SIZE; j++) {
+			hte_mem_op(address, mrc_params->first_run,
+				   rcvn ? 0 : 1);
+			mrc_params->first_run = 0;
+
+			/*
+			 * record the contents of the proper
+			 * DQTRAINSTS register
+			 */
+			sampled_val[j] = msg_port_alt_read(DDRPHY,
+				(DQTRAINSTS +
+				(bl_grp * DDRIODQ_BL_OFFSET) +
+				(channel * DDRIODQ_CH_OFFSET)));
+		}
+
+		/*
+		 * look for a majority value (SAMPLE_SIZE / 2) + 1
+		 * on the byte lane and set that value in the corresponding
+		 * ret_val bit
+		 */
+		for (bl = 0; bl < 2; bl++) {
+			num_0s = 0x00;	/* reset '0' tracker for byte lane */
+			num_1s = 0x00;	/* reset '1' tracker for byte lane */
+			for (j = 0; j < SAMPLE_SIZE; j++) {
+				if (sampled_val[j] & msk[bl])
+					num_1s++;
+				else
+					num_0s++;
+			}
+		if (num_1s > num_0s)
+			ret_val |= (1 << (bl + (bl_grp * 2)));
+		}
+	}
+
+	/*
+	 * "ret_val.0" contains the status of BL0
+	 * "ret_val.1" contains the status of BL1
+	 * "ret_val.2" contains the status of BL2
+	 * etc.
+	 */
+	return ret_val;
+}
+
+/* This function will find the rising edge transition on RCVN or WDQS */
+void find_rising_edge(struct mrc_params *mrc_params, uint32_t delay[],
+		      uint8_t channel, uint8_t rank, bool rcvn)
+{
+	bool all_edges_found;	/* determines stop condition */
+	bool direction[NUM_BYTE_LANES];	/* direction indicator */
+	uint8_t sample;	/* sample counter */
+	uint8_t bl;	/* byte lane counter */
+	/* byte lane divisor */
+	uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
+	uint32_t sample_result[SAMPLE_CNT];	/* results of sample_dqs() */
+	uint32_t temp;
+	uint32_t transition_pattern;
+
+	ENTERFN();
+
+	/* select hte and request initial configuration */
+	select_hte();
+	mrc_params->first_run = 1;
+
+	/* Take 3 sample points (T1,T2,T3) to obtain a transition pattern */
+	for (sample = 0; sample < SAMPLE_CNT; sample++) {
+		/* program the desired delays for sample */
+		for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
+			/* increase sample delay by 26 PI (0.2 CLK) */
+			if (rcvn) {
+				set_rcvn(channel, rank, bl,
+					 delay[bl] + (sample * SAMPLE_DLY));
+			} else {
+				set_wdqs(channel, rank, bl,
+					 delay[bl] + (sample * SAMPLE_DLY));
+			}
+		}
+
+		/* take samples (Tsample_i) */
+		sample_result[sample] = sample_dqs(mrc_params,
+			channel, rank, rcvn);
+
+		DPF(D_TRN,
+		    "Find rising edge %s ch%d rnk%d: #%d dly=%d dqs=%02X\n",
+		    (rcvn ? "RCVN" : "WDQS"), channel, rank, sample,
+		    sample * SAMPLE_DLY, sample_result[sample]);
+	}
+
+	/*
+	 * This pattern will help determine where we landed and ultimately
+	 * how to place RCVEN/WDQS.
+	 */
+	for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
+		/* build transition_pattern (MSB is 1st sample) */
+		transition_pattern = 0;
+		for (sample = 0; sample < SAMPLE_CNT; sample++) {
+			transition_pattern |=
+				((sample_result[sample] & (1 << bl)) >> bl) <<
+				(SAMPLE_CNT - 1 - sample);
+		}
+
+		DPF(D_TRN, "=== transition pattern %d\n", transition_pattern);
+
+		/*
+		 * set up to look for rising edge based on
+		 * transition_pattern
+		 */
+		switch (transition_pattern) {
+		case 0:	/* sampled 0->0->0 */
+			/* move forward from T3 looking for 0->1 */
+			delay[bl] += 2 * SAMPLE_DLY;
+			direction[bl] = FORWARD;
+			break;
+		case 1:	/* sampled 0->0->1 */
+		case 5:	/* sampled 1->0->1 (bad duty cycle) *HSD#237503* */
+			/* move forward from T2 looking for 0->1 */
+			delay[bl] += 1 * SAMPLE_DLY;
+			direction[bl] = FORWARD;
+			break;
+		case 2:	/* sampled 0->1->0 (bad duty cycle) *HSD#237503* */
+		case 3:	/* sampled 0->1->1 */
+			/* move forward from T1 looking for 0->1 */
+			delay[bl] += 0 * SAMPLE_DLY;
+			direction[bl] = FORWARD;
+			break;
+		case 4:	/* sampled 1->0->0 (assumes BL8, HSD#234975) */
+			/* move forward from T3 looking for 0->1 */
+			delay[bl] += 2 * SAMPLE_DLY;
+			direction[bl] = FORWARD;
+			break;
+		case 6:	/* sampled 1->1->0 */
+		case 7:	/* sampled 1->1->1 */
+			/* move backward from T1 looking for 1->0 */
+			delay[bl] += 0 * SAMPLE_DLY;
+			direction[bl] = BACKWARD;
+			break;
+		default:
+			mrc_post_code(0xEE, 0xEE);
+			break;
+		}
+
+		/* program delays */
+		if (rcvn)
+			set_rcvn(channel, rank, bl, delay[bl]);
+		else
+			set_wdqs(channel, rank, bl, delay[bl]);
+	}
+
+	/*
+	 * Based on the observed transition pattern on the byte lane,
+	 * begin looking for a rising edge with single PI granularity.
+	 */
+	do {
+		all_edges_found = true;	/* assume all byte lanes passed */
+		/* take a sample */
+		temp = sample_dqs(mrc_params, channel, rank, rcvn);
+		/* check all each byte lane for proper edge */
+		for (bl = 0; bl < (NUM_BYTE_LANES / bl_divisor); bl++) {
+			if (temp & (1 << bl)) {
+				/* sampled "1" */
+				if (direction[bl] == BACKWARD) {
+					/*
+					 * keep looking for edge
+					 * on this byte lane
+					 */
+					all_edges_found = false;
+					delay[bl] -= 1;
+					if (rcvn) {
+						set_rcvn(channel, rank,
+							 bl, delay[bl]);
+					} else {
+						set_wdqs(channel, rank,
+							 bl, delay[bl]);
+					}
+				}
+			} else {
+				/* sampled "0" */
+				if (direction[bl] == FORWARD) {
+					/*
+					 * keep looking for edge
+					 * on this byte lane
+					 */
+					all_edges_found = false;
+					delay[bl] += 1;
+					if (rcvn) {
+						set_rcvn(channel, rank,
+							 bl, delay[bl]);
+					} else {
+						set_wdqs(channel, rank,
+							 bl, delay[bl]);
+					}
+				}
+			}
+		}
+	} while (!all_edges_found);
+
+	/* restore DDR idle state */
+	dram_init_command(DCMD_PREA(rank));
+
+	DPF(D_TRN, "Delay %03X %03X %03X %03X\n",
+	    delay[0], delay[1], delay[2], delay[3]);
+
+	LEAVEFN();
+}
+
+/*
+ * This function will return a 32 bit mask that will be used to
+ * check for byte lane failures.
+ */
+uint32_t byte_lane_mask(struct mrc_params *mrc_params)
+{
+	uint32_t j;
+	uint32_t ret_val = 0x00;
+
+	/*
+	 * set ret_val based on NUM_BYTE_LANES such that you will check
+	 * only BL0 in result
+	 *
+	 * (each bit in result represents a byte lane)
+	 */
+	for (j = 0; j < MAX_BYTE_LANES; j += NUM_BYTE_LANES)
+		ret_val |= (1 << ((j / NUM_BYTE_LANES) * NUM_BYTE_LANES));
+
+	/*
+	 * HSD#235037
+	 * need to adjust the mask for 16-bit mode
+	 */
+	if (mrc_params->channel_width == X16)
+		ret_val |= (ret_val << 2);
+
+	return ret_val;
+}
+
+/*
+ * Check memory executing simple write/read/verify at the specified address.
+ *
+ * Bits in the result indicate failure on specific byte lane.
+ */
+uint32_t check_rw_coarse(struct mrc_params *mrc_params, uint32_t address)
+{
+	uint32_t result = 0;
+	uint8_t first_run = 0;
+
+	if (mrc_params->hte_setup) {
+		mrc_params->hte_setup = 0;
+		first_run = 1;
+		select_hte();
+	}
+
+	result = hte_basic_write_read(mrc_params, address,
+				      first_run, WRITE_TRAIN);
+
+	DPF(D_TRN, "check_rw_coarse result is %x\n", result);
+
+	return result;
+}
+
+/*
+ * Check memory executing write/read/verify of many data patterns
+ * at the specified address. Bits in the result indicate failure
+ * on specific byte lane.
+ */
+uint32_t check_bls_ex(struct mrc_params *mrc_params, uint32_t address)
+{
+	uint32_t result;
+	uint8_t first_run = 0;
+
+	if (mrc_params->hte_setup) {
+		mrc_params->hte_setup = 0;
+		first_run = 1;
+		select_hte();
+	}
+
+	result = hte_write_stress_bit_lanes(mrc_params, address, first_run);
+
+	DPF(D_TRN, "check_bls_ex result is %x\n", result);
+
+	return result;
+}
+
+/*
+ * 32-bit LFSR with characteristic polynomial: X^32 + X^22 +X^2 + X^1
+ *
+ * The function takes pointer to previous 32 bit value and
+ * modifies it to next value.
+ */
+void lfsr32(uint32_t *lfsr_ptr)
+{
+	uint32_t bit;
+	uint32_t lfsr;
+	int i;
+
+	lfsr = *lfsr_ptr;
+
+	for (i = 0; i < 32; i++) {
+		bit = 1 ^ (lfsr & BIT0);
+		bit = bit ^ ((lfsr & BIT1) >> 1);
+		bit = bit ^ ((lfsr & BIT2) >> 2);
+		bit = bit ^ ((lfsr & BIT22) >> 22);
+
+		lfsr = ((lfsr >> 1) | (bit << 31));
+	}
+
+	*lfsr_ptr = lfsr;
+}
+
+/* Clear the pointers in a given byte lane in a given channel */
+void clear_pointers(void)
+{
+	uint8_t channel;
+	uint8_t bl;
+
+	ENTERFN();
+
+	for (channel = 0; channel < NUM_CHANNELS; channel++) {
+		for (bl = 0; bl < NUM_BYTE_LANES; bl++) {
+			mrc_alt_write_mask(DDRPHY,
+					   (B01PTRCTL1 +
+					   (channel * DDRIODQ_CH_OFFSET) +
+					   ((bl >> 1) * DDRIODQ_BL_OFFSET)),
+					   ~BIT8, BIT8);
+
+			mrc_alt_write_mask(DDRPHY,
+					   (B01PTRCTL1 +
+					   (channel * DDRIODQ_CH_OFFSET) +
+					   ((bl >> 1) * DDRIODQ_BL_OFFSET)),
+					   BIT8, BIT8);
+		}
+	}
+
+	LEAVEFN();
+}
+
+void print_timings(struct mrc_params *mrc_params)
+{
+	uint8_t algo;
+	uint8_t channel;
+	uint8_t rank;
+	uint8_t bl;
+	uint8_t bl_divisor = (mrc_params->channel_width == X16) ? 2 : 1;
+
+	DPF(D_INFO, "\n---------------------------");
+	DPF(D_INFO, "\nALGO[CH:RK] BL0 BL1 BL2 BL3");
+	DPF(D_INFO, "\n===========================");
+
+	for (algo = 0; algo < MAX_ALGOS; algo++) {
+		for (channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (mrc_params->channel_enables & (1 << channel)) {
+				for (rank = 0; rank < NUM_RANKS; rank++) {
+					if (mrc_params->rank_enables &
+						(1 << rank)) {
+						switch (algo) {
+						case RCVN:
+							DPF(D_INFO,
+							    "\nRCVN[%02d:%02d]",
+							    channel, rank);
+							break;
+						case WDQS:
+							DPF(D_INFO,
+							    "\nWDQS[%02d:%02d]",
+							    channel, rank);
+							break;
+						case WDQX:
+							DPF(D_INFO,
+							    "\nWDQx[%02d:%02d]",
+							    channel, rank);
+							break;
+						case RDQS:
+							DPF(D_INFO,
+							    "\nRDQS[%02d:%02d]",
+							    channel, rank);
+							break;
+						case VREF:
+							DPF(D_INFO,
+							    "\nVREF[%02d:%02d]",
+							    channel, rank);
+							break;
+						case WCMD:
+							DPF(D_INFO,
+							    "\nWCMD[%02d:%02d]",
+							    channel, rank);
+							break;
+						case WCTL:
+							DPF(D_INFO,
+							    "\nWCTL[%02d:%02d]",
+							    channel, rank);
+							break;
+						case WCLK:
+							DPF(D_INFO,
+							    "\nWCLK[%02d:%02d]",
+							    channel, rank);
+							break;
+						default:
+							break;
+						}
+
+						for (bl = 0;
+						     bl < (NUM_BYTE_LANES / bl_divisor);
+						     bl++) {
+							switch (algo) {
+							case RCVN:
+								DPF(D_INFO,
+								    " %03d",
+								    get_rcvn(channel, rank, bl));
+								break;
+							case WDQS:
+								DPF(D_INFO,
+								    " %03d",
+								    get_wdqs(channel, rank, bl));
+								break;
+							case WDQX:
+								DPF(D_INFO,
+								    " %03d",
+								    get_wdq(channel, rank, bl));
+								break;
+							case RDQS:
+								DPF(D_INFO,
+								    " %03d",
+								    get_rdqs(channel, rank, bl));
+								break;
+							case VREF:
+								DPF(D_INFO,
+								    " %03d",
+								    get_vref(channel, bl));
+								break;
+							case WCMD:
+								DPF(D_INFO,
+								    " %03d",
+								    get_wcmd(channel));
+								break;
+							case WCTL:
+								DPF(D_INFO,
+								    " %03d",
+								    get_wctl(channel, rank));
+								break;
+							case WCLK:
+								DPF(D_INFO,
+								    " %03d",
+								    get_wclk(channel, rank));
+								break;
+							default:
+								break;
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	DPF(D_INFO, "\n---------------------------");
+	DPF(D_INFO, "\n");
+}
diff --git a/arch/x86/cpu/quark/mrc_util.h b/arch/x86/cpu/quark/mrc_util.h
new file mode 100644
index 0000000..edbe219
--- /dev/null
+++ b/arch/x86/cpu/quark/mrc_util.h
@@ -0,0 +1,153 @@ 
+/*
+ * Copyright (C) 2013, Intel Corporation
+ * Copyright (C) 2015, Bin Meng <bmeng.cn@gmail.com>
+ *
+ * Ported from Intel released Quark UEFI BIOS
+ * QuarkSocPkg/QuarkNorthCluster/MemoryInit/Pei/
+ *
+ * SPDX-License-Identifier:	Intel
+ */
+
+#ifndef _MRC_UTIL_H_
+#define _MRC_UTIL_H_
+
+/* Turn on this macro to enable MRC debugging output */
+#undef  MRC_DEBUG
+
+/* MRC Debug Support */
+#define DPF		debug_cond
+
+/* debug print type */
+
+#ifdef MRC_DEBUG
+#define D_ERROR		0x0001
+#define D_INFO		0x0002
+#define D_REGRD		0x0004
+#define D_REGWR		0x0008
+#define D_FCALL		0x0010
+#define D_TRN		0x0020
+#define D_TIME		0x0040
+#else
+#define D_ERROR		0
+#define D_INFO		0
+#define D_REGRD		0
+#define D_REGWR		0
+#define D_FCALL		0
+#define D_TRN		0
+#define D_TIME		0
+#endif
+
+#define ENTERFN(...)	debug_cond(D_FCALL, "<%s>\n", __func__)
+#define LEAVEFN(...)	debug_cond(D_FCALL, "</%s>\n", __func__)
+#define REPORTFN(...)	debug_cond(D_FCALL, "<%s/>\n", __func__)
+
+/* Generic Register Bits */
+#define BIT0		0x00000001
+#define BIT1		0x00000002
+#define BIT2		0x00000004
+#define BIT3		0x00000008
+#define BIT4		0x00000010
+#define BIT5		0x00000020
+#define BIT6		0x00000040
+#define BIT7		0x00000080
+#define BIT8		0x00000100
+#define BIT9		0x00000200
+#define BIT10		0x00000400
+#define BIT11		0x00000800
+#define BIT12		0x00001000
+#define BIT13		0x00002000
+#define BIT14		0x00004000
+#define BIT15		0x00008000
+#define BIT16		0x00010000
+#define BIT17		0x00020000
+#define BIT18		0x00040000
+#define BIT19		0x00080000
+#define BIT20		0x00100000
+#define BIT21		0x00200000
+#define BIT22		0x00400000
+#define BIT23		0x00800000
+#define BIT24		0x01000000
+#define BIT25		0x02000000
+#define BIT26		0x04000000
+#define BIT27		0x08000000
+#define BIT28		0x10000000
+#define BIT29		0x20000000
+#define BIT30		0x40000000
+#define BIT31		0x80000000
+
+/* Message Bus Port */
+#define MEM_CTLR	0x01
+#define HOST_BRIDGE	0x03
+#define MEM_MGR		0x05
+#define HTE		0x11
+#define DDRPHY		0x12
+
+/* number of sample points */
+#define SAMPLE_CNT	3
+/* number of PIs to increment per sample */
+#define SAMPLE_DLY	26
+
+enum {
+	/* indicates to decrease delays when looking for edge */
+	BACKWARD,
+	/* indicates to increase delays when looking for edge */
+	FORWARD
+};
+
+enum {
+	RCVN,
+	WDQS,
+	WDQX,
+	RDQS,
+	VREF,
+	WCMD,
+	WCTL,
+	WCLK,
+	MAX_ALGOS,
+};
+
+void mrc_write_mask(u32 unit, u32 addr, u32 data, u32 mask);
+void mrc_alt_write_mask(u32 unit, u32 addr, u32 data, u32 mask);
+void mrc_post_code(uint8_t major, uint8_t minor);
+void delay_n(uint32_t ns);
+void delay_u(uint32_t ms);
+void select_mem_mgr(void);
+void select_hte(void);
+void dram_init_command(uint32_t data);
+void dram_wake_command(void);
+void training_message(uint8_t channel, uint8_t rank, uint8_t byte_lane);
+
+void set_rcvn(uint8_t channel, uint8_t rank,
+	      uint8_t byte_lane, uint32_t pi_count);
+uint32_t get_rcvn(uint8_t channel, uint8_t rank, uint8_t byte_lane);
+void set_rdqs(uint8_t channel, uint8_t rank,
+	      uint8_t byte_lane, uint32_t pi_count);
+uint32_t get_rdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane);
+void set_wdqs(uint8_t channel, uint8_t rank,
+	      uint8_t byte_lane, uint32_t pi_count);
+uint32_t get_wdqs(uint8_t channel, uint8_t rank, uint8_t byte_lane);
+void set_wdq(uint8_t channel, uint8_t rank,
+	     uint8_t byte_lane, uint32_t pi_count);
+uint32_t get_wdq(uint8_t channel, uint8_t rank, uint8_t byte_lane);
+void set_wcmd(uint8_t channel, uint32_t pi_count);
+uint32_t get_wcmd(uint8_t channel);
+void set_wclk(uint8_t channel, uint8_t rank, uint32_t pi_count);
+uint32_t get_wclk(uint8_t channel, uint8_t rank);
+void set_wctl(uint8_t channel, uint8_t rank, uint32_t pi_count);
+uint32_t get_wctl(uint8_t channel, uint8_t rank);
+void set_vref(uint8_t channel, uint8_t byte_lane, uint32_t setting);
+uint32_t get_vref(uint8_t channel, uint8_t byte_lane);
+
+uint32_t get_addr(uint8_t channel, uint8_t rank);
+uint32_t sample_dqs(struct mrc_params *mrc_params, uint8_t channel,
+		    uint8_t rank, bool rcvn);
+void find_rising_edge(struct mrc_params *mrc_params, uint32_t delay[],
+		      uint8_t channel, uint8_t rank, bool rcvn);
+uint32_t byte_lane_mask(struct mrc_params *mrc_params);
+uint32_t check_rw_coarse(struct mrc_params *mrc_params, uint32_t address);
+uint32_t check_bls_ex(struct mrc_params *mrc_params, uint32_t address);
+void lfsr32(uint32_t *lfsr_ptr);
+void clear_pointers(void);
+void print_timings(struct mrc_params *mrc_params);
+
+#endif /* _MRC_UTIL_H_ */