diff mbox series

[v5,27/33] riscv: Fix race conditions when initializing IPI

Message ID 20200228210552.615672-28-seanga2@gmail.com
State Superseded
Delegated to: Andes
Headers show
Series riscv: Add Sipeed Maix support | expand

Commit Message

Sean Anderson Feb. 28, 2020, 9:05 p.m. UTC
The IPI code could have race conditions in several places.
* Several harts could race on the value of gd->arch->clint/plic
* Non-boot harts could race with the main hart on the DM subsystem In
  addition, if an IPI was pending when U-Boot started, it would cause the
  IPI handler to jump to address 0.

To address these problems, a new function riscv_init_ipi is introduced. It
is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
functions may be called. Access is synchronized by gd->arch->ipi_ready.

Signed-off-by: Sean Anderson <seanga2@gmail.com>
---

Changes in v5:
- New

 arch/riscv/cpu/cpu.c                 |  9 ++++
 arch/riscv/include/asm/global_data.h |  1 +
 arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
 arch/riscv/lib/andes_plic.c          | 34 +++++---------
 arch/riscv/lib/sbi_ipi.c             |  5 ++
 arch/riscv/lib/sifive_clint.c        | 33 +++++---------
 arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
 7 files changed, 101 insertions(+), 92 deletions(-)

Comments

Rick Chen March 2, 2020, 9:08 a.m. UTC | #1
Hi Sean

> The IPI code could have race conditions in several places.
> * Several harts could race on the value of gd->arch->clint/plic
> * Non-boot harts could race with the main hart on the DM subsystem In
>   addition, if an IPI was pending when U-Boot started, it would cause the
>   IPI handler to jump to address 0.
>
> To address these problems, a new function riscv_init_ipi is introduced. It
> is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
> functions may be called. Access is synchronized by gd->arch->ipi_ready.
>
> Signed-off-by: Sean Anderson <seanga2@gmail.com>
> ---
>
> Changes in v5:
> - New
>
>  arch/riscv/cpu/cpu.c                 |  9 ++++
>  arch/riscv/include/asm/global_data.h |  1 +
>  arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
>  arch/riscv/lib/andes_plic.c          | 34 +++++---------
>  arch/riscv/lib/sbi_ipi.c             |  5 ++
>  arch/riscv/lib/sifive_clint.c        | 33 +++++---------
>  arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
>  7 files changed, 101 insertions(+), 92 deletions(-)
>
> diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
> index e457f6acbf..a971ec8694 100644
> --- a/arch/riscv/cpu/cpu.c
> +++ b/arch/riscv/cpu/cpu.c
> @@ -96,6 +96,15 @@ int arch_cpu_init_dm(void)
>                         csr_write(CSR_SATP, 0);
>         }
>
> +#ifdef CONFIG_SMP
> +       ret = riscv_init_ipi();
> +       if (ret)
> +               return ret;
> +
> +       /* Atomically set a flag enabling IPI handling */
> +       WRITE_ONCE(gd->arch.ipi_ready, 1);

I think it shall not have race condition here.
Can you explain more detail why there will occur race condition ?

Hi Lukas

Do you have any comments ?

Thanks
Rick

> +#endif
> +
>         return 0;
>  }
>
> diff --git a/arch/riscv/include/asm/global_data.h b/arch/riscv/include/asm/global_data.h
> index 7276d9763f..b24f8fd2a7 100644
> --- a/arch/riscv/include/asm/global_data.h
> +++ b/arch/riscv/include/asm/global_data.h
> @@ -28,6 +28,7 @@ struct arch_global_data {
>  #endif
>  #ifdef CONFIG_SMP
>         struct ipi_data ipi[CONFIG_NR_CPUS];
> +       long ipi_ready; /* Set after riscv_init_ipi is called */
>  #endif
>  #ifndef CONFIG_XIP
>         ulong available_harts;
> diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
> index 74de92ed13..1b428856b2 100644
> --- a/arch/riscv/include/asm/smp.h
> +++ b/arch/riscv/include/asm/smp.h
> @@ -51,4 +51,47 @@ void handle_ipi(ulong hart);
>   */
>  int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait);
>
> +/**
> + * riscv_init_ipi() - Initialize inter-process interrupt (IPI) driver
> + *
> + * Platform code must provide this function. This function is called once after
> + * the cpu driver is initialized. No other riscv_*_ipi() calls will be made
> + * before this function is called.
> + *
> + * @return 0 if OK, -ve on error
> + */
> +int riscv_init_ipi(void);
> +
> +/**
> + * riscv_send_ipi() - Send inter-processor interrupt (IPI)
> + *
> + * Platform code must provide this function.
> + *
> + * @hart: Hart ID of receiving hart
> + * @return 0 if OK, -ve on error
> + */
> +int riscv_send_ipi(int hart);
> +
> +/**
> + * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
> + *
> + * Platform code must provide this function.
> + *
> + * @hart: Hart ID of hart to be cleared
> + * @return 0 if OK, -ve on error
> + */
> +int riscv_clear_ipi(int hart);
> +
> +/**
> + * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
> + *
> + * Platform code must provide this function.
> + *
> + * @hart: Hart ID of hart to be checked
> + * @pending: Pointer to variable with result of the check,
> + *           1 if IPI is pending, 0 otherwise
> + * @return 0 if OK, -ve on error
> + */
> +int riscv_get_ipi(int hart, int *pending);
> +
>  #endif
> diff --git a/arch/riscv/lib/andes_plic.c b/arch/riscv/lib/andes_plic.c
> index 20529ab3eb..8484f76386 100644
> --- a/arch/riscv/lib/andes_plic.c
> +++ b/arch/riscv/lib/andes_plic.c
> @@ -30,20 +30,6 @@
>  #define SEND_IPI_TO_HART(hart)  (0x80 >> (hart))
>
>  DECLARE_GLOBAL_DATA_PTR;
> -static int init_plic(void);
> -
> -#define PLIC_BASE_GET(void)                                            \
> -       do {                                                            \
> -               long *ret;                                              \
> -                                                                       \
> -               if (!gd->arch.plic) {                                   \
> -                       ret = syscon_get_first_range(RISCV_SYSCON_PLIC); \
> -                       if (IS_ERR(ret))                                \
> -                               return PTR_ERR(ret);                    \
> -                       gd->arch.plic = ret;                            \
> -                       init_plic();                                    \
> -               }                                                       \
> -       } while (0)
>
>  static int enable_ipi(int hart)
>  {
> @@ -93,13 +79,21 @@ static int init_plic(void)
>         return -ENODEV;
>  }
>
> +int riscv_init_ipi(void)
> +{
> +       int ret = syscon_get_first_range(RISCV_SYSCON_PLIC);
> +
> +       if (IS_ERR(ret))
> +               return PTR_ERR(ret);
> +       gd->arch.plic = ret;
> +
> +       return init_plic();
> +}
> +
>  int riscv_send_ipi(int hart)
>  {
> -       unsigned int ipi;
> +       unsigned int ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
>
> -       PLIC_BASE_GET();
> -
> -       ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
>         writel(ipi, (void __iomem *)PENDING_REG(gd->arch.plic,
>                                 gd->arch.boot_hart));
>
> @@ -110,8 +104,6 @@ int riscv_clear_ipi(int hart)
>  {
>         u32 source_id;
>
> -       PLIC_BASE_GET();
> -
>         source_id = readl((void __iomem *)CLAIM_REG(gd->arch.plic, hart));
>         writel(source_id, (void __iomem *)CLAIM_REG(gd->arch.plic, hart));
>
> @@ -120,8 +112,6 @@ int riscv_clear_ipi(int hart)
>
>  int riscv_get_ipi(int hart, int *pending)
>  {
> -       PLIC_BASE_GET();
> -
>         *pending = readl((void __iomem *)PENDING_REG(gd->arch.plic,
>                                                      gd->arch.boot_hart));
>         *pending = !!(*pending & SEND_IPI_TO_HART(hart));
> diff --git a/arch/riscv/lib/sbi_ipi.c b/arch/riscv/lib/sbi_ipi.c
> index 9a698ce74e..310d1bd2a4 100644
> --- a/arch/riscv/lib/sbi_ipi.c
> +++ b/arch/riscv/lib/sbi_ipi.c
> @@ -7,6 +7,11 @@
>  #include <common.h>
>  #include <asm/sbi.h>
>
> +int riscv_init_ipi(void)
> +{
> +       return 0;
> +}
> +
>  int riscv_send_ipi(int hart)
>  {
>         ulong mask;
> diff --git a/arch/riscv/lib/sifive_clint.c b/arch/riscv/lib/sifive_clint.c
> index 5e0d25720b..62c1b2b0ef 100644
> --- a/arch/riscv/lib/sifive_clint.c
> +++ b/arch/riscv/lib/sifive_clint.c
> @@ -24,22 +24,8 @@
>
>  DECLARE_GLOBAL_DATA_PTR;
>
> -#define CLINT_BASE_GET(void)                                           \
> -       do {                                                            \
> -               long *ret;                                              \
> -                                                                       \
> -               if (!gd->arch.clint) {                                  \
> -                       ret = syscon_get_first_range(RISCV_SYSCON_CLINT); \
> -                       if (IS_ERR(ret))                                \
> -                               return PTR_ERR(ret);                    \
> -                       gd->arch.clint = ret;                           \
> -               }                                                       \
> -       } while (0)
> -
>  int riscv_get_time(u64 *time)
>  {
> -       CLINT_BASE_GET();
> -
>         *time = readq((void __iomem *)MTIME_REG(gd->arch.clint));
>
>         return 0;
> @@ -47,17 +33,24 @@ int riscv_get_time(u64 *time)
>
>  int riscv_set_timecmp(int hart, u64 cmp)
>  {
> -       CLINT_BASE_GET();
> -
>         writeq(cmp, (void __iomem *)MTIMECMP_REG(gd->arch.clint, hart));
>
>         return 0;
>  }
>
> +int riscv_init_ipi(void)
> +{
> +               long *ret = syscon_get_first_range(RISCV_SYSCON_CLINT);
> +
> +               if (IS_ERR(ret))
> +                       return PTR_ERR(ret);
> +               gd->arch.clint = ret;
> +
> +               return 0;
> +}
> +
>  int riscv_send_ipi(int hart)
>  {
> -       CLINT_BASE_GET();
> -
>         writel(1, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
>
>         return 0;
> @@ -65,8 +58,6 @@ int riscv_send_ipi(int hart)
>
>  int riscv_clear_ipi(int hart)
>  {
> -       CLINT_BASE_GET();
> -
>         writel(0, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
>
>         return 0;
> @@ -74,8 +65,6 @@ int riscv_clear_ipi(int hart)
>
>  int riscv_get_ipi(int hart, int *pending)
>  {
> -       CLINT_BASE_GET();
> -
>         *pending = readl((void __iomem *)MSIP_REG(gd->arch.clint, hart));
>
>         return 0;
> diff --git a/arch/riscv/lib/smp.c b/arch/riscv/lib/smp.c
> index 17adb35730..3b1e52e9b2 100644
> --- a/arch/riscv/lib/smp.c
> +++ b/arch/riscv/lib/smp.c
> @@ -12,38 +12,6 @@
>
>  DECLARE_GLOBAL_DATA_PTR;
>
> -/**
> - * riscv_send_ipi() - Send inter-processor interrupt (IPI)
> - *
> - * Platform code must provide this function.
> - *
> - * @hart: Hart ID of receiving hart
> - * @return 0 if OK, -ve on error
> - */
> -extern int riscv_send_ipi(int hart);
> -
> -/**
> - * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
> - *
> - * Platform code must provide this function.
> - *
> - * @hart: Hart ID of hart to be cleared
> - * @return 0 if OK, -ve on error
> - */
> -extern int riscv_clear_ipi(int hart);
> -
> -/**
> - * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
> - *
> - * Platform code must provide this function.
> - *
> - * @hart: Hart ID of hart to be checked
> - * @pending: Pointer to variable with result of the check,
> - *           1 if IPI is pending, 0 otherwise
> - * @return 0 if OK, -ve on error
> - */
> -extern int riscv_get_ipi(int hart, int *pending);
> -
>  static int send_ipi_many(struct ipi_data *ipi, int wait)
>  {
>         ofnode node, cpus;
> @@ -110,37 +78,41 @@ void handle_ipi(ulong hart)
>         int ret;
>         void (*smp_function)(ulong hart, ulong arg0, ulong arg1);
>
> -       if (hart >= CONFIG_NR_CPUS)
> +       if (hart >= CONFIG_NR_CPUS || !READ_ONCE(gd->arch.ipi_ready))
>                 return;
>
> -       __smp_mb();
> -
> -       smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
> -       invalidate_icache_all();
> -
>         /*
>          * Clear the IPI to acknowledge the request before jumping to the
>          * requested function.
>          */
>         ret = riscv_clear_ipi(hart);
>         if (ret) {
> -               pr_err("Cannot clear IPI of hart %ld\n", hart);
> +               pr_err("Cannot clear IPI of hart %ld (error %d)\n", hart, ret);
>                 return;
>         }
>
> +       __smp_mb();
> +
> +       smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
> +       /*
> +        * There may be an IPI raised before u-boot begins execution, so check
> +        * to ensure we actually have a function to call.
> +        */
> +       if (!smp_function)
> +               return;
> +       log_debug("hart = %lu func = %p\n", hart, smp_function);
> +       invalidate_icache_all();
> +
>         smp_function(hart, gd->arch.ipi[hart].arg0, gd->arch.ipi[hart].arg1);
>  }
>
>  int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait)
>  {
> -       int ret = 0;
> -       struct ipi_data ipi;
> +       struct ipi_data ipi = {
> +               .addr = addr,
> +               .arg0 = arg0,
> +               .arg1 = arg1,
> +       };
>
> -       ipi.addr = addr;
> -       ipi.arg0 = arg0;
> -       ipi.arg1 = arg1;
> -
> -       ret = send_ipi_many(&ipi, wait);
> -
> -       return ret;
> +       return send_ipi_many(&ipi, wait);
>  }
> --
> 2.25.0
>
Sean Anderson March 2, 2020, 3:43 p.m. UTC | #2
On 3/2/20 4:08 AM, Rick Chen wrote:
> Hi Sean
> 
>> The IPI code could have race conditions in several places.
>> * Several harts could race on the value of gd->arch->clint/plic
>> * Non-boot harts could race with the main hart on the DM subsystem In
>>   addition, if an IPI was pending when U-Boot started, it would cause the
>>   IPI handler to jump to address 0.
>>
>> To address these problems, a new function riscv_init_ipi is introduced. It
>> is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
>> functions may be called. Access is synchronized by gd->arch->ipi_ready.
>>
>> Signed-off-by: Sean Anderson <seanga2@gmail.com>
>> ---
>>
>> Changes in v5:
>> - New
>>
>>  arch/riscv/cpu/cpu.c                 |  9 ++++
>>  arch/riscv/include/asm/global_data.h |  1 +
>>  arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
>>  arch/riscv/lib/andes_plic.c          | 34 +++++---------
>>  arch/riscv/lib/sbi_ipi.c             |  5 ++
>>  arch/riscv/lib/sifive_clint.c        | 33 +++++---------
>>  arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
>>  7 files changed, 101 insertions(+), 92 deletions(-)
>>
>> diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
>> index e457f6acbf..a971ec8694 100644
>> --- a/arch/riscv/cpu/cpu.c
>> +++ b/arch/riscv/cpu/cpu.c
>> @@ -96,6 +96,15 @@ int arch_cpu_init_dm(void)
>>                         csr_write(CSR_SATP, 0);
>>         }
>>
>> +#ifdef CONFIG_SMP
>> +       ret = riscv_init_ipi();
>> +       if (ret)
>> +               return ret;
>> +
>> +       /* Atomically set a flag enabling IPI handling */
>> +       WRITE_ONCE(gd->arch.ipi_ready, 1);
> 
> I think it shall not have race condition here.
> Can you explain more detail why there will occur race condition ?
> 
> Hi Lukas
> 
> Do you have any comments ?
> 
> Thanks
> Rick

On the K210, there may already be an IPI pending when U-Boot starts.
(Perhaps the prior stage sends an IPI but does not clear it). As soon as
interrupts are enabled, the hart then tries to call riscv_clear_ipi().
Because the clint/plic has not yet been enabled, the clear_ipi function
will try and bind/probe the device. This can have really nasty effects, since
the boot hart is *also* trying to bind/probe devices.

In addition, a hart could end up trying to probe the clint/plic because
it could receive the IPI before (from its perspective) gd->arch.clint
(or plic) gets initialized.

Aside from the above, I think the macro approach is a bit confusing,
since it's unclear at first glance what function will be initializing
the clint/plic. Given U-Boot's otherwise completely SMP-unsafe design, I
think it's better to be explicit and conservative in these areas.

--Sean
Lukas Auer March 2, 2020, 11:15 p.m. UTC | #3
On Mon, 2020-03-02 at 10:43 -0500, Sean Anderson wrote:

> On 3/2/20 4:08 AM, Rick Chen wrote:
> > Hi Sean
> > 
> > > The IPI code could have race conditions in several places.
> > > * Several harts could race on the value of gd->arch->clint/plic
> > > * Non-boot harts could race with the main hart on the DM subsystem In
> > >   addition, if an IPI was pending when U-Boot started, it would cause the
> > >   IPI handler to jump to address 0.
> > > 
> > > To address these problems, a new function riscv_init_ipi is introduced. It
> > > is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
> > > functions may be called. Access is synchronized by gd->arch->ipi_ready.
> > > 
> > > Signed-off-by: Sean Anderson <seanga2@gmail.com>
> > > ---
> > > 
> > > Changes in v5:
> > > - New
> > > 
> > >  arch/riscv/cpu/cpu.c                 |  9 ++++
> > >  arch/riscv/include/asm/global_data.h |  1 +
> > >  arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
> > >  arch/riscv/lib/andes_plic.c          | 34 +++++---------
> > >  arch/riscv/lib/sbi_ipi.c             |  5 ++
> > >  arch/riscv/lib/sifive_clint.c        | 33 +++++---------
> > >  arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
> > >  7 files changed, 101 insertions(+), 92 deletions(-)
> > > 
> > > diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
> > > index e457f6acbf..a971ec8694 100644
> > > --- a/arch/riscv/cpu/cpu.c
> > > +++ b/arch/riscv/cpu/cpu.c
> > > @@ -96,6 +96,15 @@ int arch_cpu_init_dm(void)
> > >                         csr_write(CSR_SATP, 0);
> > >         }
> > > 
> > > +#ifdef CONFIG_SMP
> > > +       ret = riscv_init_ipi();
> > > +       if (ret)
> > > +               return ret;
> > > +
> > > +       /* Atomically set a flag enabling IPI handling */
> > > +       WRITE_ONCE(gd->arch.ipi_ready, 1);
> > 
> > I think it shall not have race condition here.
> > Can you explain more detail why there will occur race condition ?
> > 
> > Hi Lukas
> > 
> > Do you have any comments ?
> > 
> > Thanks
> > Rick
> 
> On the K210, there may already be an IPI pending when U-Boot starts.
> (Perhaps the prior stage sends an IPI but does not clear it). As soon as
> interrupts are enabled, the hart then tries to call riscv_clear_ipi().
> Because the clint/plic has not yet been enabled, the clear_ipi function
> will try and bind/probe the device. This can have really nasty effects, since
> the boot hart is *also* trying to bind/probe devices.
> 
> In addition, a hart could end up trying to probe the clint/plic because
> it could receive the IPI before (from its perspective) gd->arch.clint
> (or plic) gets initialized.
> 

We did not have a problem with pending IPIs on other platforms. It
should suffice to clear SSIP / MSIP before enabling the interrupts.

> Aside from the above, I think the macro approach is a bit confusing,
> since it's unclear at first glance what function will be initializing
> the clint/plic. Given U-Boot's otherwise completely SMP-unsafe design, I
> think it's better to be explicit and conservative in these areas.
> 

I agree, the patch makes this more clear and helps make the code more
robust.

Thanks,
Lukas
Lukas Auer March 2, 2020, 11:17 p.m. UTC | #4
On Fri, 2020-02-28 at 16:05 -0500, Sean Anderson wrote:

> The IPI code could have race conditions in several places.
> * Several harts could race on the value of gd->arch->clint/plic
> * Non-boot harts could race with the main hart on the DM subsystem In
>   addition, if an IPI was pending when U-Boot started, it would cause the
>   IPI handler to jump to address 0.
> 
> To address these problems, a new function riscv_init_ipi is introduced. It
> is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
> functions may be called. Access is synchronized by gd->arch->ipi_ready.
> 
> Signed-off-by: Sean Anderson <seanga2@gmail.com>
> ---
> 
> Changes in v5:
> - New
> 
>  arch/riscv/cpu/cpu.c                 |  9 ++++
>  arch/riscv/include/asm/global_data.h |  1 +
>  arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
>  arch/riscv/lib/andes_plic.c          | 34 +++++---------
>  arch/riscv/lib/sbi_ipi.c             |  5 ++
>  arch/riscv/lib/sifive_clint.c        | 33 +++++---------
>  arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
>  7 files changed, 101 insertions(+), 92 deletions(-)
> 
> diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
> index e457f6acbf..a971ec8694 100644
> --- a/arch/riscv/cpu/cpu.c
> +++ b/arch/riscv/cpu/cpu.c
> @@ -96,6 +96,15 @@ int arch_cpu_init_dm(void)
>  			csr_write(CSR_SATP, 0);
>  	}
>  
> +#ifdef CONFIG_SMP
> +	ret = riscv_init_ipi();
> +	if (ret)
> +		return ret;
> +
> +	/* Atomically set a flag enabling IPI handling */
> +	WRITE_ONCE(gd->arch.ipi_ready, 1);
> +#endif
> +
>  	return 0;
>  }
>  
> diff --git a/arch/riscv/include/asm/global_data.h b/arch/riscv/include/asm/global_data.h
> index 7276d9763f..b24f8fd2a7 100644
> --- a/arch/riscv/include/asm/global_data.h
> +++ b/arch/riscv/include/asm/global_data.h
> @@ -28,6 +28,7 @@ struct arch_global_data {
>  #endif
>  #ifdef CONFIG_SMP
>  	struct ipi_data ipi[CONFIG_NR_CPUS];
> +	long ipi_ready; /* Set after riscv_init_ipi is called */
>  #endif
>  #ifndef CONFIG_XIP
>  	ulong available_harts;
> diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
> index 74de92ed13..1b428856b2 100644
> --- a/arch/riscv/include/asm/smp.h
> +++ b/arch/riscv/include/asm/smp.h
> @@ -51,4 +51,47 @@ void handle_ipi(ulong hart);
>   */
>  int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait);
>  
> +/**
> + * riscv_init_ipi() - Initialize inter-process interrupt (IPI) driver
> + *
> + * Platform code must provide this function. This function is called once after
> + * the cpu driver is initialized. No other riscv_*_ipi() calls will be made
> + * before this function is called.
> + *
> + * @return 0 if OK, -ve on error
> + */
> +int riscv_init_ipi(void);
> +
> +/**
> + * riscv_send_ipi() - Send inter-processor interrupt (IPI)
> + *
> + * Platform code must provide this function.
> + *
> + * @hart: Hart ID of receiving hart
> + * @return 0 if OK, -ve on error
> + */
> +int riscv_send_ipi(int hart);
> +
> +/**
> + * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
> + *
> + * Platform code must provide this function.
> + *
> + * @hart: Hart ID of hart to be cleared
> + * @return 0 if OK, -ve on error
> + */
> +int riscv_clear_ipi(int hart);
> +
> +/**
> + * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
> + *
> + * Platform code must provide this function.
> + *
> + * @hart: Hart ID of hart to be checked
> + * @pending: Pointer to variable with result of the check,
> + *           1 if IPI is pending, 0 otherwise
> + * @return 0 if OK, -ve on error
> + */
> +int riscv_get_ipi(int hart, int *pending);
> +
>  #endif
> diff --git a/arch/riscv/lib/andes_plic.c b/arch/riscv/lib/andes_plic.c
> index 20529ab3eb..8484f76386 100644
> --- a/arch/riscv/lib/andes_plic.c
> +++ b/arch/riscv/lib/andes_plic.c
> @@ -30,20 +30,6 @@
>  #define SEND_IPI_TO_HART(hart)  (0x80 >> (hart))
>  
>  DECLARE_GLOBAL_DATA_PTR;
> -static int init_plic(void);
> -
> -#define PLIC_BASE_GET(void)						\
> -	do {								\
> -		long *ret;						\
> -									\
> -		if (!gd->arch.plic) {					\
> -			ret = syscon_get_first_range(RISCV_SYSCON_PLIC); \
> -			if (IS_ERR(ret))				\
> -				return PTR_ERR(ret);			\
> -			gd->arch.plic = ret;				\
> -			init_plic();					\
> -		}							\
> -	} while (0)
>  
>  static int enable_ipi(int hart)
>  {
> @@ -93,13 +79,21 @@ static int init_plic(void)
>  	return -ENODEV;
>  }
>  
> +int riscv_init_ipi(void)
> +{
> +	int ret = syscon_get_first_range(RISCV_SYSCON_PLIC);
> +
> +	if (IS_ERR(ret))
> +		return PTR_ERR(ret);
> +	gd->arch.plic = ret;
> +
> +	return init_plic();
> +}
> +
>  int riscv_send_ipi(int hart)
>  {
> -	unsigned int ipi;
> +	unsigned int ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
>  
> -	PLIC_BASE_GET();
> -
> -	ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
>  	writel(ipi, (void __iomem *)PENDING_REG(gd->arch.plic,
>  				gd->arch.boot_hart));
>  
> @@ -110,8 +104,6 @@ int riscv_clear_ipi(int hart)
>  {
>  	u32 source_id;
>  
> -	PLIC_BASE_GET();
> -
>  	source_id = readl((void __iomem *)CLAIM_REG(gd->arch.plic, hart));
>  	writel(source_id, (void __iomem *)CLAIM_REG(gd->arch.plic, hart));
>  
> @@ -120,8 +112,6 @@ int riscv_clear_ipi(int hart)
>  
>  int riscv_get_ipi(int hart, int *pending)
>  {
> -	PLIC_BASE_GET();
> -
>  	*pending = readl((void __iomem *)PENDING_REG(gd->arch.plic,
>  						     gd->arch.boot_hart));
>  	*pending = !!(*pending & SEND_IPI_TO_HART(hart));
> diff --git a/arch/riscv/lib/sbi_ipi.c b/arch/riscv/lib/sbi_ipi.c
> index 9a698ce74e..310d1bd2a4 100644
> --- a/arch/riscv/lib/sbi_ipi.c
> +++ b/arch/riscv/lib/sbi_ipi.c
> @@ -7,6 +7,11 @@
>  #include <common.h>
>  #include <asm/sbi.h>
>  
> +int riscv_init_ipi(void)
> +{
> +	return 0;
> +}
> +
>  int riscv_send_ipi(int hart)
>  {
>  	ulong mask;
> diff --git a/arch/riscv/lib/sifive_clint.c b/arch/riscv/lib/sifive_clint.c
> index 5e0d25720b..62c1b2b0ef 100644
> --- a/arch/riscv/lib/sifive_clint.c
> +++ b/arch/riscv/lib/sifive_clint.c
> @@ -24,22 +24,8 @@
>  
>  DECLARE_GLOBAL_DATA_PTR;
>  
> -#define CLINT_BASE_GET(void)						\
> -	do {								\
> -		long *ret;						\
> -									\
> -		if (!gd->arch.clint) {					\
> -			ret = syscon_get_first_range(RISCV_SYSCON_CLINT); \
> -			if (IS_ERR(ret))				\
> -				return PTR_ERR(ret);			\
> -			gd->arch.clint = ret;				\
> -		}							\
> -	} while (0)
> -
>  int riscv_get_time(u64 *time)
>  {
> -	CLINT_BASE_GET();
> -
>  	*time = readq((void __iomem *)MTIME_REG(gd->arch.clint));
>  
>  	return 0;
> @@ -47,17 +33,24 @@ int riscv_get_time(u64 *time)
>  
>  int riscv_set_timecmp(int hart, u64 cmp)
>  {
> -	CLINT_BASE_GET();
> -
>  	writeq(cmp, (void __iomem *)MTIMECMP_REG(gd->arch.clint, hart));
>  
>  	return 0;
>  }
>  
> +int riscv_init_ipi(void)
> +{
> +		long *ret = syscon_get_first_range(RISCV_SYSCON_CLINT);
> +
> +		if (IS_ERR(ret))
> +			return PTR_ERR(ret);
> +		gd->arch.clint = ret;
> +
> +		return 0;
> +}
> +

Please fix the indentation here.

>  int riscv_send_ipi(int hart)
>  {
> -	CLINT_BASE_GET();
> -
>  	writel(1, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
>  
>  	return 0;
> @@ -65,8 +58,6 @@ int riscv_send_ipi(int hart)
>  
>  int riscv_clear_ipi(int hart)
>  {
> -	CLINT_BASE_GET();
> -
>  	writel(0, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
>  
>  	return 0;
> @@ -74,8 +65,6 @@ int riscv_clear_ipi(int hart)
>  
>  int riscv_get_ipi(int hart, int *pending)
>  {
> -	CLINT_BASE_GET();
> -
>  	*pending = readl((void __iomem *)MSIP_REG(gd->arch.clint, hart));
>  
>  	return 0;
> diff --git a/arch/riscv/lib/smp.c b/arch/riscv/lib/smp.c
> index 17adb35730..3b1e52e9b2 100644
> --- a/arch/riscv/lib/smp.c
> +++ b/arch/riscv/lib/smp.c
> @@ -12,38 +12,6 @@
>  
>  DECLARE_GLOBAL_DATA_PTR;
>  
> -/**
> - * riscv_send_ipi() - Send inter-processor interrupt (IPI)
> - *
> - * Platform code must provide this function.
> - *
> - * @hart: Hart ID of receiving hart
> - * @return 0 if OK, -ve on error
> - */
> -extern int riscv_send_ipi(int hart);
> -
> -/**
> - * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
> - *
> - * Platform code must provide this function.
> - *
> - * @hart: Hart ID of hart to be cleared
> - * @return 0 if OK, -ve on error
> - */
> -extern int riscv_clear_ipi(int hart);
> -
> -/**
> - * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
> - *
> - * Platform code must provide this function.
> - *
> - * @hart: Hart ID of hart to be checked
> - * @pending: Pointer to variable with result of the check,
> - *           1 if IPI is pending, 0 otherwise
> - * @return 0 if OK, -ve on error
> - */
> -extern int riscv_get_ipi(int hart, int *pending);
> -
>  static int send_ipi_many(struct ipi_data *ipi, int wait)
>  {
>  	ofnode node, cpus;
> @@ -110,37 +78,41 @@ void handle_ipi(ulong hart)
>  	int ret;
>  	void (*smp_function)(ulong hart, ulong arg0, ulong arg1);
>  
> -	if (hart >= CONFIG_NR_CPUS)
> +	if (hart >= CONFIG_NR_CPUS || !READ_ONCE(gd->arch.ipi_ready))
>  		return;
>  
> -	__smp_mb();
> -
> -	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
> -	invalidate_icache_all();
> -

Don't move this. It is intended to be run before the IPI is cleared.

>  	/*
>  	 * Clear the IPI to acknowledge the request before jumping to the
>  	 * requested function.
>  	 */
>  	ret = riscv_clear_ipi(hart);
>  	if (ret) {
> -		pr_err("Cannot clear IPI of hart %ld\n", hart);
> +		pr_err("Cannot clear IPI of hart %ld (error %d)\n", hart, ret);
>  		return;
>  	}
>  
> +	__smp_mb();
> +
> +	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
> +	/*
> +	 * There may be an IPI raised before u-boot begins execution, so check
> +	 * to ensure we actually have a function to call.
> +	 */
> +	if (!smp_function)
> +		return;
> +	log_debug("hart = %lu func = %p\n", hart, smp_function);

The log messages might be corrupted if multiple harts are calling the
log function here. I have not looked into the details so this might not
be an issue. In that case it is fine to keep, otherwise please remove
it.

Thanks,
Lukas

> +	invalidate_icache_all();
> +
>  	smp_function(hart, gd->arch.ipi[hart].arg0, gd->arch.ipi[hart].arg1);
>  }
>  
>  int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait)
>  {
> -	int ret = 0;
> -	struct ipi_data ipi;
> +	struct ipi_data ipi = {
> +		.addr = addr,
> +		.arg0 = arg0,
> +		.arg1 = arg1,
> +	};
>  
> -	ipi.addr = addr;
> -	ipi.arg0 = arg0;
> -	ipi.arg1 = arg1;
> -
> -	ret = send_ipi_many(&ipi, wait);
> -
> -	return ret;
> +	return send_ipi_many(&ipi, wait);
>  }
Sean Anderson March 2, 2020, 11:43 p.m. UTC | #5
On 3/2/20 6:17 PM, Lukas Auer wrote:
> On Fri, 2020-02-28 at 16:05 -0500, Sean Anderson wrote:
> 
>> The IPI code could have race conditions in several places.
>> * Several harts could race on the value of gd->arch->clint/plic
>> * Non-boot harts could race with the main hart on the DM subsystem In
>>   addition, if an IPI was pending when U-Boot started, it would cause the
>>   IPI handler to jump to address 0.
>>
>> To address these problems, a new function riscv_init_ipi is introduced. It
>> is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
>> functions may be called. Access is synchronized by gd->arch->ipi_ready.
>>
>> Signed-off-by: Sean Anderson <seanga2@gmail.com>
>> ---
>>
>> Changes in v5:
>> - New
>>
>>  arch/riscv/cpu/cpu.c                 |  9 ++++
>>  arch/riscv/include/asm/global_data.h |  1 +
>>  arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
>>  arch/riscv/lib/andes_plic.c          | 34 +++++---------
>>  arch/riscv/lib/sbi_ipi.c             |  5 ++
>>  arch/riscv/lib/sifive_clint.c        | 33 +++++---------
>>  arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
>>  7 files changed, 101 insertions(+), 92 deletions(-)
>>
>> diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
>> index e457f6acbf..a971ec8694 100644
>> --- a/arch/riscv/cpu/cpu.c
>> +++ b/arch/riscv/cpu/cpu.c
>> @@ -96,6 +96,15 @@ int arch_cpu_init_dm(void)
>>  			csr_write(CSR_SATP, 0);
>>  	}
>>  
>> +#ifdef CONFIG_SMP
>> +	ret = riscv_init_ipi();
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Atomically set a flag enabling IPI handling */
>> +	WRITE_ONCE(gd->arch.ipi_ready, 1);
>> +#endif
>> +
>>  	return 0;
>>  }
>>  
>> diff --git a/arch/riscv/include/asm/global_data.h b/arch/riscv/include/asm/global_data.h
>> index 7276d9763f..b24f8fd2a7 100644
>> --- a/arch/riscv/include/asm/global_data.h
>> +++ b/arch/riscv/include/asm/global_data.h
>> @@ -28,6 +28,7 @@ struct arch_global_data {
>>  #endif
>>  #ifdef CONFIG_SMP
>>  	struct ipi_data ipi[CONFIG_NR_CPUS];
>> +	long ipi_ready; /* Set after riscv_init_ipi is called */
>>  #endif
>>  #ifndef CONFIG_XIP
>>  	ulong available_harts;
>> diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
>> index 74de92ed13..1b428856b2 100644
>> --- a/arch/riscv/include/asm/smp.h
>> +++ b/arch/riscv/include/asm/smp.h
>> @@ -51,4 +51,47 @@ void handle_ipi(ulong hart);
>>   */
>>  int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait);
>>  
>> +/**
>> + * riscv_init_ipi() - Initialize inter-process interrupt (IPI) driver
>> + *
>> + * Platform code must provide this function. This function is called once after
>> + * the cpu driver is initialized. No other riscv_*_ipi() calls will be made
>> + * before this function is called.
>> + *
>> + * @return 0 if OK, -ve on error
>> + */
>> +int riscv_init_ipi(void);
>> +
>> +/**
>> + * riscv_send_ipi() - Send inter-processor interrupt (IPI)
>> + *
>> + * Platform code must provide this function.
>> + *
>> + * @hart: Hart ID of receiving hart
>> + * @return 0 if OK, -ve on error
>> + */
>> +int riscv_send_ipi(int hart);
>> +
>> +/**
>> + * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
>> + *
>> + * Platform code must provide this function.
>> + *
>> + * @hart: Hart ID of hart to be cleared
>> + * @return 0 if OK, -ve on error
>> + */
>> +int riscv_clear_ipi(int hart);
>> +
>> +/**
>> + * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
>> + *
>> + * Platform code must provide this function.
>> + *
>> + * @hart: Hart ID of hart to be checked
>> + * @pending: Pointer to variable with result of the check,
>> + *           1 if IPI is pending, 0 otherwise
>> + * @return 0 if OK, -ve on error
>> + */
>> +int riscv_get_ipi(int hart, int *pending);
>> +
>>  #endif
>> diff --git a/arch/riscv/lib/andes_plic.c b/arch/riscv/lib/andes_plic.c
>> index 20529ab3eb..8484f76386 100644
>> --- a/arch/riscv/lib/andes_plic.c
>> +++ b/arch/riscv/lib/andes_plic.c
>> @@ -30,20 +30,6 @@
>>  #define SEND_IPI_TO_HART(hart)  (0x80 >> (hart))
>>  
>>  DECLARE_GLOBAL_DATA_PTR;
>> -static int init_plic(void);
>> -
>> -#define PLIC_BASE_GET(void)						\
>> -	do {								\
>> -		long *ret;						\
>> -									\
>> -		if (!gd->arch.plic) {					\
>> -			ret = syscon_get_first_range(RISCV_SYSCON_PLIC); \
>> -			if (IS_ERR(ret))				\
>> -				return PTR_ERR(ret);			\
>> -			gd->arch.plic = ret;				\
>> -			init_plic();					\
>> -		}							\
>> -	} while (0)
>>  
>>  static int enable_ipi(int hart)
>>  {
>> @@ -93,13 +79,21 @@ static int init_plic(void)
>>  	return -ENODEV;
>>  }
>>  
>> +int riscv_init_ipi(void)
>> +{
>> +	int ret = syscon_get_first_range(RISCV_SYSCON_PLIC);
>> +
>> +	if (IS_ERR(ret))
>> +		return PTR_ERR(ret);
>> +	gd->arch.plic = ret;
>> +
>> +	return init_plic();
>> +}
>> +
>>  int riscv_send_ipi(int hart)
>>  {
>> -	unsigned int ipi;
>> +	unsigned int ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
>>  
>> -	PLIC_BASE_GET();
>> -
>> -	ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
>>  	writel(ipi, (void __iomem *)PENDING_REG(gd->arch.plic,
>>  				gd->arch.boot_hart));
>>  
>> @@ -110,8 +104,6 @@ int riscv_clear_ipi(int hart)
>>  {
>>  	u32 source_id;
>>  
>> -	PLIC_BASE_GET();
>> -
>>  	source_id = readl((void __iomem *)CLAIM_REG(gd->arch.plic, hart));
>>  	writel(source_id, (void __iomem *)CLAIM_REG(gd->arch.plic, hart));
>>  
>> @@ -120,8 +112,6 @@ int riscv_clear_ipi(int hart)
>>  
>>  int riscv_get_ipi(int hart, int *pending)
>>  {
>> -	PLIC_BASE_GET();
>> -
>>  	*pending = readl((void __iomem *)PENDING_REG(gd->arch.plic,
>>  						     gd->arch.boot_hart));
>>  	*pending = !!(*pending & SEND_IPI_TO_HART(hart));
>> diff --git a/arch/riscv/lib/sbi_ipi.c b/arch/riscv/lib/sbi_ipi.c
>> index 9a698ce74e..310d1bd2a4 100644
>> --- a/arch/riscv/lib/sbi_ipi.c
>> +++ b/arch/riscv/lib/sbi_ipi.c
>> @@ -7,6 +7,11 @@
>>  #include <common.h>
>>  #include <asm/sbi.h>
>>  
>> +int riscv_init_ipi(void)
>> +{
>> +	return 0;
>> +}
>> +
>>  int riscv_send_ipi(int hart)
>>  {
>>  	ulong mask;
>> diff --git a/arch/riscv/lib/sifive_clint.c b/arch/riscv/lib/sifive_clint.c
>> index 5e0d25720b..62c1b2b0ef 100644
>> --- a/arch/riscv/lib/sifive_clint.c
>> +++ b/arch/riscv/lib/sifive_clint.c
>> @@ -24,22 +24,8 @@
>>  
>>  DECLARE_GLOBAL_DATA_PTR;
>>  
>> -#define CLINT_BASE_GET(void)						\
>> -	do {								\
>> -		long *ret;						\
>> -									\
>> -		if (!gd->arch.clint) {					\
>> -			ret = syscon_get_first_range(RISCV_SYSCON_CLINT); \
>> -			if (IS_ERR(ret))				\
>> -				return PTR_ERR(ret);			\
>> -			gd->arch.clint = ret;				\
>> -		}							\
>> -	} while (0)
>> -
>>  int riscv_get_time(u64 *time)
>>  {
>> -	CLINT_BASE_GET();
>> -
>>  	*time = readq((void __iomem *)MTIME_REG(gd->arch.clint));
>>  
>>  	return 0;
>> @@ -47,17 +33,24 @@ int riscv_get_time(u64 *time)
>>  
>>  int riscv_set_timecmp(int hart, u64 cmp)
>>  {
>> -	CLINT_BASE_GET();
>> -
>>  	writeq(cmp, (void __iomem *)MTIMECMP_REG(gd->arch.clint, hart));
>>  
>>  	return 0;
>>  }
>>  
>> +int riscv_init_ipi(void)
>> +{
>> +		long *ret = syscon_get_first_range(RISCV_SYSCON_CLINT);
>> +
>> +		if (IS_ERR(ret))
>> +			return PTR_ERR(ret);
>> +		gd->arch.clint = ret;
>> +
>> +		return 0;
>> +}
>> +
> 
> Please fix the indentation here.
>

Ok.

>>  int riscv_send_ipi(int hart)
>>  {
>> -	CLINT_BASE_GET();
>> -
>>  	writel(1, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
>>  
>>  	return 0;
>> @@ -65,8 +58,6 @@ int riscv_send_ipi(int hart)
>>  
>>  int riscv_clear_ipi(int hart)
>>  {
>> -	CLINT_BASE_GET();
>> -
>>  	writel(0, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
>>  
>>  	return 0;
>> @@ -74,8 +65,6 @@ int riscv_clear_ipi(int hart)
>>  
>>  int riscv_get_ipi(int hart, int *pending)
>>  {
>> -	CLINT_BASE_GET();
>> -
>>  	*pending = readl((void __iomem *)MSIP_REG(gd->arch.clint, hart));
>>  
>>  	return 0;
>> diff --git a/arch/riscv/lib/smp.c b/arch/riscv/lib/smp.c
>> index 17adb35730..3b1e52e9b2 100644
>> --- a/arch/riscv/lib/smp.c
>> +++ b/arch/riscv/lib/smp.c
>> @@ -12,38 +12,6 @@
>>  
>>  DECLARE_GLOBAL_DATA_PTR;
>>  
>> -/**
>> - * riscv_send_ipi() - Send inter-processor interrupt (IPI)
>> - *
>> - * Platform code must provide this function.
>> - *
>> - * @hart: Hart ID of receiving hart
>> - * @return 0 if OK, -ve on error
>> - */
>> -extern int riscv_send_ipi(int hart);
>> -
>> -/**
>> - * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
>> - *
>> - * Platform code must provide this function.
>> - *
>> - * @hart: Hart ID of hart to be cleared
>> - * @return 0 if OK, -ve on error
>> - */
>> -extern int riscv_clear_ipi(int hart);
>> -
>> -/**
>> - * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
>> - *
>> - * Platform code must provide this function.
>> - *
>> - * @hart: Hart ID of hart to be checked
>> - * @pending: Pointer to variable with result of the check,
>> - *           1 if IPI is pending, 0 otherwise
>> - * @return 0 if OK, -ve on error
>> - */
>> -extern int riscv_get_ipi(int hart, int *pending);
>> -
>>  static int send_ipi_many(struct ipi_data *ipi, int wait)
>>  {
>>  	ofnode node, cpus;
>> @@ -110,37 +78,41 @@ void handle_ipi(ulong hart)
>>  	int ret;
>>  	void (*smp_function)(ulong hart, ulong arg0, ulong arg1);
>>  
>> -	if (hart >= CONFIG_NR_CPUS)
>> +	if (hart >= CONFIG_NR_CPUS || !READ_ONCE(gd->arch.ipi_ready))
>>  		return;
>>  
>> -	__smp_mb();
>> -
>> -	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
>> -	invalidate_icache_all();
>> -
> 
> Don't move this. It is intended to be run before the IPI is cleared.

Hm, ok. I think I moved it to after because of the 'if (!smp_function)'
check, but those two don't really need to be done together.

>>  	/*
>>  	 * Clear the IPI to acknowledge the request before jumping to the
>>  	 * requested function.
>>  	 */
>>  	ret = riscv_clear_ipi(hart);
>>  	if (ret) {
>> -		pr_err("Cannot clear IPI of hart %ld\n", hart);
>> +		pr_err("Cannot clear IPI of hart %ld (error %d)\n", hart, ret);
>>  		return;
>>  	}
>>  
>> +	__smp_mb();
>> +
>> +	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
>> +	/*
>> +	 * There may be an IPI raised before u-boot begins execution, so check
>> +	 * to ensure we actually have a function to call.
>> +	 */
>> +	if (!smp_function)
>> +		return;
>> +	log_debug("hart = %lu func = %p\n", hart, smp_function);
> 
> The log messages might be corrupted if multiple harts are calling the
> log function here. I have not looked into the details so this might not
> be an issue. In that case it is fine to keep, otherwise please remove
> it.

I ran into this problem a lot when debugging. I ended up implementing a
spinlock around puts/putc. I agree it's probably better to remove this,
but I worry that concurrency bugs will become much harder to track down
without some kind of feedback. (This same criticism applies to the log
message above as well).

> 
> Thanks,
> Lukas
> 
>> +	invalidate_icache_all();
>> +
>>  	smp_function(hart, gd->arch.ipi[hart].arg0, gd->arch.ipi[hart].arg1);
>>  }
>>  
>>  int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait)
>>  {
>> -	int ret = 0;
>> -	struct ipi_data ipi;
>> +	struct ipi_data ipi = {
>> +		.addr = addr,
>> +		.arg0 = arg0,
>> +		.arg1 = arg1,
>> +	};
>>  
>> -	ipi.addr = addr;
>> -	ipi.arg0 = arg0;
>> -	ipi.arg1 = arg1;
>> -
>> -	ret = send_ipi_many(&ipi, wait);
>> -
>> -	return ret;
>> +	return send_ipi_many(&ipi, wait);
>>  }
Rick Chen March 3, 2020, 8:27 a.m. UTC | #6
Hi Sean

> On Mon, 2020-03-02 at 10:43 -0500, Sean Anderson wrote:
>
> > On 3/2/20 4:08 AM, Rick Chen wrote:
> > > Hi Sean
> > >
> > > > The IPI code could have race conditions in several places.
> > > > * Several harts could race on the value of gd->arch->clint/plic
> > > > * Non-boot harts could race with the main hart on the DM subsystem In
> > > >   addition, if an IPI was pending when U-Boot started, it would cause the
> > > >   IPI handler to jump to address 0.
> > > >
> > > > To address these problems, a new function riscv_init_ipi is introduced. It
> > > > is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
> > > > functions may be called. Access is synchronized by gd->arch->ipi_ready.
> > > >
> > > > Signed-off-by: Sean Anderson <seanga2@gmail.com>
> > > > ---
> > > >
> > > > Changes in v5:
> > > > - New
> > > >
> > > >  arch/riscv/cpu/cpu.c                 |  9 ++++
> > > >  arch/riscv/include/asm/global_data.h |  1 +
> > > >  arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
> > > >  arch/riscv/lib/andes_plic.c          | 34 +++++---------
> > > >  arch/riscv/lib/sbi_ipi.c             |  5 ++
> > > >  arch/riscv/lib/sifive_clint.c        | 33 +++++---------
> > > >  arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
> > > >  7 files changed, 101 insertions(+), 92 deletions(-)
> > > >
> > > > diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
> > > > index e457f6acbf..a971ec8694 100644
> > > > --- a/arch/riscv/cpu/cpu.c
> > > > +++ b/arch/riscv/cpu/cpu.c
> > > > @@ -96,6 +96,15 @@ int arch_cpu_init_dm(void)
> > > >                         csr_write(CSR_SATP, 0);
> > > >         }
> > > >
> > > > +#ifdef CONFIG_SMP
> > > > +       ret = riscv_init_ipi();
> > > > +       if (ret)
> > > > +               return ret;
> > > > +
> > > > +       /* Atomically set a flag enabling IPI handling */
> > > > +       WRITE_ONCE(gd->arch.ipi_ready, 1);
> > >
> > > I think it shall not have race condition here.
> > > Can you explain more detail why there will occur race condition ?
> > >
> > > Hi Lukas
> > >
> > > Do you have any comments ?
> > >
> > > Thanks
> > > Rick
> >
> > On the K210, there may already be an IPI pending when U-Boot starts.
> > (Perhaps the prior stage sends an IPI but does not clear it). As soon as
> > interrupts are enabled, the hart then tries to call riscv_clear_ipi().
> > Because the clint/plic has not yet been enabled, the clear_ipi function
> > will try and bind/probe the device. This can have really nasty effects, since
> > the boot hart is *also* trying to bind/probe devices.
> >
> > In addition, a hart could end up trying to probe the clint/plic because
> > it could receive the IPI before (from its perspective) gd->arch.clint
> > (or plic) gets initialized.
> >
>
> We did not have a problem with pending IPIs on other platforms. It
> should suffice to clear SSIP / MSIP before enabling the interrupts.
>

Can you try to clear mip/sip in startup flow before secondary_hart_loop:
Maybe it can help to overcome the problem of calling riscv_clear_ipi()
before riscv_init_ipi() that you added.

Thanks,
Rick

> > Aside from the above, I think the macro approach is a bit confusing,
> > since it's unclear at first glance what function will be initializing
> > the clint/plic. Given U-Boot's otherwise completely SMP-unsafe design, I
> > think it's better to be explicit and conservative in these areas.
> >
>
> I agree, the patch makes this more clear and helps make the code more
> robust.
>
> Thanks,
> Lukas
Lukas Auer March 3, 2020, 9:53 p.m. UTC | #7
On Mon, 2020-03-02 at 18:43 -0500, Sean Anderson wrote:
> On 3/2/20 6:17 PM, Lukas Auer wrote:
> > On Fri, 2020-02-28 at 16:05 -0500, Sean Anderson wrote:
> > 
> > > The IPI code could have race conditions in several places.
> > > * Several harts could race on the value of gd->arch->clint/plic
> > > * Non-boot harts could race with the main hart on the DM subsystem In
> > >   addition, if an IPI was pending when U-Boot started, it would cause the
> > >   IPI handler to jump to address 0.
> > > 
> > > To address these problems, a new function riscv_init_ipi is introduced. It
> > > is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
> > > functions may be called. Access is synchronized by gd->arch->ipi_ready.
> > > 
> > > Signed-off-by: Sean Anderson <seanga2@gmail.com>
> > > ---
> > > 
> > > Changes in v5:
> > > - New
> > > 
> > >  arch/riscv/cpu/cpu.c                 |  9 ++++
> > >  arch/riscv/include/asm/global_data.h |  1 +
> > >  arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
> > >  arch/riscv/lib/andes_plic.c          | 34 +++++---------
> > >  arch/riscv/lib/sbi_ipi.c             |  5 ++
> > >  arch/riscv/lib/sifive_clint.c        | 33 +++++---------
> > >  arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
> > >  7 files changed, 101 insertions(+), 92 deletions(-)
> > > 
> > > diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
> > > index e457f6acbf..a971ec8694 100644
> > > --- a/arch/riscv/cpu/cpu.c
> > > +++ b/arch/riscv/cpu/cpu.c
> > > @@ -96,6 +96,15 @@ int arch_cpu_init_dm(void)
> > >  			csr_write(CSR_SATP, 0);
> > >  	}
> > >  
> > > +#ifdef CONFIG_SMP
> > > +	ret = riscv_init_ipi();
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	/* Atomically set a flag enabling IPI handling */
> > > +	WRITE_ONCE(gd->arch.ipi_ready, 1);
> > > +#endif
> > > +
> > >  	return 0;
> > >  }
> > >  
> > > diff --git a/arch/riscv/include/asm/global_data.h b/arch/riscv/include/asm/global_data.h
> > > index 7276d9763f..b24f8fd2a7 100644
> > > --- a/arch/riscv/include/asm/global_data.h
> > > +++ b/arch/riscv/include/asm/global_data.h
> > > @@ -28,6 +28,7 @@ struct arch_global_data {
> > >  #endif
> > >  #ifdef CONFIG_SMP
> > >  	struct ipi_data ipi[CONFIG_NR_CPUS];
> > > +	long ipi_ready; /* Set after riscv_init_ipi is called */
> > >  #endif
> > >  #ifndef CONFIG_XIP
> > >  	ulong available_harts;
> > > diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
> > > index 74de92ed13..1b428856b2 100644
> > > --- a/arch/riscv/include/asm/smp.h
> > > +++ b/arch/riscv/include/asm/smp.h
> > > @@ -51,4 +51,47 @@ void handle_ipi(ulong hart);
> > >   */
> > >  int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait);
> > >  
> > > +/**
> > > + * riscv_init_ipi() - Initialize inter-process interrupt (IPI) driver
> > > + *
> > > + * Platform code must provide this function. This function is called once after
> > > + * the cpu driver is initialized. No other riscv_*_ipi() calls will be made
> > > + * before this function is called.
> > > + *
> > > + * @return 0 if OK, -ve on error
> > > + */
> > > +int riscv_init_ipi(void);
> > > +
> > > +/**
> > > + * riscv_send_ipi() - Send inter-processor interrupt (IPI)
> > > + *
> > > + * Platform code must provide this function.
> > > + *
> > > + * @hart: Hart ID of receiving hart
> > > + * @return 0 if OK, -ve on error
> > > + */
> > > +int riscv_send_ipi(int hart);
> > > +
> > > +/**
> > > + * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
> > > + *
> > > + * Platform code must provide this function.
> > > + *
> > > + * @hart: Hart ID of hart to be cleared
> > > + * @return 0 if OK, -ve on error
> > > + */
> > > +int riscv_clear_ipi(int hart);
> > > +
> > > +/**
> > > + * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
> > > + *
> > > + * Platform code must provide this function.
> > > + *
> > > + * @hart: Hart ID of hart to be checked
> > > + * @pending: Pointer to variable with result of the check,
> > > + *           1 if IPI is pending, 0 otherwise
> > > + * @return 0 if OK, -ve on error
> > > + */
> > > +int riscv_get_ipi(int hart, int *pending);
> > > +
> > >  #endif
> > > diff --git a/arch/riscv/lib/andes_plic.c b/arch/riscv/lib/andes_plic.c
> > > index 20529ab3eb..8484f76386 100644
> > > --- a/arch/riscv/lib/andes_plic.c
> > > +++ b/arch/riscv/lib/andes_plic.c
> > > @@ -30,20 +30,6 @@
> > >  #define SEND_IPI_TO_HART(hart)  (0x80 >> (hart))
> > >  
> > >  DECLARE_GLOBAL_DATA_PTR;
> > > -static int init_plic(void);
> > > -
> > > -#define PLIC_BASE_GET(void)						\
> > > -	do {								\
> > > -		long *ret;						\
> > > -									\
> > > -		if (!gd->arch.plic) {					\
> > > -			ret = syscon_get_first_range(RISCV_SYSCON_PLIC); \
> > > -			if (IS_ERR(ret))				\
> > > -				return PTR_ERR(ret);			\
> > > -			gd->arch.plic = ret;				\
> > > -			init_plic();					\
> > > -		}							\
> > > -	} while (0)
> > >  
> > >  static int enable_ipi(int hart)
> > >  {
> > > @@ -93,13 +79,21 @@ static int init_plic(void)
> > >  	return -ENODEV;
> > >  }
> > >  
> > > +int riscv_init_ipi(void)
> > > +{
> > > +	int ret = syscon_get_first_range(RISCV_SYSCON_PLIC);
> > > +
> > > +	if (IS_ERR(ret))
> > > +		return PTR_ERR(ret);
> > > +	gd->arch.plic = ret;
> > > +
> > > +	return init_plic();
> > > +}
> > > +
> > >  int riscv_send_ipi(int hart)
> > >  {
> > > -	unsigned int ipi;
> > > +	unsigned int ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
> > >  
> > > -	PLIC_BASE_GET();
> > > -
> > > -	ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
> > >  	writel(ipi, (void __iomem *)PENDING_REG(gd->arch.plic,
> > >  				gd->arch.boot_hart));
> > >  
> > > @@ -110,8 +104,6 @@ int riscv_clear_ipi(int hart)
> > >  {
> > >  	u32 source_id;
> > >  
> > > -	PLIC_BASE_GET();
> > > -
> > >  	source_id = readl((void __iomem *)CLAIM_REG(gd->arch.plic, hart));
> > >  	writel(source_id, (void __iomem *)CLAIM_REG(gd->arch.plic, hart));
> > >  
> > > @@ -120,8 +112,6 @@ int riscv_clear_ipi(int hart)
> > >  
> > >  int riscv_get_ipi(int hart, int *pending)
> > >  {
> > > -	PLIC_BASE_GET();
> > > -
> > >  	*pending = readl((void __iomem *)PENDING_REG(gd->arch.plic,
> > >  						     gd->arch.boot_hart));
> > >  	*pending = !!(*pending & SEND_IPI_TO_HART(hart));
> > > diff --git a/arch/riscv/lib/sbi_ipi.c b/arch/riscv/lib/sbi_ipi.c
> > > index 9a698ce74e..310d1bd2a4 100644
> > > --- a/arch/riscv/lib/sbi_ipi.c
> > > +++ b/arch/riscv/lib/sbi_ipi.c
> > > @@ -7,6 +7,11 @@
> > >  #include <common.h>
> > >  #include <asm/sbi.h>
> > >  
> > > +int riscv_init_ipi(void)
> > > +{
> > > +	return 0;
> > > +}
> > > +
> > >  int riscv_send_ipi(int hart)
> > >  {
> > >  	ulong mask;
> > > diff --git a/arch/riscv/lib/sifive_clint.c b/arch/riscv/lib/sifive_clint.c
> > > index 5e0d25720b..62c1b2b0ef 100644
> > > --- a/arch/riscv/lib/sifive_clint.c
> > > +++ b/arch/riscv/lib/sifive_clint.c
> > > @@ -24,22 +24,8 @@
> > >  
> > >  DECLARE_GLOBAL_DATA_PTR;
> > >  
> > > -#define CLINT_BASE_GET(void)						\
> > > -	do {								\
> > > -		long *ret;						\
> > > -									\
> > > -		if (!gd->arch.clint) {					\
> > > -			ret = syscon_get_first_range(RISCV_SYSCON_CLINT); \
> > > -			if (IS_ERR(ret))				\
> > > -				return PTR_ERR(ret);			\
> > > -			gd->arch.clint = ret;				\
> > > -		}							\
> > > -	} while (0)
> > > -
> > >  int riscv_get_time(u64 *time)
> > >  {
> > > -	CLINT_BASE_GET();
> > > -
> > >  	*time = readq((void __iomem *)MTIME_REG(gd->arch.clint));
> > >  
> > >  	return 0;
> > > @@ -47,17 +33,24 @@ int riscv_get_time(u64 *time)
> > >  
> > >  int riscv_set_timecmp(int hart, u64 cmp)
> > >  {
> > > -	CLINT_BASE_GET();
> > > -
> > >  	writeq(cmp, (void __iomem *)MTIMECMP_REG(gd->arch.clint, hart));
> > >  
> > >  	return 0;
> > >  }
> > >  
> > > +int riscv_init_ipi(void)
> > > +{
> > > +		long *ret = syscon_get_first_range(RISCV_SYSCON_CLINT);
> > > +
> > > +		if (IS_ERR(ret))
> > > +			return PTR_ERR(ret);
> > > +		gd->arch.clint = ret;
> > > +
> > > +		return 0;
> > > +}
> > > +
> > 
> > Please fix the indentation here.
> > 
> 
> Ok.
> 
> > >  int riscv_send_ipi(int hart)
> > >  {
> > > -	CLINT_BASE_GET();
> > > -
> > >  	writel(1, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
> > >  
> > >  	return 0;
> > > @@ -65,8 +58,6 @@ int riscv_send_ipi(int hart)
> > >  
> > >  int riscv_clear_ipi(int hart)
> > >  {
> > > -	CLINT_BASE_GET();
> > > -
> > >  	writel(0, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
> > >  
> > >  	return 0;
> > > @@ -74,8 +65,6 @@ int riscv_clear_ipi(int hart)
> > >  
> > >  int riscv_get_ipi(int hart, int *pending)
> > >  {
> > > -	CLINT_BASE_GET();
> > > -
> > >  	*pending = readl((void __iomem *)MSIP_REG(gd->arch.clint, hart));
> > >  
> > >  	return 0;
> > > diff --git a/arch/riscv/lib/smp.c b/arch/riscv/lib/smp.c
> > > index 17adb35730..3b1e52e9b2 100644
> > > --- a/arch/riscv/lib/smp.c
> > > +++ b/arch/riscv/lib/smp.c
> > > @@ -12,38 +12,6 @@
> > >  
> > >  DECLARE_GLOBAL_DATA_PTR;
> > >  
> > > -/**
> > > - * riscv_send_ipi() - Send inter-processor interrupt (IPI)
> > > - *
> > > - * Platform code must provide this function.
> > > - *
> > > - * @hart: Hart ID of receiving hart
> > > - * @return 0 if OK, -ve on error
> > > - */
> > > -extern int riscv_send_ipi(int hart);
> > > -
> > > -/**
> > > - * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
> > > - *
> > > - * Platform code must provide this function.
> > > - *
> > > - * @hart: Hart ID of hart to be cleared
> > > - * @return 0 if OK, -ve on error
> > > - */
> > > -extern int riscv_clear_ipi(int hart);
> > > -
> > > -/**
> > > - * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
> > > - *
> > > - * Platform code must provide this function.
> > > - *
> > > - * @hart: Hart ID of hart to be checked
> > > - * @pending: Pointer to variable with result of the check,
> > > - *           1 if IPI is pending, 0 otherwise
> > > - * @return 0 if OK, -ve on error
> > > - */
> > > -extern int riscv_get_ipi(int hart, int *pending);
> > > -
> > >  static int send_ipi_many(struct ipi_data *ipi, int wait)
> > >  {
> > >  	ofnode node, cpus;
> > > @@ -110,37 +78,41 @@ void handle_ipi(ulong hart)
> > >  	int ret;
> > >  	void (*smp_function)(ulong hart, ulong arg0, ulong arg1);
> > >  
> > > -	if (hart >= CONFIG_NR_CPUS)
> > > +	if (hart >= CONFIG_NR_CPUS || !READ_ONCE(gd->arch.ipi_ready))
> > >  		return;
> > >  
> > > -	__smp_mb();
> > > -
> > > -	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
> > > -	invalidate_icache_all();
> > > -
> > 
> > Don't move this. It is intended to be run before the IPI is cleared.
> 
> Hm, ok. I think I moved it to after because of the 'if (!smp_function)'
> check, but those two don't really need to be done together.
> 

Thanks! We had problems with code corruption in some situations,
because some secondary harts entered OpenSBI after the main hart while
OpenSBI expected all harts to be running OpenSBI by that time. Moving
this code block was part of the fix for this situation, see [1].

[1]: 
https://gitlab.denx.de/u-boot/u-boot/commit/90ae28143700bae4edd23930a7772899ad259058

> > >  	/*
> > >  	 * Clear the IPI to acknowledge the request before jumping to the
> > >  	 * requested function.
> > >  	 */
> > >  	ret = riscv_clear_ipi(hart);
> > >  	if (ret) {
> > > -		pr_err("Cannot clear IPI of hart %ld\n", hart);
> > > +		pr_err("Cannot clear IPI of hart %ld (error %d)\n", hart, ret);
> > >  		return;
> > >  	}
> > >  
> > > +	__smp_mb();
> > > +
> > > +	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
> > > +	/*
> > > +	 * There may be an IPI raised before u-boot begins execution, so check
> > > +	 * to ensure we actually have a function to call.
> > > +	 */
> > > +	if (!smp_function)
> > > +		return;
> > > +	log_debug("hart = %lu func = %p\n", hart, smp_function);
> > 
> > The log messages might be corrupted if multiple harts are calling the
> > log function here. I have not looked into the details so this might not
> > be an issue. In that case it is fine to keep, otherwise please remove
> > it.
> 
> I ran into this problem a lot when debugging. I ended up implementing a
> spinlock around puts/putc. I agree it's probably better to remove this,
> but I worry that concurrency bugs will become much harder to track down
> without some kind of feedback. (This same criticism applies to the log
> message above as well).
> 

Especially with your changes, I hope we already have or will soon reach
a code robustness level where we won't have too many concurrency bugs
in the future. :)
Let's remove it for now until the logging backend can handle this
cleanly.

Thanks,
Lukas
Sean Anderson March 3, 2020, 9:57 p.m. UTC | #8
On 3/3/20 4:53 PM, Lukas Auer wrote:
> On Mon, 2020-03-02 at 18:43 -0500, Sean Anderson wrote:
>> On 3/2/20 6:17 PM, Lukas Auer wrote:
>>> Don't move this. It is intended to be run before the IPI is cleared.
>>
>> Hm, ok. I think I moved it to after because of the 'if (!smp_function)'
>> check, but those two don't really need to be done together.
>>
> 
> Thanks! We had problems with code corruption in some situations,
> because some secondary harts entered OpenSBI after the main hart while
> OpenSBI expected all harts to be running OpenSBI by that time. Moving
> this code block was part of the fix for this situation, see [1].
> 
> [1]: 
> https://gitlab.denx.de/u-boot/u-boot/commit/90ae28143700bae4edd23930a7772899ad259058

Ah, this makes a lot more sense why it was located where it was.

>>>>  	/*
>>>>  	 * Clear the IPI to acknowledge the request before jumping to the
>>>>  	 * requested function.
>>>>  	 */
>>>>  	ret = riscv_clear_ipi(hart);
>>>>  	if (ret) {
>>>> -		pr_err("Cannot clear IPI of hart %ld\n", hart);
>>>> +		pr_err("Cannot clear IPI of hart %ld (error %d)\n", hart, ret);
>>>>  		return;
>>>>  	}
>>>>  
>>>> +	__smp_mb();
>>>> +
>>>> +	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
>>>> +	/*
>>>> +	 * There may be an IPI raised before u-boot begins execution, so check
>>>> +	 * to ensure we actually have a function to call.
>>>> +	 */
>>>> +	if (!smp_function)
>>>> +		return;
>>>> +	log_debug("hart = %lu func = %p\n", hart, smp_function);
>>>
>>> The log messages might be corrupted if multiple harts are calling the
>>> log function here. I have not looked into the details so this might not
>>> be an issue. In that case it is fine to keep, otherwise please remove
>>> it.
>>
>> I ran into this problem a lot when debugging. I ended up implementing a
>> spinlock around puts/putc. I agree it's probably better to remove this,
>> but I worry that concurrency bugs will become much harder to track down
>> without some kind of feedback. (This same criticism applies to the log
>> message above as well).
>>
> 
> Especially with your changes, I hope we already have or will soon reach
> a code robustness level where we won't have too many concurrency bugs
> in the future. :)
> Let's remove it for now until the logging backend can handle this
> cleanly.

Ok. Should the error message above ("Cannot clear IPI of hart...") also
be removed? I found it tended to corrupt the log output if it was ever
triggered.

--Sean
Lukas Auer March 4, 2020, 3:25 p.m. UTC | #9
On Tue, 2020-03-03 at 16:57 -0500, Sean Anderson wrote:
> On 3/3/20 4:53 PM, Lukas Auer wrote:
> > On Mon, 2020-03-02 at 18:43 -0500, Sean Anderson wrote:
> > > On 3/2/20 6:17 PM, Lukas Auer wrote:
> > > > Don't move this. It is intended to be run before the IPI is cleared.
> > > 
> > > Hm, ok. I think I moved it to after because of the 'if (!smp_function)'
> > > check, but those two don't really need to be done together.
> > > 
> > 
> > Thanks! We had problems with code corruption in some situations,
> > because some secondary harts entered OpenSBI after the main hart while
> > OpenSBI expected all harts to be running OpenSBI by that time. Moving
> > this code block was part of the fix for this situation, see [1].
> > 
> > [1]: 
> > https://gitlab.denx.de/u-boot/u-boot/commit/90ae28143700bae4edd23930a7772899ad259058
> 
> Ah, this makes a lot more sense why it was located where it was.
> 
> > > > >  	/*
> > > > >  	 * Clear the IPI to acknowledge the request before jumping to the
> > > > >  	 * requested function.
> > > > >  	 */
> > > > >  	ret = riscv_clear_ipi(hart);
> > > > >  	if (ret) {
> > > > > -		pr_err("Cannot clear IPI of hart %ld\n", hart);
> > > > > +		pr_err("Cannot clear IPI of hart %ld (error %d)\n", hart, ret);
> > > > >  		return;
> > > > >  	}
> > > > >  
> > > > > +	__smp_mb();
> > > > > +
> > > > > +	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
> > > > > +	/*
> > > > > +	 * There may be an IPI raised before u-boot begins execution, so check
> > > > > +	 * to ensure we actually have a function to call.
> > > > > +	 */
> > > > > +	if (!smp_function)
> > > > > +		return;
> > > > > +	log_debug("hart = %lu func = %p\n", hart, smp_function);
> > > > 
> > > > The log messages might be corrupted if multiple harts are calling the
> > > > log function here. I have not looked into the details so this might not
> > > > be an issue. In that case it is fine to keep, otherwise please remove
> > > > it.
> > > 
> > > I ran into this problem a lot when debugging. I ended up implementing a
> > > spinlock around puts/putc. I agree it's probably better to remove this,
> > > but I worry that concurrency bugs will become much harder to track down
> > > without some kind of feedback. (This same criticism applies to the log
> > > message above as well).
> > > 
> > 
> > Especially with your changes, I hope we already have or will soon reach
> > a code robustness level where we won't have too many concurrency bugs
> > in the future. :)
> > Let's remove it for now until the logging backend can handle this
> > cleanly.
> 
> Ok. Should the error message above ("Cannot clear IPI of hart...") also
> be removed? I found it tended to corrupt the log output if it was ever
> triggered.
> 

Even though it's not ideal, we should keep it for now. Otherwise we
don't have a way to get notified about the error.

Thanks,
Lukas
Rick Chen March 5, 2020, 2:18 a.m. UTC | #10
Hi Sean

> Hi Sean
>
> > On Mon, 2020-03-02 at 10:43 -0500, Sean Anderson wrote:
> >
> > > On 3/2/20 4:08 AM, Rick Chen wrote:
> > > > Hi Sean
> > > >
> > > > > The IPI code could have race conditions in several places.
> > > > > * Several harts could race on the value of gd->arch->clint/plic
> > > > > * Non-boot harts could race with the main hart on the DM subsystem In
> > > > >   addition, if an IPI was pending when U-Boot started, it would cause the
> > > > >   IPI handler to jump to address 0.
> > > > >
> > > > > To address these problems, a new function riscv_init_ipi is introduced. It
> > > > > is called once during arch_cpu_init_dm. Before this point, no riscv_*_ipi
> > > > > functions may be called. Access is synchronized by gd->arch->ipi_ready.
> > > > >
> > > > > Signed-off-by: Sean Anderson <seanga2@gmail.com>
> > > > > ---
> > > > >
> > > > > Changes in v5:
> > > > > - New
> > > > >
> > > > >  arch/riscv/cpu/cpu.c                 |  9 ++++
> > > > >  arch/riscv/include/asm/global_data.h |  1 +
> > > > >  arch/riscv/include/asm/smp.h         | 43 ++++++++++++++++++
> > > > >  arch/riscv/lib/andes_plic.c          | 34 +++++---------
> > > > >  arch/riscv/lib/sbi_ipi.c             |  5 ++
> > > > >  arch/riscv/lib/sifive_clint.c        | 33 +++++---------
> > > > >  arch/riscv/lib/smp.c                 | 68 ++++++++--------------------
> > > > >  7 files changed, 101 insertions(+), 92 deletions(-)
> > > > >
> > > > > diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
> > > > > index e457f6acbf..a971ec8694 100644
> > > > > --- a/arch/riscv/cpu/cpu.c
> > > > > +++ b/arch/riscv/cpu/cpu.c
> > > > > @@ -96,6 +96,15 @@ int arch_cpu_init_dm(void)
> > > > >                         csr_write(CSR_SATP, 0);
> > > > >         }
> > > > >
> > > > > +#ifdef CONFIG_SMP
> > > > > +       ret = riscv_init_ipi();
> > > > > +       if (ret)
> > > > > +               return ret;
> > > > > +
> > > > > +       /* Atomically set a flag enabling IPI handling */
> > > > > +       WRITE_ONCE(gd->arch.ipi_ready, 1);
> > > >
> > > > I think it shall not have race condition here.
> > > > Can you explain more detail why there will occur race condition ?
> > > >
> > > > Hi Lukas
> > > >
> > > > Do you have any comments ?
> > > >
> > > > Thanks
> > > > Rick
> > >
> > > On the K210, there may already be an IPI pending when U-Boot starts.
> > > (Perhaps the prior stage sends an IPI but does not clear it). As soon as
> > > interrupts are enabled, the hart then tries to call riscv_clear_ipi().
> > > Because the clint/plic has not yet been enabled, the clear_ipi function
> > > will try and bind/probe the device. This can have really nasty effects, since
> > > the boot hart is *also* trying to bind/probe devices.
> > >
> > > In addition, a hart could end up trying to probe the clint/plic because
> > > it could receive the IPI before (from its perspective) gd->arch.clint
> > > (or plic) gets initialized.
> > >
> >
> > We did not have a problem with pending IPIs on other platforms. It
> > should suffice to clear SSIP / MSIP before enabling the interrupts.
> >
>
> Can you try to clear mip/sip in startup flow before secondary_hart_loop:
> Maybe it can help to overcome the problem of calling riscv_clear_ipi()
> before riscv_init_ipi() that you added.

How about the verified result ?
Is this can help to fix your problem without any modification (the
original flow)
Avoid unexpected condition can increase the robustness indeed.
But clarify the root cause more precisely is still necessary here.

Thanks,
Rick

>
> Thanks,
> Rick
>
> > > Aside from the above, I think the macro approach is a bit confusing,
> > > since it's unclear at first glance what function will be initializing
> > > the clint/plic. Given U-Boot's otherwise completely SMP-unsafe design, I
> > > think it's better to be explicit and conservative in these areas.
> > >
> >
> > I agree, the patch makes this more clear and helps make the code more
> > robust.
> >
> > Thanks,
> > Lukas
diff mbox series

Patch

diff --git a/arch/riscv/cpu/cpu.c b/arch/riscv/cpu/cpu.c
index e457f6acbf..a971ec8694 100644
--- a/arch/riscv/cpu/cpu.c
+++ b/arch/riscv/cpu/cpu.c
@@ -96,6 +96,15 @@  int arch_cpu_init_dm(void)
 			csr_write(CSR_SATP, 0);
 	}
 
+#ifdef CONFIG_SMP
+	ret = riscv_init_ipi();
+	if (ret)
+		return ret;
+
+	/* Atomically set a flag enabling IPI handling */
+	WRITE_ONCE(gd->arch.ipi_ready, 1);
+#endif
+
 	return 0;
 }
 
diff --git a/arch/riscv/include/asm/global_data.h b/arch/riscv/include/asm/global_data.h
index 7276d9763f..b24f8fd2a7 100644
--- a/arch/riscv/include/asm/global_data.h
+++ b/arch/riscv/include/asm/global_data.h
@@ -28,6 +28,7 @@  struct arch_global_data {
 #endif
 #ifdef CONFIG_SMP
 	struct ipi_data ipi[CONFIG_NR_CPUS];
+	long ipi_ready; /* Set after riscv_init_ipi is called */
 #endif
 #ifndef CONFIG_XIP
 	ulong available_harts;
diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
index 74de92ed13..1b428856b2 100644
--- a/arch/riscv/include/asm/smp.h
+++ b/arch/riscv/include/asm/smp.h
@@ -51,4 +51,47 @@  void handle_ipi(ulong hart);
  */
 int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait);
 
+/**
+ * riscv_init_ipi() - Initialize inter-process interrupt (IPI) driver
+ *
+ * Platform code must provide this function. This function is called once after
+ * the cpu driver is initialized. No other riscv_*_ipi() calls will be made
+ * before this function is called.
+ *
+ * @return 0 if OK, -ve on error
+ */
+int riscv_init_ipi(void);
+
+/**
+ * riscv_send_ipi() - Send inter-processor interrupt (IPI)
+ *
+ * Platform code must provide this function.
+ *
+ * @hart: Hart ID of receiving hart
+ * @return 0 if OK, -ve on error
+ */
+int riscv_send_ipi(int hart);
+
+/**
+ * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
+ *
+ * Platform code must provide this function.
+ *
+ * @hart: Hart ID of hart to be cleared
+ * @return 0 if OK, -ve on error
+ */
+int riscv_clear_ipi(int hart);
+
+/**
+ * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
+ *
+ * Platform code must provide this function.
+ *
+ * @hart: Hart ID of hart to be checked
+ * @pending: Pointer to variable with result of the check,
+ *           1 if IPI is pending, 0 otherwise
+ * @return 0 if OK, -ve on error
+ */
+int riscv_get_ipi(int hart, int *pending);
+
 #endif
diff --git a/arch/riscv/lib/andes_plic.c b/arch/riscv/lib/andes_plic.c
index 20529ab3eb..8484f76386 100644
--- a/arch/riscv/lib/andes_plic.c
+++ b/arch/riscv/lib/andes_plic.c
@@ -30,20 +30,6 @@ 
 #define SEND_IPI_TO_HART(hart)  (0x80 >> (hart))
 
 DECLARE_GLOBAL_DATA_PTR;
-static int init_plic(void);
-
-#define PLIC_BASE_GET(void)						\
-	do {								\
-		long *ret;						\
-									\
-		if (!gd->arch.plic) {					\
-			ret = syscon_get_first_range(RISCV_SYSCON_PLIC); \
-			if (IS_ERR(ret))				\
-				return PTR_ERR(ret);			\
-			gd->arch.plic = ret;				\
-			init_plic();					\
-		}							\
-	} while (0)
 
 static int enable_ipi(int hart)
 {
@@ -93,13 +79,21 @@  static int init_plic(void)
 	return -ENODEV;
 }
 
+int riscv_init_ipi(void)
+{
+	int ret = syscon_get_first_range(RISCV_SYSCON_PLIC);
+
+	if (IS_ERR(ret))
+		return PTR_ERR(ret);
+	gd->arch.plic = ret;
+
+	return init_plic();
+}
+
 int riscv_send_ipi(int hart)
 {
-	unsigned int ipi;
+	unsigned int ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
 
-	PLIC_BASE_GET();
-
-	ipi = (SEND_IPI_TO_HART(hart) << (8 * gd->arch.boot_hart));
 	writel(ipi, (void __iomem *)PENDING_REG(gd->arch.plic,
 				gd->arch.boot_hart));
 
@@ -110,8 +104,6 @@  int riscv_clear_ipi(int hart)
 {
 	u32 source_id;
 
-	PLIC_BASE_GET();
-
 	source_id = readl((void __iomem *)CLAIM_REG(gd->arch.plic, hart));
 	writel(source_id, (void __iomem *)CLAIM_REG(gd->arch.plic, hart));
 
@@ -120,8 +112,6 @@  int riscv_clear_ipi(int hart)
 
 int riscv_get_ipi(int hart, int *pending)
 {
-	PLIC_BASE_GET();
-
 	*pending = readl((void __iomem *)PENDING_REG(gd->arch.plic,
 						     gd->arch.boot_hart));
 	*pending = !!(*pending & SEND_IPI_TO_HART(hart));
diff --git a/arch/riscv/lib/sbi_ipi.c b/arch/riscv/lib/sbi_ipi.c
index 9a698ce74e..310d1bd2a4 100644
--- a/arch/riscv/lib/sbi_ipi.c
+++ b/arch/riscv/lib/sbi_ipi.c
@@ -7,6 +7,11 @@ 
 #include <common.h>
 #include <asm/sbi.h>
 
+int riscv_init_ipi(void)
+{
+	return 0;
+}
+
 int riscv_send_ipi(int hart)
 {
 	ulong mask;
diff --git a/arch/riscv/lib/sifive_clint.c b/arch/riscv/lib/sifive_clint.c
index 5e0d25720b..62c1b2b0ef 100644
--- a/arch/riscv/lib/sifive_clint.c
+++ b/arch/riscv/lib/sifive_clint.c
@@ -24,22 +24,8 @@ 
 
 DECLARE_GLOBAL_DATA_PTR;
 
-#define CLINT_BASE_GET(void)						\
-	do {								\
-		long *ret;						\
-									\
-		if (!gd->arch.clint) {					\
-			ret = syscon_get_first_range(RISCV_SYSCON_CLINT); \
-			if (IS_ERR(ret))				\
-				return PTR_ERR(ret);			\
-			gd->arch.clint = ret;				\
-		}							\
-	} while (0)
-
 int riscv_get_time(u64 *time)
 {
-	CLINT_BASE_GET();
-
 	*time = readq((void __iomem *)MTIME_REG(gd->arch.clint));
 
 	return 0;
@@ -47,17 +33,24 @@  int riscv_get_time(u64 *time)
 
 int riscv_set_timecmp(int hart, u64 cmp)
 {
-	CLINT_BASE_GET();
-
 	writeq(cmp, (void __iomem *)MTIMECMP_REG(gd->arch.clint, hart));
 
 	return 0;
 }
 
+int riscv_init_ipi(void)
+{
+		long *ret = syscon_get_first_range(RISCV_SYSCON_CLINT);
+
+		if (IS_ERR(ret))
+			return PTR_ERR(ret);
+		gd->arch.clint = ret;
+
+		return 0;
+}
+
 int riscv_send_ipi(int hart)
 {
-	CLINT_BASE_GET();
-
 	writel(1, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
 
 	return 0;
@@ -65,8 +58,6 @@  int riscv_send_ipi(int hart)
 
 int riscv_clear_ipi(int hart)
 {
-	CLINT_BASE_GET();
-
 	writel(0, (void __iomem *)MSIP_REG(gd->arch.clint, hart));
 
 	return 0;
@@ -74,8 +65,6 @@  int riscv_clear_ipi(int hart)
 
 int riscv_get_ipi(int hart, int *pending)
 {
-	CLINT_BASE_GET();
-
 	*pending = readl((void __iomem *)MSIP_REG(gd->arch.clint, hart));
 
 	return 0;
diff --git a/arch/riscv/lib/smp.c b/arch/riscv/lib/smp.c
index 17adb35730..3b1e52e9b2 100644
--- a/arch/riscv/lib/smp.c
+++ b/arch/riscv/lib/smp.c
@@ -12,38 +12,6 @@ 
 
 DECLARE_GLOBAL_DATA_PTR;
 
-/**
- * riscv_send_ipi() - Send inter-processor interrupt (IPI)
- *
- * Platform code must provide this function.
- *
- * @hart: Hart ID of receiving hart
- * @return 0 if OK, -ve on error
- */
-extern int riscv_send_ipi(int hart);
-
-/**
- * riscv_clear_ipi() - Clear inter-processor interrupt (IPI)
- *
- * Platform code must provide this function.
- *
- * @hart: Hart ID of hart to be cleared
- * @return 0 if OK, -ve on error
- */
-extern int riscv_clear_ipi(int hart);
-
-/**
- * riscv_get_ipi() - Get status of inter-processor interrupt (IPI)
- *
- * Platform code must provide this function.
- *
- * @hart: Hart ID of hart to be checked
- * @pending: Pointer to variable with result of the check,
- *           1 if IPI is pending, 0 otherwise
- * @return 0 if OK, -ve on error
- */
-extern int riscv_get_ipi(int hart, int *pending);
-
 static int send_ipi_many(struct ipi_data *ipi, int wait)
 {
 	ofnode node, cpus;
@@ -110,37 +78,41 @@  void handle_ipi(ulong hart)
 	int ret;
 	void (*smp_function)(ulong hart, ulong arg0, ulong arg1);
 
-	if (hart >= CONFIG_NR_CPUS)
+	if (hart >= CONFIG_NR_CPUS || !READ_ONCE(gd->arch.ipi_ready))
 		return;
 
-	__smp_mb();
-
-	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
-	invalidate_icache_all();
-
 	/*
 	 * Clear the IPI to acknowledge the request before jumping to the
 	 * requested function.
 	 */
 	ret = riscv_clear_ipi(hart);
 	if (ret) {
-		pr_err("Cannot clear IPI of hart %ld\n", hart);
+		pr_err("Cannot clear IPI of hart %ld (error %d)\n", hart, ret);
 		return;
 	}
 
+	__smp_mb();
+
+	smp_function = (void (*)(ulong, ulong, ulong))gd->arch.ipi[hart].addr;
+	/*
+	 * There may be an IPI raised before u-boot begins execution, so check
+	 * to ensure we actually have a function to call.
+	 */
+	if (!smp_function)
+		return;
+	log_debug("hart = %lu func = %p\n", hart, smp_function);
+	invalidate_icache_all();
+
 	smp_function(hart, gd->arch.ipi[hart].arg0, gd->arch.ipi[hart].arg1);
 }
 
 int smp_call_function(ulong addr, ulong arg0, ulong arg1, int wait)
 {
-	int ret = 0;
-	struct ipi_data ipi;
+	struct ipi_data ipi = {
+		.addr = addr,
+		.arg0 = arg0,
+		.arg1 = arg1,
+	};
 
-	ipi.addr = addr;
-	ipi.arg0 = arg0;
-	ipi.arg1 = arg1;
-
-	ret = send_ipi_many(&ipi, wait);
-
-	return ret;
+	return send_ipi_many(&ipi, wait);
 }