diff mbox series

[RFC,1/3] riscv: set HAVE_EFFICIENT_UNALIGNED_ACCESS

Message ID 20190115083518.10149-2-bjorn.topel@gmail.com
State RFC
Delegated to: BPF Maintainers
Headers show
Series RV64G eBPF JIT | expand

Commit Message

Björn Töpel Jan. 15, 2019, 8:35 a.m. UTC
Signed-off-by: Björn Töpel <bjorn.topel@gmail.com>
---
 arch/riscv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

Comments

Christoph Hellwig Jan. 15, 2019, 3:39 p.m. UTC | #1
Hmm, while the RISC-V spec requires misaligned load/store support,
who says they are efficient?  Maybe add a little comment that says
on which cpus they are efficient.
Björn Töpel Jan. 15, 2019, 4:06 p.m. UTC | #2
Den tis 15 jan. 2019 kl 16:39 skrev Christoph Hellwig <hch@infradead.org>:
>
> Hmm, while the RISC-V spec requires misaligned load/store support,
> who says they are efficient?  Maybe add a little comment that says
> on which cpus they are efficient.

Good point! :-) I need to check how other architectures does this.
Enabling it for *all* RV64 is probably not correct.
Palmer Dabbelt Jan. 25, 2019, 8:21 p.m. UTC | #3
On Tue, 15 Jan 2019 08:06:47 PST (-0800), bjorn.topel@gmail.com wrote:
> Den tis 15 jan. 2019 kl 16:39 skrev Christoph Hellwig <hch@infradead.org>:
>>
>> Hmm, while the RISC-V spec requires misaligned load/store support,
>> who says they are efficient?  Maybe add a little comment that says
>> on which cpus they are efficient.
>
> Good point! :-) I need to check how other architectures does this.
> Enabling it for *all* RV64 is probably not correct.

RISC-V mandates that misaligned memory accesses execute correctly in S-mode, 
but allow them to be trapped and emulated in M-mode.  As a result they can be 
quite slow.  Every microarchitecture I know of traps misaligned accesses into 
M-mode, so for now we're probably safe just unconditionally saying they're 
slow.

GCC does have a tuning parameter that says "are misaligned accesses fast?" that 
we set depending on -mtune, but it doesn't appear to be exposed as a 
preprocessor macro.  I think it's probably best to just expose the tuning 
parameter as a macro so software that needs to know this has one standard way 
of doing it.

Jim, would you be opposed to something like this?

    diff --git a/riscv-c-api.md b/riscv-c-api.md
    index 0b0236c38826..a790f5cc23ee 100644
    --- a/riscv-c-api.md
    +++ b/riscv-c-api.md
    @@ -52,6 +52,10 @@ https://creativecommons.org/licenses/by/4.0/.
     * `__riscv_cmodel_medlow`
     * `__riscv_cmodel_medany`
     * `__riscv_cmodel_pic`
    +* `__riscv_tune_misaligned_load_cost`: The number of cycles a word-sized
    +  misaligned load will take.
    +* `__riscv_tune_misaligned_store_cost`: The number of cycles a word-sized
    +  misaligned store will take.
    
     ## Function Attributes

Which I think shouldn't be too much of a headache to implement in GCC -- I 
haven't compiled this yet, though...

    diff --git a/gcc/config/riscv/riscv-c.c b/gcc/config/riscv/riscv-c.c
    index ca72de74a7b4..fa71a4a22104 100644
    --- a/gcc/config/riscv/riscv-c.c
    +++ b/gcc/config/riscv/riscv-c.c
    @@ -98,4 +98,9 @@ riscv_cpu_cpp_builtins (cpp_reader *pfile)
           builtin_define ("__riscv_cmodel_pic");
           break;
         }
    +
    +    builtin_define_with_int_value ("__riscv_tune_misaligned_load_cost",
    +                                   riscv_tune_info->slow_unaligned_access ? 1024 : 1);
    +    builtin_define_with_int_value ("__riscv_tune_misaligned_store_cost",
    +                                   riscv_tune_info->slow_unaligned_access ? 1024 : 1);
     }
    diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
    index a3ab6cec33b4..d58a307d27b4 100644
    --- a/gcc/config/riscv/riscv-opts.h
    +++ b/gcc/config/riscv/riscv-opts.h
    @@ -39,4 +39,6 @@ enum riscv_code_model {
     };
     extern enum riscv_code_model riscv_cmodel;
     
    +extern struct riscv_tune_info riscv_tune_info;
    +
     #endif /* ! GCC_RISCV_OPTS_H */
    diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
    index bf4571d91b8c..671c2ddaaa0f 100644
    --- a/gcc/config/riscv/riscv.c
    +++ b/gcc/config/riscv/riscv.c
    @@ -226,7 +226,7 @@ struct riscv_cpu_info {
       const char *name;
     
       /* Tuning parameters for this CPU.  */
    -  const struct riscv_tune_info *tune_info;
    +  const struct riscv_tune_info *riscv_tune_info;
     };
     
     /* Global variables for machine-dependent things.  */
    @@ -243,7 +243,7 @@ unsigned riscv_stack_boundary;
     static int epilogue_cfa_sp_offset;
     
     /* Which tuning parameters to use.  */
    -static const struct riscv_tune_info *tune_info;
    +const struct riscv_tune_info *riscv_tune_info;
     
     /* Index R is the smallest register class that contains register R.  */
     const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = {
    @@ -1528,7 +1528,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
     	 instructions it needs.  */
           if ((cost = riscv_address_insns (XEXP (x, 0), mode, true)) > 0)
     	{
    -	  *total = COSTS_N_INSNS (cost + tune_info->memory_cost);
    +	  *total = COSTS_N_INSNS (cost + riscv_tune_info->memory_cost);
     	  return true;
     	}
           /* Otherwise use the default handling.  */
    @@ -1592,7 +1592,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
     	 mode instead.  */
           mode = GET_MODE (XEXP (x, 0));
           if (float_mode_p)
    -	*total = tune_info->fp_add[mode == DFmode];
    +	*total = riscv_tune_info->fp_add[mode == DFmode];
           else
     	*total = riscv_binary_cost (x, 1, 3);
           return false;
    @@ -1601,14 +1601,14 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         case ORDERED:
           /* (FEQ(A, A) & FEQ(B, B)) compared against 0.  */
           mode = GET_MODE (XEXP (x, 0));
    -      *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (2);
    +      *total = riscv_tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (2);
           return false;
     
         case UNEQ:
         case LTGT:
           /* (FEQ(A, A) & FEQ(B, B)) compared against FEQ(A, B).  */
           mode = GET_MODE (XEXP (x, 0));
    -      *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (3);
    +      *total = riscv_tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (3);
           return false;
     
         case UNGE:
    @@ -1617,13 +1617,13 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         case UNLT:
           /* FLT or FLE, but guarded by an FFLAGS read and write.  */
           mode = GET_MODE (XEXP (x, 0));
    -      *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (4);
    +      *total = riscv_tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (4);
           return false;
     
         case MINUS:
         case PLUS:
           if (float_mode_p)
    -	*total = tune_info->fp_add[mode == DFmode];
    +	*total = riscv_tune_info->fp_add[mode == DFmode];
           else
     	*total = riscv_binary_cost (x, 1, 4);
           return false;
    @@ -1633,7 +1633,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
     	rtx op = XEXP (x, 0);
     	if (GET_CODE (op) == FMA && !HONOR_SIGNED_ZEROS (mode))
     	  {
    -	    *total = (tune_info->fp_mul[mode == DFmode]
    +	    *total = (riscv_tune_info->fp_mul[mode == DFmode]
     		      + set_src_cost (XEXP (op, 0), mode, speed)
     		      + set_src_cost (XEXP (op, 1), mode, speed)
     		      + set_src_cost (XEXP (op, 2), mode, speed));
    @@ -1642,23 +1642,23 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
           }
     
           if (float_mode_p)
    -	*total = tune_info->fp_add[mode == DFmode];
    +	*total = riscv_tune_info->fp_add[mode == DFmode];
           else
     	*total = COSTS_N_INSNS (GET_MODE_SIZE (mode) > UNITS_PER_WORD ? 4 : 1);
           return false;
     
         case MULT:
           if (float_mode_p)
    -	*total = tune_info->fp_mul[mode == DFmode];
    +	*total = riscv_tune_info->fp_mul[mode == DFmode];
           else if (!TARGET_MUL)
     	/* Estimate the cost of a library call.  */
     	*total = COSTS_N_INSNS (speed ? 32 : 6);
           else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    -	*total = 3 * tune_info->int_mul[0] + COSTS_N_INSNS (2);
    +	*total = 3 * riscv_tune_info->int_mul[0] + COSTS_N_INSNS (2);
           else if (!speed)
     	*total = COSTS_N_INSNS (1);
           else
    -	*total = tune_info->int_mul[mode == DImode];
    +	*total = riscv_tune_info->int_mul[mode == DImode];
           return false;
     
         case DIV:
    @@ -1666,7 +1666,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         case MOD:
           if (float_mode_p)
     	{
    -	  *total = tune_info->fp_div[mode == DFmode];
    +	  *total = riscv_tune_info->fp_div[mode == DFmode];
     	  return false;
     	}
           /* Fall through.  */
    @@ -1677,7 +1677,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
     	/* Estimate the cost of a library call.  */
     	*total = COSTS_N_INSNS (speed ? 32 : 6);
           else if (speed)
    -	*total = tune_info->int_div[mode == DImode];
    +	*total = riscv_tune_info->int_div[mode == DImode];
           else
     	*total = COSTS_N_INSNS (1);
           return false;
    @@ -1699,11 +1699,11 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         case FIX:
         case FLOAT_EXTEND:
         case FLOAT_TRUNCATE:
    -      *total = tune_info->fp_add[mode == DFmode];
    +      *total = riscv_tune_info->fp_add[mode == DFmode];
           return false;
     
         case FMA:
    -      *total = (tune_info->fp_mul[mode == DFmode]
    +      *total = (riscv_tune_info->fp_mul[mode == DFmode]
     		+ set_src_cost (XEXP (x, 0), mode, speed)
     		+ set_src_cost (XEXP (x, 1), mode, speed)
     		+ set_src_cost (XEXP (x, 2), mode, speed));
    @@ -4165,7 +4165,7 @@ riscv_class_max_nregs (reg_class_t rclass, machine_mode mode)
     static int
     riscv_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in)
     {
    -  return (tune_info->memory_cost
    +  return (riscv_tune_info->memory_cost
     	  + memory_move_secondary_cost (mode, rclass, in));
     }
     
    @@ -4174,7 +4174,7 @@ riscv_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in)
     static int
     riscv_issue_rate (void)
     {
    -  return tune_info->issue_rate;
    +  return riscv_tune_info->issue_rate;
     }
     
     /* Implement TARGET_ASM_FILE_START.  */
    @@ -4307,22 +4307,22 @@ riscv_option_override (void)
       /* Handle -mtune.  */
       cpu = riscv_parse_cpu (riscv_tune_string ? riscv_tune_string :
     			 RISCV_TUNE_STRING_DEFAULT);
    -  tune_info = optimize_size ? &optimize_size_tune_info : cpu->tune_info;
    +  riscv_tune_info = optimize_size ? &optimize_size_tune_info : cpu->riscv_tune_info;
     
       /* Use -mtune's setting for slow_unaligned_access, even when optimizing
          for size.  For architectures that trap and emulate unaligned accesses,
          the performance cost is too great, even for -Os.  Similarly, if
          -m[no-]strict-align is left unspecified, heed -mtune's advice.  */
    -  riscv_slow_unaligned_access_p = (cpu->tune_info->slow_unaligned_access
    +  riscv_slow_unaligned_access_p = (cpu->riscv_tune_info->slow_unaligned_access
     				   || TARGET_STRICT_ALIGN);
       if ((target_flags_explicit & MASK_STRICT_ALIGN) == 0
    -      && cpu->tune_info->slow_unaligned_access)
    +      && cpu->riscv_tune_info->slow_unaligned_access)
         target_flags |= MASK_STRICT_ALIGN;
     
       /* If the user hasn't specified a branch cost, use the processor's
          default.  */
       if (riscv_branch_cost == 0)
    -    riscv_branch_cost = tune_info->branch_cost;
    +    riscv_branch_cost = riscv_tune_info->branch_cost;
     
       /* Function to allocate machine-dependent function status.  */
       init_machine_status = &riscv_init_machine_status;
Jim Wilson Jan. 26, 2019, 1:33 a.m. UTC | #4
On Fri, Jan 25, 2019 at 12:21 PM Palmer Dabbelt <palmer@sifive.com> wrote:
> Jim, would you be opposed to something like this?

This looks OK to me.

>     +    builtin_define_with_int_value ("__riscv_tune_misaligned_load_cost",
>     +                                   riscv_tune_info->slow_unaligned_access ? 1024 : 1);
>     +    builtin_define_with_int_value ("__riscv_tune_misaligned_store_cost",
>     +                                   riscv_tune_info->slow_unaligned_access ? 1024 : 1);

It would be nice to have a better way to compute these values, maybe
an extra field in the tune structure, but we can always worry about
that later when we need it.

Jim
Palmer Dabbelt Jan. 29, 2019, 2:43 a.m. UTC | #5
On Fri, 25 Jan 2019 17:33:50 PST (-0800), Jim Wilson wrote:
> On Fri, Jan 25, 2019 at 12:21 PM Palmer Dabbelt <palmer@sifive.com> wrote:
>> Jim, would you be opposed to something like this?
>
> This looks OK to me.

OK, thanks.  I'll send some patches around :)

>
>>     +    builtin_define_with_int_value ("__riscv_tune_misaligned_load_cost",
>>     +                                   riscv_tune_info->slow_unaligned_access ? 1024 : 1);
>>     +    builtin_define_with_int_value ("__riscv_tune_misaligned_store_cost",
>>     +                                   riscv_tune_info->slow_unaligned_access ? 1024 : 1);
>
> It would be nice to have a better way to compute these values, maybe
> an extra field in the tune structure, but we can always worry about
> that later when we need it.

I agree.  I just went and designed the external interface first and hid the 
ugliness here.  The internal interfaces are easier to change :)
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index feeeaa60697c..f13220904d7c 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -49,6 +49,7 @@  config RISCV
 	select RISCV_TIMER
 	select GENERIC_IRQ_MULTI_HANDLER
 	select ARCH_HAS_PTE_SPECIAL
+	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 
 config MMU
 	def_bool y