diff mbox series

AArch64[RFC] Force complicated constant to memory when beneficial

Message ID patch-14775-tamar@arm.com
State New
Headers show
Series AArch64[RFC] Force complicated constant to memory when beneficial | expand

Commit Message

Tamar Christina Aug. 31, 2021, 1:26 p.m. UTC
Hi All,

Consider the following case

#include <arm_neon.h>

uint64_t
test4 (uint8x16_t input)
{
    uint8x16_t bool_input = vshrq_n_u8(input, 7);
    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
				vgetq_lane_p64(mask, 0));
    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
    return vget_lane_u16((uint16x4_t)res, 3);
}

which generates (after my CSE patches):

test4:
	ushr	v0.16b, v0.16b, 7
	mov	x0, 16512
	movk	x0, 0x1020, lsl 16
	movk	x0, 0x408, lsl 32
	movk	x0, 0x102, lsl 48
	fmov	d1, x0
	pmull	v2.1q, v0.1d, v1.1d
	dup	v1.2d, v1.d[0]
	pmull2	v0.1q, v0.2d, v1.2d
	trn2	v2.8b, v2.8b, v0.8b
	umov	w0, v2.h[3]
	ret

which is suboptimal since the constant is never needed on the genreg side and
should have been materialized on the SIMD side since the constant is so big
that it requires 5 instructions to create otherwise: 4 mov/movk and one fmov.

The problem is that the choice of which side to materialize the constant on can
only be made during reload.  We may need an extra register (to hold the
addressing) and so can't be done after reload.

I have tried to support this with a pattern during reload, but the problem is I
can't seem to find a way to tell reload it should spill a constant under
condition x.  Instead I tried with a split which reload selects when the
condition holds.

This has a couple of issues:

1. The pattern can be expanded late (could be fixed with !reload_completed).
2. Because it's split so late we can't seem to be able to share the anchors for
   the ADRP.
3. Because it's split so late and basically reload doesn't know about the spill
   and so the ADD lo12 isn't pushed into the addressing mode of the LDR.

I don't know how to properly fix these since I think the only way is for reload
to do the spill properly itself, but in this case not having the pattern makes it
avoid the mem pattern and pick r <- n instead followed by r -> w.

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> W.
	* config/aarch64/constraints.md (Dx): New.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644


--

Comments

Richard Sandiford Oct. 8, 2021, 4:12 p.m. UTC | #1
Catching up on backlog, sorry for the very late response:

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> Consider the following case
>
> #include <arm_neon.h>
>
> uint64_t
> test4 (uint8x16_t input)
> {
>     uint8x16_t bool_input = vshrq_n_u8(input, 7);
>     poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
>     poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> 				vgetq_lane_p64(mask, 0));
>     poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
>     uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
>     return vget_lane_u16((uint16x4_t)res, 3);
> }
>
> which generates (after my CSE patches):
>
> test4:
> 	ushr	v0.16b, v0.16b, 7
> 	mov	x0, 16512
> 	movk	x0, 0x1020, lsl 16
> 	movk	x0, 0x408, lsl 32
> 	movk	x0, 0x102, lsl 48
> 	fmov	d1, x0
> 	pmull	v2.1q, v0.1d, v1.1d
> 	dup	v1.2d, v1.d[0]
> 	pmull2	v0.1q, v0.2d, v1.2d
> 	trn2	v2.8b, v2.8b, v0.8b
> 	umov	w0, v2.h[3]
> 	ret
>
> which is suboptimal since the constant is never needed on the genreg side and
> should have been materialized on the SIMD side since the constant is so big
> that it requires 5 instructions to create otherwise: 4 mov/movk and one fmov.
>
> The problem is that the choice of which side to materialize the constant on can
> only be made during reload.  We may need an extra register (to hold the
> addressing) and so can't be done after reload.
>
> I have tried to support this with a pattern during reload, but the problem is I
> can't seem to find a way to tell reload it should spill a constant under
> condition x.  Instead I tried with a split which reload selects when the
> condition holds.

If this is still an issue, one thing to try would be to put a "$" before
the "r" in the GPR alternative.  If that doesn't work then yeah,
I think we're out of luck describing this directly.  If "$" does work,
it'd be interesting to see whether "^" does too.

Thanks,
Richard

>
> This has a couple of issues:
>
> 1. The pattern can be expanded late (could be fixed with !reload_completed).
> 2. Because it's split so late we can't seem to be able to share the anchors for
>    the ADRP.
> 3. Because it's split so late and basically reload doesn't know about the spill
>    and so the ADD lo12 isn't pushed into the addressing mode of the LDR.
>
> I don't know how to properly fix these since I think the only way is for reload
> to do the spill properly itself, but in this case not having the pattern makes it
> avoid the mem pattern and pick r <- n instead followed by r -> w.
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> W.
> 	* config/aarch64/constraints.md (Dx): New.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
>  )
>  
>  (define_insn_and_split "*movdi_aarch64"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,  r,  r, w,r,w, w")
> -	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w  ,r  ,r,w, m,m,  r,  r, w,r,w,w")
> +	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
>    "(register_operand (operands[0], DImode)
>      || aarch64_reg_or_zero (operands[1], DImode))"
>    "@
> @@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
>     mov\\t%x0, %1
>     mov\\t%w0, %1
>     #
> +   #
>     * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
>     ldr\\t%x0, %1
>     ldr\\t%d0, %1
> @@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
>     fmov\\t%d0, %d1
>     * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
>     "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
> -    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> +    && REG_P (operands[0])
> +    && (GP_REGNUM_P (REGNO (operands[0]))
> +	|| (can_create_pseudo_p ()
> +	    && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
>     [(const_int 0)]
>     "{
> -       aarch64_expand_mov_immediate (operands[0], operands[1]);
> +       if (GP_REGNUM_P (REGNO (operands[0])))
> +	 aarch64_expand_mov_immediate (operands[0], operands[1]);
> +       else
> +	 {
> +	   rtx mem = force_const_mem (DImode, operands[1]);
> +	   gcc_assert (mem);
> +	   emit_move_insn (operands[0], mem);
> +	 }
>         DONE;
>      }"
>    ;; The "mov_imm" type for CNTD is just a placeholder.
> -  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
> +  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
>  		     load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
>  		     neon_move")
> -   (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
> +   (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
>  )
>  
>  (define_insn "insv_imm<mode>"
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -474,6 +474,14 @@ (define_address_constraint "Dp"
>   An address valid for a prefetch instruction."
>   (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
>  
> +(define_constraint "Dx"
> +  "@internal
> + A constraint that matches an integer immediate operand not valid\
> + for AdvSIMD scalar operations in DImode."
> + (and (match_code "const_int")
> +      (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
> +      (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
> +
>  (define_constraint "vgb"
>    "@internal
>     A constraint that matches an immediate offset valid for SVE LD1B
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1298,8 +1298,8 @@  (define_insn_and_split "*movsi_aarch64"
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,  r,  r, w,r,w, w")
-	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w  ,r  ,r,w, m,m,  r,  r, w,r,w,w")
+	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
   "@
@@ -1309,6 +1309,7 @@  (define_insn_and_split "*movdi_aarch64"
    mov\\t%x0, %1
    mov\\t%w0, %1
    #
+   #
    * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
    ldr\\t%x0, %1
    ldr\\t%d0, %1
@@ -1321,17 +1322,27 @@  (define_insn_and_split "*movdi_aarch64"
    fmov\\t%d0, %d1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
    "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+    && REG_P (operands[0])
+    && (GP_REGNUM_P (REGNO (operands[0]))
+	|| (can_create_pseudo_p ()
+	    && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
    [(const_int 0)]
    "{
-       aarch64_expand_mov_immediate (operands[0], operands[1]);
+       if (GP_REGNUM_P (REGNO (operands[0])))
+	 aarch64_expand_mov_immediate (operands[0], operands[1]);
+       else
+	 {
+	   rtx mem = force_const_mem (DImode, operands[1]);
+	   gcc_assert (mem);
+	   emit_move_insn (operands[0], mem);
+	 }
        DONE;
     }"
   ;; The "mov_imm" type for CNTD is just a placeholder.
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
 		     load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
 		     neon_move")
-   (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
+   (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
 )
 
 (define_insn "insv_imm<mode>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -474,6 +474,14 @@  (define_address_constraint "Dp"
  An address valid for a prefetch instruction."
  (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
 
+(define_constraint "Dx"
+  "@internal
+ A constraint that matches an integer immediate operand not valid\
+ for AdvSIMD scalar operations in DImode."
+ (and (match_code "const_int")
+      (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
+      (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
+
 (define_constraint "vgb"
   "@internal
    A constraint that matches an immediate offset valid for SVE LD1B