diff mbox series

rs6000: Remove unspecs for vec_mrghl[bhw]

Message ID 20210524090213.2813103-1-luoxhu@linux.ibm.com
State New
Headers show
Series rs6000: Remove unspecs for vec_mrghl[bhw] | expand

Commit Message

Xionghu Luo May 24, 2021, 9:02 a.m. UTC
From: Xiong Hu Luo <luoxhu@linux.ibm.com>

vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20,
5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for vmrghlb.
Remove UNSPEC_VMRGH_DIRECT/UNSPEC_VMRGL_DIRECT pattern as vec_select
+ vec_concat as normal RTL.

Tested pass on P8LE, P9LE and P8BE{m32}, ok for trunk?

gcc/ChangeLog:

	* config/rs6000/altivec.md (*altivec_vmrghb_internal): Delete.
	(altivec_vmrghb_direct): New.
	(*altivec_vmrghh_internal): Delete.
	(altivec_vmrghh_direct): New.
	(*altivec_vmrghw_internal): Delete.
	(altivec_vmrghw_direct_<mode>): New.
	(altivec_vmrghw_direct): Delete.
	(*altivec_vmrglb_internal): Delete.
	(altivec_vmrglb_direct): New.
	(*altivec_vmrglh_internal): Delete.
	(altivec_vmrglh_direct): New.
	(*altivec_vmrglw_internal): Delete.
	(altivec_vmrglw_direct_<mode>): New.
	(altivec_vmrglw_direct): Delete.
	* config/rs6000/rs6000-p8swap.c (rtx_is_swappable_p): Adjust.
	* config/rs6000/rs6000.c (altivec_expand_vec_perm_const):
	Adjust.
	* config/rs6000/vsx.md (vsx_xxmrghw_<mode>): Adjust.
	(vsx_xxmrglw_<mode>):

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/builtins-1.c: Update instruction counts.
---
 gcc/config/rs6000/altivec.md                  | 231 ++++++------------
 gcc/config/rs6000/rs6000-p8swap.c             |   2 -
 gcc/config/rs6000/rs6000.c                    |  10 +-
 gcc/config/rs6000/vsx.md                      |  18 +-
 gcc/testsuite/gcc.target/powerpc/builtins-1.c |   8 +-
 5 files changed, 95 insertions(+), 174 deletions(-)

Comments

Xionghu Luo June 7, 2021, 5:09 a.m. UTC | #1
Ping, thanks.


On 2021/5/24 17:02, Xionghu Luo wrote:
> From: Xiong Hu Luo <luoxhu@linux.ibm.com>
> 
> vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20,
> 5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for vmrghlb.
> Remove UNSPEC_VMRGH_DIRECT/UNSPEC_VMRGL_DIRECT pattern as vec_select
> + vec_concat as normal RTL.
> 
> Tested pass on P8LE, P9LE and P8BE{m32}, ok for trunk?
> 
> gcc/ChangeLog:
> 
> 	* config/rs6000/altivec.md (*altivec_vmrghb_internal): Delete.
> 	(altivec_vmrghb_direct): New.
> 	(*altivec_vmrghh_internal): Delete.
> 	(altivec_vmrghh_direct): New.
> 	(*altivec_vmrghw_internal): Delete.
> 	(altivec_vmrghw_direct_<mode>): New.
> 	(altivec_vmrghw_direct): Delete.
> 	(*altivec_vmrglb_internal): Delete.
> 	(altivec_vmrglb_direct): New.
> 	(*altivec_vmrglh_internal): Delete.
> 	(altivec_vmrglh_direct): New.
> 	(*altivec_vmrglw_internal): Delete.
> 	(altivec_vmrglw_direct_<mode>): New.
> 	(altivec_vmrglw_direct): Delete.
> 	* config/rs6000/rs6000-p8swap.c (rtx_is_swappable_p): Adjust.
> 	* config/rs6000/rs6000.c (altivec_expand_vec_perm_const):
> 	Adjust.
> 	* config/rs6000/vsx.md (vsx_xxmrghw_<mode>): Adjust.
> 	(vsx_xxmrglw_<mode>):
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/powerpc/builtins-1.c: Update instruction counts.
> ---
>   gcc/config/rs6000/altivec.md                  | 231 ++++++------------
>   gcc/config/rs6000/rs6000-p8swap.c             |   2 -
>   gcc/config/rs6000/rs6000.c                    |  10 +-
>   gcc/config/rs6000/vsx.md                      |  18 +-
>   gcc/testsuite/gcc.target/powerpc/builtins-1.c |   8 +-
>   5 files changed, 95 insertions(+), 174 deletions(-)
> 
> diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
> index 208d6343225..cae05be2c2d 100644
> --- a/gcc/config/rs6000/altivec.md
> +++ b/gcc/config/rs6000/altivec.md
> @@ -143,8 +143,6 @@ (define_c_enum "unspec"
>      UNSPEC_VUPKHU_V4SF
>      UNSPEC_VUPKLU_V4SF
>      UNSPEC_VGBBD
> -   UNSPEC_VMRGH_DIRECT
> -   UNSPEC_VMRGL_DIRECT
>      UNSPEC_VSPLT_DIRECT
>      UNSPEC_VMRGEW_DIRECT
>      UNSPEC_VMRGOW_DIRECT
> @@ -1291,44 +1289,29 @@ (define_expand "altivec_vmrghb"
>      (use (match_operand:V16QI 2 "register_operand"))]
>     "TARGET_ALTIVEC"
>   {
> -  rtvec v = gen_rtvec (16, GEN_INT (0), GEN_INT (16), GEN_INT (1), GEN_INT (17),
> -		       GEN_INT (2), GEN_INT (18), GEN_INT (3), GEN_INT (19),
> -		       GEN_INT (4), GEN_INT (20), GEN_INT (5), GEN_INT (21),
> -		       GEN_INT (6), GEN_INT (22), GEN_INT (7), GEN_INT (23));
> -  rtx x = gen_rtx_VEC_CONCAT (V32QImode, operands[1], operands[2]);
> -  x = gen_rtx_VEC_SELECT (V16QImode, x, gen_rtx_PARALLEL (VOIDmode, v));
> -  emit_insn (gen_rtx_SET (operands[0], x));
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (
> +      gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2]));
> +  else
> +    emit_insn (
> +      gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1]));
>     DONE;
>   })
>   
> -(define_insn "*altivec_vmrghb_internal"
> +(define_insn "altivec_vmrghb_direct"
>     [(set (match_operand:V16QI 0 "register_operand" "=v")
> -        (vec_select:V16QI
> +    (vec_select:V16QI
>   	  (vec_concat:V32QI
>   	    (match_operand:V16QI 1 "register_operand" "v")
>   	    (match_operand:V16QI 2 "register_operand" "v"))
> -	  (parallel [(const_int 0) (const_int 16)
> -		     (const_int 1) (const_int 17)
> -		     (const_int 2) (const_int 18)
> -		     (const_int 3) (const_int 19)
> -		     (const_int 4) (const_int 20)
> -		     (const_int 5) (const_int 21)
> -		     (const_int 6) (const_int 22)
> -		     (const_int 7) (const_int 23)])))]
> -  "TARGET_ALTIVEC"
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    return "vmrghb %0,%1,%2";
> -  else
> -    return "vmrglb %0,%2,%1";
> -}
> -  [(set_attr "type" "vecperm")])
> -
> -(define_insn "altivec_vmrghb_direct"
> -  [(set (match_operand:V16QI 0 "register_operand" "=v")
> -	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
> -		       (match_operand:V16QI 2 "register_operand" "v")]
> -		      UNSPEC_VMRGH_DIRECT))]
> +	  (parallel [(const_int  0) (const_int 16)
> +		     (const_int  1) (const_int 17)
> +		     (const_int  2) (const_int 18)
> +		     (const_int  3) (const_int 19)
> +		     (const_int  4) (const_int 20)
> +		     (const_int  5) (const_int 21)
> +		     (const_int  6) (const_int 22)
> +		     (const_int  7) (const_int 23)])))]
>     "TARGET_ALTIVEC"
>     "vmrghb %0,%1,%2"
>     [(set_attr "type" "vecperm")])
> @@ -1339,16 +1322,16 @@ (define_expand "altivec_vmrghh"
>      (use (match_operand:V8HI 2 "register_operand"))]
>     "TARGET_ALTIVEC"
>   {
> -  rtvec v = gen_rtvec (8, GEN_INT (0), GEN_INT (8), GEN_INT (1), GEN_INT (9),
> -		       GEN_INT (2), GEN_INT (10), GEN_INT (3), GEN_INT (11));
> -  rtx x = gen_rtx_VEC_CONCAT (V16HImode, operands[1], operands[2]);
> -
> -  x = gen_rtx_VEC_SELECT (V8HImode, x, gen_rtx_PARALLEL (VOIDmode, v));
> -  emit_insn (gen_rtx_SET (operands[0], x));
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (
> +      gen_altivec_vmrghh_direct (operands[0], operands[1], operands[2]));
> +  else
> +    emit_insn (
> +      gen_altivec_vmrglh_direct (operands[0], operands[2], operands[1]));
>     DONE;
>   })
>   
> -(define_insn "*altivec_vmrghh_internal"
> +(define_insn "altivec_vmrghh_direct"
>     [(set (match_operand:V8HI 0 "register_operand" "=v")
>           (vec_select:V8HI
>   	  (vec_concat:V16HI
> @@ -1359,20 +1342,6 @@ (define_insn "*altivec_vmrghh_internal"
>   		     (const_int 2) (const_int 10)
>   		     (const_int 3) (const_int 11)])))]
>     "TARGET_ALTIVEC"
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    return "vmrghh %0,%1,%2";
> -  else
> -    return "vmrglh %0,%2,%1";
> -}
> -  [(set_attr "type" "vecperm")])
> -
> -(define_insn "altivec_vmrghh_direct"
> -  [(set (match_operand:V8HI 0 "register_operand" "=v")
> -        (unspec:V8HI [(match_operand:V8HI 1 "register_operand" "v")
> -                      (match_operand:V8HI 2 "register_operand" "v")]
> -                     UNSPEC_VMRGH_DIRECT))]
> -  "TARGET_ALTIVEC"
>     "vmrghh %0,%1,%2"
>     [(set_attr "type" "vecperm")])
>   
> @@ -1382,39 +1351,27 @@ (define_expand "altivec_vmrghw"
>      (use (match_operand:V4SI 2 "register_operand"))]
>     "VECTOR_MEM_ALTIVEC_P (V4SImode)"
>   {
> -  rtvec v = gen_rtvec (4, GEN_INT (0), GEN_INT (4), GEN_INT (1), GEN_INT (5));
> -  rtx x = gen_rtx_VEC_CONCAT (V8SImode, operands[1], operands[2]);
> -  x = gen_rtx_VEC_SELECT (V4SImode, x, gen_rtx_PARALLEL (VOIDmode, v));
> -  emit_insn (gen_rtx_SET (operands[0], x));
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (
> +      gen_altivec_vmrghw_direct_v4si (operands[0], operands[1], operands[2]));
> +  else
> +    emit_insn (
> +      gen_altivec_vmrglw_direct_v4si (operands[0], operands[2], operands[1]));
>     DONE;
>   })
>   
> -(define_insn "*altivec_vmrghw_internal"
> -  [(set (match_operand:V4SI 0 "register_operand" "=v")
> -        (vec_select:V4SI
> -	  (vec_concat:V8SI
> -	    (match_operand:V4SI 1 "register_operand" "v")
> -	    (match_operand:V4SI 2 "register_operand" "v"))
> +(define_insn "altivec_vmrghw_direct_<mode>"
> +  [(set (match_operand:VSX_W 0 "register_operand" "=wa,v")
> +        (vec_select:VSX_W
> +	  (vec_concat:<VS_double>
> +	    (match_operand:VSX_W 1 "register_operand" "wa,v")
> +	    (match_operand:VSX_W 2 "register_operand" "wa,v"))
>   	  (parallel [(const_int 0) (const_int 4)
>   		     (const_int 1) (const_int 5)])))]
> -  "VECTOR_MEM_ALTIVEC_P (V4SImode)"
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    return "vmrghw %0,%1,%2";
> -  else
> -    return "vmrglw %0,%2,%1";
> -}
> -  [(set_attr "type" "vecperm")])
> -
> -(define_insn "altivec_vmrghw_direct"
> -  [(set (match_operand:V4SI 0 "register_operand" "=wa,v")
> -	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "wa,v")
> -		      (match_operand:V4SI 2 "register_operand" "wa,v")]
> -		     UNSPEC_VMRGH_DIRECT))]
>     "TARGET_ALTIVEC"
>     "@
> -   xxmrghw %x0,%x1,%x2
> -   vmrghw %0,%1,%2"
> +  xxmrghw %x0,%x1,%x2
> +  vmrghw %0,%1,%2"
>     [(set_attr "type" "vecperm")])
>   
>   (define_insn "*altivec_vmrghsf"
> @@ -1440,19 +1397,18 @@ (define_expand "altivec_vmrglb"
>      (use (match_operand:V16QI 2 "register_operand"))]
>     "TARGET_ALTIVEC"
>   {
> -  rtvec v = gen_rtvec (16, GEN_INT (8), GEN_INT (24), GEN_INT (9), GEN_INT (25),
> -		       GEN_INT (10), GEN_INT (26), GEN_INT (11), GEN_INT (27),
> -		       GEN_INT (12), GEN_INT (28), GEN_INT (13), GEN_INT (29),
> -		       GEN_INT (14), GEN_INT (30), GEN_INT (15), GEN_INT (31));
> -  rtx x = gen_rtx_VEC_CONCAT (V32QImode, operands[1], operands[2]);
> -  x = gen_rtx_VEC_SELECT (V16QImode, x, gen_rtx_PARALLEL (VOIDmode, v));
> -  emit_insn (gen_rtx_SET (operands[0], x));
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (
> +      gen_altivec_vmrglb_direct (operands[0], operands[1], operands[2]));
> +  else
> +    emit_insn (
> +      gen_altivec_vmrghb_direct (operands[0], operands[2], operands[1]));
>     DONE;
>   })
>   
> -(define_insn "*altivec_vmrglb_internal"
> +(define_insn "altivec_vmrglb_direct"
>     [(set (match_operand:V16QI 0 "register_operand" "=v")
> -        (vec_select:V16QI
> +    (vec_select:V16QI
>   	  (vec_concat:V32QI
>   	    (match_operand:V16QI 1 "register_operand" "v")
>   	    (match_operand:V16QI 2 "register_operand" "v"))
> @@ -1465,20 +1421,6 @@ (define_insn "*altivec_vmrglb_internal"
>   		     (const_int 14) (const_int 30)
>   		     (const_int 15) (const_int 31)])))]
>     "TARGET_ALTIVEC"
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    return "vmrglb %0,%1,%2";
> -  else
> -    return "vmrghb %0,%2,%1";
> -}
> -  [(set_attr "type" "vecperm")])
> -
> -(define_insn "altivec_vmrglb_direct"
> -  [(set (match_operand:V16QI 0 "register_operand" "=v")
> -	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
> -		       (match_operand:V16QI 2 "register_operand" "v")]
> -		      UNSPEC_VMRGL_DIRECT))]
> -  "TARGET_ALTIVEC"
>     "vmrglb %0,%1,%2"
>     [(set_attr "type" "vecperm")])
>   
> @@ -1488,15 +1430,16 @@ (define_expand "altivec_vmrglh"
>      (use (match_operand:V8HI 2 "register_operand"))]
>     "TARGET_ALTIVEC"
>   {
> -  rtvec v = gen_rtvec (8, GEN_INT (4), GEN_INT (12), GEN_INT (5), GEN_INT (13),
> -		       GEN_INT (6), GEN_INT (14), GEN_INT (7), GEN_INT (15));
> -  rtx x = gen_rtx_VEC_CONCAT (V16HImode, operands[1], operands[2]);
> -  x = gen_rtx_VEC_SELECT (V8HImode, x, gen_rtx_PARALLEL (VOIDmode, v));
> -  emit_insn (gen_rtx_SET (operands[0], x));
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (
> +      gen_altivec_vmrglh_direct (operands[0], operands[1], operands[2]));
> +  else
> +    emit_insn (
> +      gen_altivec_vmrghh_direct (operands[0], operands[2], operands[1]));
>     DONE;
>   })
>   
> -(define_insn "*altivec_vmrglh_internal"
> +(define_insn "altivec_vmrglh_direct"
>     [(set (match_operand:V8HI 0 "register_operand" "=v")
>           (vec_select:V8HI
>   	  (vec_concat:V16HI
> @@ -1507,20 +1450,6 @@ (define_insn "*altivec_vmrglh_internal"
>   		     (const_int 6) (const_int 14)
>   		     (const_int 7) (const_int 15)])))]
>     "TARGET_ALTIVEC"
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    return "vmrglh %0,%1,%2";
> -  else
> -    return "vmrghh %0,%2,%1";
> -}
> -  [(set_attr "type" "vecperm")])
> -
> -(define_insn "altivec_vmrglh_direct"
> -  [(set (match_operand:V8HI 0 "register_operand" "=v")
> -        (unspec:V8HI [(match_operand:V8HI 1 "register_operand" "v")
> -		      (match_operand:V8HI 2 "register_operand" "v")]
> -		     UNSPEC_VMRGL_DIRECT))]
> -  "TARGET_ALTIVEC"
>     "vmrglh %0,%1,%2"
>     [(set_attr "type" "vecperm")])
>   
> @@ -1530,39 +1459,27 @@ (define_expand "altivec_vmrglw"
>      (use (match_operand:V4SI 2 "register_operand"))]
>     "VECTOR_MEM_ALTIVEC_P (V4SImode)"
>   {
> -  rtvec v = gen_rtvec (4, GEN_INT (2), GEN_INT (6), GEN_INT (3), GEN_INT (7));
> -  rtx x = gen_rtx_VEC_CONCAT (V8SImode, operands[1], operands[2]);
> -  x = gen_rtx_VEC_SELECT (V4SImode, x, gen_rtx_PARALLEL (VOIDmode, v));
> -  emit_insn (gen_rtx_SET (operands[0], x));
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (
> +      gen_altivec_vmrglw_direct_v4si (operands[0], operands[1], operands[2]));
> +  else
> +    emit_insn (
> +      gen_altivec_vmrghw_direct_v4si (operands[0], operands[2], operands[1]));
>     DONE;
>   })
>   
> -(define_insn "*altivec_vmrglw_internal"
> -  [(set (match_operand:V4SI 0 "register_operand" "=v")
> -        (vec_select:V4SI
> -	  (vec_concat:V8SI
> -	    (match_operand:V4SI 1 "register_operand" "v")
> -	    (match_operand:V4SI 2 "register_operand" "v"))
> +(define_insn "altivec_vmrglw_direct_<mode>"
> +  [(set (match_operand:VSX_W 0 "register_operand" "=wa,v")
> +        (vec_select:VSX_W
> +	  (vec_concat:<VS_double>
> +	    (match_operand:VSX_W 1 "register_operand" "wa,v")
> +	    (match_operand:VSX_W 2 "register_operand" "wa,v"))
>   	  (parallel [(const_int 2) (const_int 6)
>   		     (const_int 3) (const_int 7)])))]
> -  "VECTOR_MEM_ALTIVEC_P (V4SImode)"
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    return "vmrglw %0,%1,%2";
> -  else
> -    return "vmrghw %0,%2,%1";
> -}
> -  [(set_attr "type" "vecperm")])
> -
> -(define_insn "altivec_vmrglw_direct"
> -  [(set (match_operand:V4SI 0 "register_operand" "=wa,v")
> -	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "wa,v")
> -		      (match_operand:V4SI 2 "register_operand" "wa,v")]
> -		     UNSPEC_VMRGL_DIRECT))]
>     "TARGET_ALTIVEC"
>     "@
> -   xxmrglw %x0,%x1,%x2
> -   vmrglw %0,%1,%2"
> +  xxmrglw %x0,%x1,%x2
> +  vmrglw %0,%1,%2"
>     [(set_attr "type" "vecperm")])
>   
>   (define_insn "*altivec_vmrglsf"
> @@ -3929,13 +3846,13 @@ (define_expand "vec_widen_umult_hi_v8hi"
>       {
>         emit_insn (gen_altivec_vmuleuh (ve, operands[1], operands[2]));
>         emit_insn (gen_altivec_vmulouh (vo, operands[1], operands[2]));
> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));
>       }
>     else
>       {
>         emit_insn (gen_altivec_vmulouh (ve, operands[1], operands[2]));
>         emit_insn (gen_altivec_vmuleuh (vo, operands[1], operands[2]));
> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], vo, ve));
> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], vo, ve));
>       }
>     DONE;
>   })
> @@ -3954,13 +3871,13 @@ (define_expand "vec_widen_umult_lo_v8hi"
>       {
>         emit_insn (gen_altivec_vmuleuh (ve, operands[1], operands[2]));
>         emit_insn (gen_altivec_vmulouh (vo, operands[1], operands[2]));
> -      emit_insn (gen_altivec_vmrglw_direct (operands[0], ve, vo));
> +      emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], ve, vo));
>       }
>     else
>       {
>         emit_insn (gen_altivec_vmulouh (ve, operands[1], operands[2]));
>         emit_insn (gen_altivec_vmuleuh (vo, operands[1], operands[2]));
> -      emit_insn (gen_altivec_vmrglw_direct (operands[0], vo, ve));
> +      emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], vo, ve));
>       }
>     DONE;
>   })
> @@ -3979,13 +3896,13 @@ (define_expand "vec_widen_smult_hi_v8hi"
>       {
>         emit_insn (gen_altivec_vmulesh (ve, operands[1], operands[2]));
>         emit_insn (gen_altivec_vmulosh (vo, operands[1], operands[2]));
> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));
>       }
>     else
>       {
>         emit_insn (gen_altivec_vmulosh (ve, operands[1], operands[2]));
>         emit_insn (gen_altivec_vmulesh (vo, operands[1], operands[2]));
> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], vo, ve));
> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], vo, ve));
>       }
>     DONE;
>   })
> @@ -4004,13 +3921,13 @@ (define_expand "vec_widen_smult_lo_v8hi"
>       {
>         emit_insn (gen_altivec_vmulesh (ve, operands[1], operands[2]));
>         emit_insn (gen_altivec_vmulosh (vo, operands[1], operands[2]));
> -      emit_insn (gen_altivec_vmrglw_direct (operands[0], ve, vo));
> +      emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], ve, vo));
>       }
>     else
>       {
>         emit_insn (gen_altivec_vmulosh (ve, operands[1], operands[2]));
>         emit_insn (gen_altivec_vmulesh (vo, operands[1], operands[2]));
> -      emit_insn (gen_altivec_vmrglw_direct (operands[0], vo, ve));
> +      emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], vo, ve));
>       }
>     DONE;
>   })
> diff --git a/gcc/config/rs6000/rs6000-p8swap.c b/gcc/config/rs6000/rs6000-p8swap.c
> index ad2b3023819..ec503ab742f 100644
> --- a/gcc/config/rs6000/rs6000-p8swap.c
> +++ b/gcc/config/rs6000/rs6000-p8swap.c
> @@ -744,8 +744,6 @@ rtx_is_swappable_p (rtx op, unsigned int *special)
>   	  default:
>   	    break;
>   	  case UNSPEC_VBPERMQ:
> -	  case UNSPEC_VMRGH_DIRECT:
> -	  case UNSPEC_VMRGL_DIRECT:
>   	  case UNSPEC_VPACK_SIGN_SIGN_SAT:
>   	  case UNSPEC_VPACK_SIGN_UNS_SAT:
>   	  case UNSPEC_VPACK_UNS_UNS_MOD:
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 5a5202c455b..ad11b67b125 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -23013,7 +23013,7 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>         {  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
>       { OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vpkuwum_direct,
>         {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
> -    { OPTION_MASK_ALTIVEC,
> +    { OPTION_MASK_ALTIVEC,
>         (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghb_direct
>          : CODE_FOR_altivec_vmrglb_direct),
>         {  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23 } },
> @@ -23022,8 +23022,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>          : CODE_FOR_altivec_vmrglh_direct),
>         {  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23 } },
>       { OPTION_MASK_ALTIVEC,
> -      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct
> -       : CODE_FOR_altivec_vmrglw_direct),
> +      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si
> +       : CODE_FOR_altivec_vmrglw_direct_v4si),
>         {  0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23 } },
>       { OPTION_MASK_ALTIVEC,
>         (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglb_direct
> @@ -23034,8 +23034,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>          : CODE_FOR_altivec_vmrghh_direct),
>         {  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
>       { OPTION_MASK_ALTIVEC,
> -      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglw_direct
> -       : CODE_FOR_altivec_vmrghw_direct),
> +      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglw_direct_v4si
> +       : CODE_FOR_altivec_vmrghw_direct_v4si),
>         {  8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
>       { OPTION_MASK_P8_VECTOR,
>         (BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgew_v4sf_direct
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 55d830d0f20..2214544c047 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -4568,7 +4568,7 @@ (define_insn "vsx_xxspltd_<mode>"
>     [(set_attr "type" "vecperm")])
>   
>   ;; V4SF/V4SI interleave
> -(define_insn "vsx_xxmrghw_<mode>"
> +(define_expand "vsx_xxmrghw_<mode>"
>     [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
>           (vec_select:VSX_W
>   	  (vec_concat:<VS_double>
> @@ -4579,13 +4579,16 @@ (define_insn "vsx_xxmrghw_<mode>"
>     "VECTOR_MEM_VSX_P (<MODE>mode)"
>   {
>     if (BYTES_BIG_ENDIAN)
> -    return "xxmrghw %x0,%x1,%x2";
> +    emit_insn (
> +      gen_altivec_vmrghw_direct_<mode> (operands[0], operands[1], operands[2]));
>     else
> -    return "xxmrglw %x0,%x2,%x1";
> +    emit_insn (
> +      gen_altivec_vmrglw_direct_<mode> (operands[0], operands[2], operands[1]));
> +  DONE;
>   }
>     [(set_attr "type" "vecperm")])
>   
> -(define_insn "vsx_xxmrglw_<mode>"
> +(define_expand "vsx_xxmrglw_<mode>"
>     [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
>   	(vec_select:VSX_W
>   	  (vec_concat:<VS_double>
> @@ -4596,9 +4599,12 @@ (define_insn "vsx_xxmrglw_<mode>"
>     "VECTOR_MEM_VSX_P (<MODE>mode)"
>   {
>     if (BYTES_BIG_ENDIAN)
> -    return "xxmrglw %x0,%x1,%x2";
> +    emit_insn (
> +      gen_altivec_vmrglw_direct_<mode> (operands[0], operands[1], operands[2]));
>     else
> -    return "xxmrghw %x0,%x2,%x1";
> +    emit_insn (
> +      gen_altivec_vmrghw_direct_<mode> (operands[0], operands[2], operands[1]));
> +  DONE;
>   }
>     [(set_attr "type" "vecperm")])
>   
> diff --git a/gcc/testsuite/gcc.target/powerpc/builtins-1.c b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> index 3ec1024a955..63fbd2e3be1 100644
> --- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> @@ -317,10 +317,10 @@ int main ()
>   /* { dg-final { scan-assembler-times "vctuxs" 2 } } */
>   
>   /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */
> -/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */
> +/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */
>   /* { dg-final { scan-assembler-times "vmrghh" 8 } } */
> -/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */
> -/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */
> +/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */
> +/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */
>   /* { dg-final { scan-assembler-times "vmrglh" 8 } } */
>   /* { dg-final { scan-assembler-times "xxlnor" 6 } } */
>   /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */
> @@ -347,7 +347,7 @@ int main ()
>   /* { dg-final { scan-assembler-times "vspltb" 6 } } */
>   /* { dg-final { scan-assembler-times "vspltw" 0 } } */
>   /* { dg-final { scan-assembler-times "vmrgow" 8 } } */
> -/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */
> +/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */
>   /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */
>   /* { dg-final { scan-assembler-times "vmrgew" 8 } } */
>   /* { dg-final { scan-assembler-times "vsplth" 8 } } */
>
Segher Boessenkool June 8, 2021, 11:25 p.m. UTC | #2
On Mon, May 24, 2021 at 04:02:13AM -0500, Xionghu Luo wrote:
> vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20,
> 5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for vmrghlb.

(vmrglb)

> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (
> +      gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2]));
> +  else
> +    emit_insn (
> +      gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1]));

Please don't indent like that, it doesn't match what we do elsewhere.
For better or for worse (for worse imo), we use deep hanging indents.
If you have to, you can do something like

  rtx insn;
  if (BYTES_BIG_ENDIAN)
    insn = gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2]);
  else
    insn = gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1]);
  emit_insn (insn);

(this is better even, in that it has only one emit_insn), or even

  rtx (*fun) () = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct
				   : gen_altivec_vmrglb_direct;
  if (!BYTES_BIG_ENDIAN)
    std::swap (operands[1], operands[2]);
  emit_insn (fun (operands[0], operands[1], operands[2]));

Well, C++ does not allow that last example like that, sigh, so
  rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct
						: gen_altivec_vmrglb_direct;

This is shorter than the other two options ;-)

> +(define_insn "altivec_vmrghb_direct"
>    [(set (match_operand:V16QI 0 "register_operand" "=v")
> +    (vec_select:V16QI

This should be indented one space more.

>    "TARGET_ALTIVEC"
>    "@
> -   xxmrghw %x0,%x1,%x2
> -   vmrghw %0,%1,%2"
> +  xxmrghw %x0,%x1,%x2
> +  vmrghw %0,%1,%2"

The original indent was correct, please restore.

> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));

When you see a mode as part of a pattern name, chances are that it will
be a good candidate for using parameterized names with.  (But don't do
that now, just keep it in mind as a nice cleanup to do).

> @@ -23022,8 +23022,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>         : CODE_FOR_altivec_vmrglh_direct),
>        {  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23 } },
>      { OPTION_MASK_ALTIVEC,
> -      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct
> -       : CODE_FOR_altivec_vmrglw_direct),
> +      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si
> +       : CODE_FOR_altivec_vmrglw_direct_v4si),

The correct way is to align the ? and the : (or put everything on one
line of course, if that fits)

The parens around this are not needed btw, and are a distraction.

> --- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> @@ -317,10 +317,10 @@ int main ()
>  /* { dg-final { scan-assembler-times "vctuxs" 2 } } */
>  
>  /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */
> -/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */
> +/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */
>  /* { dg-final { scan-assembler-times "vmrghh" 8 } } */
> -/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */
> -/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */
> +/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */
> +/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */
>  /* { dg-final { scan-assembler-times "vmrglh" 8 } } */
>  /* { dg-final { scan-assembler-times "xxlnor" 6 } } */
>  /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */
> @@ -347,7 +347,7 @@ int main ()
>  /* { dg-final { scan-assembler-times "vspltb" 6 } } */
>  /* { dg-final { scan-assembler-times "vspltw" 0 } } */
>  /* { dg-final { scan-assembler-times "vmrgow" 8 } } */
> -/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */
> +/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */
>  /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */
>  /* { dg-final { scan-assembler-times "vmrgew" 8 } } */
>  /* { dg-final { scan-assembler-times "vsplth" 8 } } */

Are those changes correct?  It looks like a vmrglb became a vmrghb, and
that 4 each of xxmrghw and xxmrglw disappeared?  Both seem wrong?


Segher
Xionghu Luo June 9, 2021, 8:03 a.m. UTC | #3
Hi,

On 2021/6/9 07:25, Segher Boessenkool wrote:
> On Mon, May 24, 2021 at 04:02:13AM -0500, Xionghu Luo wrote:
>> vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20,
>> 5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for vmrghlb.
> 
> (vmrglb)
> 
>> +  if (BYTES_BIG_ENDIAN)
>> +    emit_insn (
>> +      gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2]));
>> +  else
>> +    emit_insn (
>> +      gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1]));
> 
> Please don't indent like that, it doesn't match what we do elsewhere.
> For better or for worse (for worse imo), we use deep hanging indents.
> If you have to, you can do something like
> 
>    rtx insn;
>    if (BYTES_BIG_ENDIAN)
>      insn = gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2]);
>    else
>      insn = gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1]);
>    emit_insn (insn);
> 
> (this is better even, in that it has only one emit_insn), or even
> 
>    rtx (*fun) () = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct
> 				   : gen_altivec_vmrglb_direct;
>    if (!BYTES_BIG_ENDIAN)
>      std::swap (operands[1], operands[2]);
>    emit_insn (fun (operands[0], operands[1], operands[2]));
> 
> Well, C++ does not allow that last example like that, sigh, so
>    rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct
> 						: gen_altivec_vmrglb_direct;
> 
> This is shorter than the other two options ;-)

Changed.

> 
>> +(define_insn "altivec_vmrghb_direct"
>>     [(set (match_operand:V16QI 0 "register_operand" "=v")
>> +    (vec_select:V16QI
> 
> This should be indented one space more.
> 
>>     "TARGET_ALTIVEC"
>>     "@
>> -   xxmrghw %x0,%x1,%x2
>> -   vmrghw %0,%1,%2"
>> +  xxmrghw %x0,%x1,%x2
>> +  vmrghw %0,%1,%2"
> 
> The original indent was correct, please restore.
> 
>> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
>> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));
> 
> When you see a mode as part of a pattern name, chances are that it will
> be a good candidate for using parameterized names with.  (But don't do
> that now, just keep it in mind as a nice cleanup to do).

OK.

> 
>> @@ -23022,8 +23022,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>>          : CODE_FOR_altivec_vmrglh_direct),
>>         {  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23 } },
>>       { OPTION_MASK_ALTIVEC,
>> -      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct
>> -       : CODE_FOR_altivec_vmrglw_direct),
>> +      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si
>> +       : CODE_FOR_altivec_vmrglw_direct_v4si),
> 
> The correct way is to align the ? and the : (or put everything on one
> line of course, if that fits)
> 
> The parens around this are not needed btw, and are a distraction.

Changed.

> 
>> --- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
>> @@ -317,10 +317,10 @@ int main ()
>>   /* { dg-final { scan-assembler-times "vctuxs" 2 } } */
>>   
>>   /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */
>> -/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */
>> +/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */
>>   /* { dg-final { scan-assembler-times "vmrghh" 8 } } */
>> -/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */
>> -/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */
>> +/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */
>> +/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */
>>   /* { dg-final { scan-assembler-times "vmrglh" 8 } } */
>>   /* { dg-final { scan-assembler-times "xxlnor" 6 } } */
>>   /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */
>> @@ -347,7 +347,7 @@ int main ()
>>   /* { dg-final { scan-assembler-times "vspltb" 6 } } */
>>   /* { dg-final { scan-assembler-times "vspltw" 0 } } */
>>   /* { dg-final { scan-assembler-times "vmrgow" 8 } } */
>> -/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */
>> +/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */
>>   /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */
>>   /* { dg-final { scan-assembler-times "vmrgew" 8 } } */
>>   /* { dg-final { scan-assembler-times "vsplth" 8 } } */
> 
> Are those changes correct?  It looks like a vmrglb became a vmrghb, and
> that 4 each of xxmrghw and xxmrglw disappeared?  Both seem wrong?


This case is built with "-mdejagnu-cpu=power8 -O0 -mno-fold-gimple -dp"
and it also counted the generated instruction patterns.

1) "vsx_xxmrghw_v4si" is replaced by "altivec_vmrglw_direct_v4si/0", so 
it decreases from 8 to 4. (Likewise for vsx_xxmrglw_v4si.)

         li 9,48          # 1282 [c=4 l=4]  *movdi_internal64/3
-       lxvd2x 0,31,9    # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
-       xxpermdi 0,0,0,2         # 32   [c=4 l=4]  xxswapd_v4si
-       xxmrglw 0,0,12   # 33   [c=4 l=4]  vsx_xxmrghw_v4si
+       lxvd2x 12,31,9   # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
+       xxpermdi 12,12,12,2      # 32   [c=4 l=4]  xxswapd_v4si
+       xxmrglw 0,12,0   # 33   [c=4 l=4]  altivec_vmrglw_direct_v4si/0
         xxpermdi 0,0,0,2         # 35   [c=4 l=4]  xxswapd_v4sf

Note that v0 and v12 is swapped in lxvd2x, these new 3 instructions
produces same result than before.

2) "*altivec_vmrglb_internal" is replaced by "altivec_vmrghb_direct" 
with this patch, then vmrglb count decreases from 5 to 4 and vmrghb
increases from 5 to 6. (BYTES_BIG_ENDIAN is checked early in RTL 
generation instead of final to remove the UNSPECs for potential 
optimization through backend.)

         li 9,928                 # 1424 [c=4 l=4]  *movdi_internal64/3
         lxvd2x 0,31,9    # 416  [c=8 l=4]  *vsx_lxvd2x16_le_V16QI
-       xxpermdi 33,0,0,2        # 417  [c=4 l=4]  xxswapd_v16qi
+       xxpermdi 32,0,0,2        # 417  [c=4 l=4]  xxswapd_v16qi
         li 9,944                 # 1425 [c=4 l=4]  *movdi_internal64/3
         lxvd2x 0,31,9    # 418  [c=8 l=4]  *vsx_lxvd2x16_le_V16QI
-       xxpermdi 32,0,0,2        # 419  [c=4 l=4]  xxswapd_v16qi
-       vmrghb 0,0,1     # 420  [c=4 l=4]  *altivec_vmrglb_internal
+       xxpermdi 33,0,0,2        # 419  [c=4 l=4]  xxswapd_v16qi
+       vmrghb 0,1,0     # 420  [c=4 l=4]  altivec_vmrghb_direct
         xxpermdi 0,32,32,2       # 421  [c=4 l=4]  xxswapd_v16qi

Seems not necessary to also use \m and \M here to count only ASM here?
Update the patch as attached.
Segher Boessenkool June 9, 2021, 11:57 a.m. UTC | #4
Hi!

On Wed, Jun 09, 2021 at 04:03:43PM +0800, Xionghu Luo wrote:
> >>--- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> >>+++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> >>@@ -317,10 +317,10 @@ int main ()
> >>  /* { dg-final { scan-assembler-times "vctuxs" 2 } } */
> >>  
> >>  /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */
> >>-/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */
> >>+/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */
> >>  /* { dg-final { scan-assembler-times "vmrghh" 8 } } */
> >>-/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */
> >>-/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */
> >>+/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */
> >>+/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */
> >>  /* { dg-final { scan-assembler-times "vmrglh" 8 } } */
> >>  /* { dg-final { scan-assembler-times "xxlnor" 6 } } */
> >>  /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */
> >>@@ -347,7 +347,7 @@ int main ()
> >>  /* { dg-final { scan-assembler-times "vspltb" 6 } } */
> >>  /* { dg-final { scan-assembler-times "vspltw" 0 } } */
> >>  /* { dg-final { scan-assembler-times "vmrgow" 8 } } */
> >>-/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */
> >>+/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */
> >>  /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */
> >>  /* { dg-final { scan-assembler-times "vmrgew" 8 } } */
> >>  /* { dg-final { scan-assembler-times "vsplth" 8 } } */
> >
> >Are those changes correct?  It looks like a vmrglb became a vmrghb, and
> >that 4 each of xxmrghw and xxmrglw disappeared?  Both seem wrong?
> 
> 
> This case is built with "-mdejagnu-cpu=power8 -O0 -mno-fold-gimple -dp"
> and it also counted the generated instruction patterns.
> 
> 1) "vsx_xxmrghw_v4si" is replaced by "altivec_vmrglw_direct_v4si/0", so 
> it decreases from 8 to 4. (Likewise for vsx_xxmrglw_v4si.)
> 
>         li 9,48          # 1282 [c=4 l=4]  *movdi_internal64/3
> -       lxvd2x 0,31,9    # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
> -       xxpermdi 0,0,0,2         # 32   [c=4 l=4]  xxswapd_v4si
> -       xxmrglw 0,0,12   # 33   [c=4 l=4]  vsx_xxmrghw_v4si
> +       lxvd2x 12,31,9   # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
> +       xxpermdi 12,12,12,2      # 32   [c=4 l=4]  xxswapd_v4si
> +       xxmrglw 0,12,0   # 33   [c=4 l=4]  altivec_vmrglw_direct_v4si/0
>         xxpermdi 0,0,0,2         # 35   [c=4 l=4]  xxswapd_v4sf
> 
> Note that v0 and v12 is swapped in lxvd2x, these new 3 instructions
> produces same result than before.

And there was one xxmrglw in this snippet before, and now there still
is only one.

But, the testcase uses -dp, I see.  Please use \m and \M in the scans,
it helps :-)  (And convert more than just the few that hit errors ;-) )

(You may want to do that as a separate patch before this one, to make
counting easier (also for me ;-) ),

(I'll review the new patch later today).


Segher
Xionghu Luo June 30, 2021, 1:47 a.m. UTC | #5
Gentle ping, thanks.

https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html


On 2021/6/9 16:03, Xionghu Luo via Gcc-patches wrote:
> Hi,
> 
> On 2021/6/9 07:25, Segher Boessenkool wrote:
>> On Mon, May 24, 2021 at 04:02:13AM -0500, Xionghu Luo wrote:
>>> vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20,
>>> 5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for 
>>> vmrghlb.
>>
>> (vmrglb)
>>
>>> +  if (BYTES_BIG_ENDIAN)
>>> +    emit_insn (
>>> +      gen_altivec_vmrghb_direct (operands[0], operands[1], 
>>> operands[2]));
>>> +  else
>>> +    emit_insn (
>>> +      gen_altivec_vmrglb_direct (operands[0], operands[2], 
>>> operands[1]));
>>
>> Please don't indent like that, it doesn't match what we do elsewhere.
>> For better or for worse (for worse imo), we use deep hanging indents.
>> If you have to, you can do something like
>>
>>    rtx insn;
>>    if (BYTES_BIG_ENDIAN)
>>      insn = gen_altivec_vmrghb_direct (operands[0], operands[1], 
>> operands[2]);
>>    else
>>      insn = gen_altivec_vmrglb_direct (operands[0], operands[2], 
>> operands[1]);
>>    emit_insn (insn);
>>
>> (this is better even, in that it has only one emit_insn), or even
>>
>>    rtx (*fun) () = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct
>>                    : gen_altivec_vmrglb_direct;
>>    if (!BYTES_BIG_ENDIAN)
>>      std::swap (operands[1], operands[2]);
>>    emit_insn (fun (operands[0], operands[1], operands[2]));
>>
>> Well, C++ does not allow that last example like that, sigh, so
>>    rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
>> gen_altivec_vmrghb_direct
>>                         : gen_altivec_vmrglb_direct;
>>
>> This is shorter than the other two options ;-)
> 
> Changed.
> 
>>
>>> +(define_insn "altivec_vmrghb_direct"
>>>     [(set (match_operand:V16QI 0 "register_operand" "=v")
>>> +    (vec_select:V16QI
>>
>> This should be indented one space more.
>>
>>>     "TARGET_ALTIVEC"
>>>     "@
>>> -   xxmrghw %x0,%x1,%x2
>>> -   vmrghw %0,%1,%2"
>>> +  xxmrghw %x0,%x1,%x2
>>> +  vmrghw %0,%1,%2"
>>
>> The original indent was correct, please restore.
>>
>>> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
>>> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));
>>
>> When you see a mode as part of a pattern name, chances are that it will
>> be a good candidate for using parameterized names with.  (But don't do
>> that now, just keep it in mind as a nice cleanup to do).
> 
> OK.
> 
>>
>>> @@ -23022,8 +23022,8 @@ altivec_expand_vec_perm_const (rtx target, 
>>> rtx op0, rtx op1,
>>>          : CODE_FOR_altivec_vmrglh_direct),
>>>         {  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 
>>> 23 } },
>>>       { OPTION_MASK_ALTIVEC,
>>> -      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct
>>> -       : CODE_FOR_altivec_vmrglw_direct),
>>> +      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si
>>> +       : CODE_FOR_altivec_vmrglw_direct_v4si),
>>
>> The correct way is to align the ? and the : (or put everything on one
>> line of course, if that fits)
>>
>> The parens around this are not needed btw, and are a distraction.
> 
> Changed.
> 
>>
>>> --- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
>>> +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
>>> @@ -317,10 +317,10 @@ int main ()
>>>   /* { dg-final { scan-assembler-times "vctuxs" 2 } } */
>>>   /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */
>>> -/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */
>>> +/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */
>>>   /* { dg-final { scan-assembler-times "vmrghh" 8 } } */
>>> -/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */
>>> -/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */
>>> +/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */
>>> +/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */
>>>   /* { dg-final { scan-assembler-times "vmrglh" 8 } } */
>>>   /* { dg-final { scan-assembler-times "xxlnor" 6 } } */
>>>   /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */
>>> @@ -347,7 +347,7 @@ int main ()
>>>   /* { dg-final { scan-assembler-times "vspltb" 6 } } */
>>>   /* { dg-final { scan-assembler-times "vspltw" 0 } } */
>>>   /* { dg-final { scan-assembler-times "vmrgow" 8 } } */
>>> -/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */
>>> +/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */
>>>   /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */
>>>   /* { dg-final { scan-assembler-times "vmrgew" 8 } } */
>>>   /* { dg-final { scan-assembler-times "vsplth" 8 } } */
>>
>> Are those changes correct?  It looks like a vmrglb became a vmrghb, and
>> that 4 each of xxmrghw and xxmrglw disappeared?  Both seem wrong?
> 
> 
> This case is built with "-mdejagnu-cpu=power8 -O0 -mno-fold-gimple -dp"
> and it also counted the generated instruction patterns.
> 
> 1) "vsx_xxmrghw_v4si" is replaced by "altivec_vmrglw_direct_v4si/0", so 
> it decreases from 8 to 4. (Likewise for vsx_xxmrglw_v4si.)
> 
>          li 9,48          # 1282 [c=4 l=4]  *movdi_internal64/3
> -       lxvd2x 0,31,9    # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
> -       xxpermdi 0,0,0,2         # 32   [c=4 l=4]  xxswapd_v4si
> -       xxmrglw 0,0,12   # 33   [c=4 l=4]  vsx_xxmrghw_v4si
> +       lxvd2x 12,31,9   # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
> +       xxpermdi 12,12,12,2      # 32   [c=4 l=4]  xxswapd_v4si
> +       xxmrglw 0,12,0   # 33   [c=4 l=4]  altivec_vmrglw_direct_v4si/0
>          xxpermdi 0,0,0,2         # 35   [c=4 l=4]  xxswapd_v4sf
> 
> Note that v0 and v12 is swapped in lxvd2x, these new 3 instructions
> produces same result than before.
> 
> 2) "*altivec_vmrglb_internal" is replaced by "altivec_vmrghb_direct" 
> with this patch, then vmrglb count decreases from 5 to 4 and vmrghb
> increases from 5 to 6. (BYTES_BIG_ENDIAN is checked early in RTL 
> generation instead of final to remove the UNSPECs for potential 
> optimization through backend.)
> 
>          li 9,928                 # 1424 [c=4 l=4]  *movdi_internal64/3
>          lxvd2x 0,31,9    # 416  [c=8 l=4]  *vsx_lxvd2x16_le_V16QI
> -       xxpermdi 33,0,0,2        # 417  [c=4 l=4]  xxswapd_v16qi
> +       xxpermdi 32,0,0,2        # 417  [c=4 l=4]  xxswapd_v16qi
>          li 9,944                 # 1425 [c=4 l=4]  *movdi_internal64/3
>          lxvd2x 0,31,9    # 418  [c=8 l=4]  *vsx_lxvd2x16_le_V16QI
> -       xxpermdi 32,0,0,2        # 419  [c=4 l=4]  xxswapd_v16qi
> -       vmrghb 0,0,1     # 420  [c=4 l=4]  *altivec_vmrglb_internal
> +       xxpermdi 33,0,0,2        # 419  [c=4 l=4]  xxswapd_v16qi
> +       vmrghb 0,1,0     # 420  [c=4 l=4]  altivec_vmrghb_direct
>          xxpermdi 0,32,32,2       # 421  [c=4 l=4]  xxswapd_v16qi
> 
> Seems not necessary to also use \m and \M here to count only ASM here?
> Update the patch as attached.
>
Xionghu Luo Sept. 6, 2021, 12:54 a.m. UTC | #6
Ping^2, thanks.

https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html


On 2021/6/30 09:47, Xionghu Luo via Gcc-patches wrote:
> Gentle ping, thanks.
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html
> 
> 
> On 2021/6/9 16:03, Xionghu Luo via Gcc-patches wrote:
>> Hi,
>>
>> On 2021/6/9 07:25, Segher Boessenkool wrote:
>>> On Mon, May 24, 2021 at 04:02:13AM -0500, Xionghu Luo wrote:
>>>> vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20,
>>>> 5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for 
>>>> vmrghlb.
>>>
>>> (vmrglb)
>>>
>>>> +  if (BYTES_BIG_ENDIAN)
>>>> +    emit_insn (
>>>> +      gen_altivec_vmrghb_direct (operands[0], operands[1], 
>>>> operands[2]));
>>>> +  else
>>>> +    emit_insn (
>>>> +      gen_altivec_vmrglb_direct (operands[0], operands[2], 
>>>> operands[1]));
>>>
>>> Please don't indent like that, it doesn't match what we do elsewhere.
>>> For better or for worse (for worse imo), we use deep hanging indents.
>>> If you have to, you can do something like
>>>
>>>    rtx insn;
>>>    if (BYTES_BIG_ENDIAN)
>>>      insn = gen_altivec_vmrghb_direct (operands[0], operands[1], 
>>> operands[2]);
>>>    else
>>>      insn = gen_altivec_vmrglb_direct (operands[0], operands[2], 
>>> operands[1]);
>>>    emit_insn (insn);
>>>
>>> (this is better even, in that it has only one emit_insn), or even
>>>
>>>    rtx (*fun) () = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct
>>>                    : gen_altivec_vmrglb_direct;
>>>    if (!BYTES_BIG_ENDIAN)
>>>      std::swap (operands[1], operands[2]);
>>>    emit_insn (fun (operands[0], operands[1], operands[2]));
>>>
>>> Well, C++ does not allow that last example like that, sigh, so
>>>    rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
>>> gen_altivec_vmrghb_direct
>>>                         : gen_altivec_vmrglb_direct;
>>>
>>> This is shorter than the other two options ;-)
>>
>> Changed.
>>
>>>
>>>> +(define_insn "altivec_vmrghb_direct"
>>>>     [(set (match_operand:V16QI 0 "register_operand" "=v")
>>>> +    (vec_select:V16QI
>>>
>>> This should be indented one space more.
>>>
>>>>     "TARGET_ALTIVEC"
>>>>     "@
>>>> -   xxmrghw %x0,%x1,%x2
>>>> -   vmrghw %0,%1,%2"
>>>> +  xxmrghw %x0,%x1,%x2
>>>> +  vmrghw %0,%1,%2"
>>>
>>> The original indent was correct, please restore.
>>>
>>>> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
>>>> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, 
>>>> vo));
>>>
>>> When you see a mode as part of a pattern name, chances are that it will
>>> be a good candidate for using parameterized names with.  (But don't do
>>> that now, just keep it in mind as a nice cleanup to do).
>>
>> OK.
>>
>>>
>>>> @@ -23022,8 +23022,8 @@ altivec_expand_vec_perm_const (rtx target, 
>>>> rtx op0, rtx op1,
>>>>          : CODE_FOR_altivec_vmrglh_direct),
>>>>         {  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 
>>>> 22, 23 } },
>>>>       { OPTION_MASK_ALTIVEC,
>>>> -      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct
>>>> -       : CODE_FOR_altivec_vmrglw_direct),
>>>> +      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si
>>>> +       : CODE_FOR_altivec_vmrglw_direct_v4si),
>>>
>>> The correct way is to align the ? and the : (or put everything on one
>>> line of course, if that fits)
>>>
>>> The parens around this are not needed btw, and are a distraction.
>>
>> Changed.
>>
>>>
>>>> --- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
>>>> +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
>>>> @@ -317,10 +317,10 @@ int main ()
>>>>   /* { dg-final { scan-assembler-times "vctuxs" 2 } } */
>>>>   /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */
>>>> -/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */
>>>> +/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */
>>>>   /* { dg-final { scan-assembler-times "vmrghh" 8 } } */
>>>> -/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */
>>>> -/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */
>>>> +/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */
>>>> +/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */
>>>>   /* { dg-final { scan-assembler-times "vmrglh" 8 } } */
>>>>   /* { dg-final { scan-assembler-times "xxlnor" 6 } } */
>>>>   /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */
>>>> @@ -347,7 +347,7 @@ int main ()
>>>>   /* { dg-final { scan-assembler-times "vspltb" 6 } } */
>>>>   /* { dg-final { scan-assembler-times "vspltw" 0 } } */
>>>>   /* { dg-final { scan-assembler-times "vmrgow" 8 } } */
>>>> -/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */
>>>> +/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */
>>>>   /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */
>>>>   /* { dg-final { scan-assembler-times "vmrgew" 8 } } */
>>>>   /* { dg-final { scan-assembler-times "vsplth" 8 } } */
>>>
>>> Are those changes correct?  It looks like a vmrglb became a vmrghb, and
>>> that 4 each of xxmrghw and xxmrglw disappeared?  Both seem wrong?
>>
>>
>> This case is built with "-mdejagnu-cpu=power8 -O0 -mno-fold-gimple -dp"
>> and it also counted the generated instruction patterns.
>>
>> 1) "vsx_xxmrghw_v4si" is replaced by "altivec_vmrglw_direct_v4si/0", 
>> so it decreases from 8 to 4. (Likewise for vsx_xxmrglw_v4si.)
>>
>>          li 9,48          # 1282 [c=4 l=4]  *movdi_internal64/3
>> -       lxvd2x 0,31,9    # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
>> -       xxpermdi 0,0,0,2         # 32   [c=4 l=4]  xxswapd_v4si
>> -       xxmrglw 0,0,12   # 33   [c=4 l=4]  vsx_xxmrghw_v4si
>> +       lxvd2x 12,31,9   # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
>> +       xxpermdi 12,12,12,2      # 32   [c=4 l=4]  xxswapd_v4si
>> +       xxmrglw 0,12,0   # 33   [c=4 l=4]  altivec_vmrglw_direct_v4si/0
>>          xxpermdi 0,0,0,2         # 35   [c=4 l=4]  xxswapd_v4sf
>>
>> Note that v0 and v12 is swapped in lxvd2x, these new 3 instructions
>> produces same result than before.
>>
>> 2) "*altivec_vmrglb_internal" is replaced by "altivec_vmrghb_direct" 
>> with this patch, then vmrglb count decreases from 5 to 4 and vmrghb
>> increases from 5 to 6. (BYTES_BIG_ENDIAN is checked early in RTL 
>> generation instead of final to remove the UNSPECs for potential 
>> optimization through backend.)
>>
>>          li 9,928                 # 1424 [c=4 l=4]  *movdi_internal64/3
>>          lxvd2x 0,31,9    # 416  [c=8 l=4]  *vsx_lxvd2x16_le_V16QI
>> -       xxpermdi 33,0,0,2        # 417  [c=4 l=4]  xxswapd_v16qi
>> +       xxpermdi 32,0,0,2        # 417  [c=4 l=4]  xxswapd_v16qi
>>          li 9,944                 # 1425 [c=4 l=4]  *movdi_internal64/3
>>          lxvd2x 0,31,9    # 418  [c=8 l=4]  *vsx_lxvd2x16_le_V16QI
>> -       xxpermdi 32,0,0,2        # 419  [c=4 l=4]  xxswapd_v16qi
>> -       vmrghb 0,0,1     # 420  [c=4 l=4]  *altivec_vmrglb_internal
>> +       xxpermdi 33,0,0,2        # 419  [c=4 l=4]  xxswapd_v16qi
>> +       vmrghb 0,1,0     # 420  [c=4 l=4]  altivec_vmrghb_direct
>>          xxpermdi 0,32,32,2       # 421  [c=4 l=4]  xxswapd_v16qi
>>
>> Seems not necessary to also use \m and \M here to count only ASM here?
>> Update the patch as attached.
>>
>
David Edelsohn Oct. 12, 2021, 10:51 p.m. UTC | #7
Hi, Xionghu

What's the status of the \M and \m testcase beautification requested
by Segher?  Did you send an updated patch? Your messages ping the
version prior to Segher's additional comments.

It seems that the changes to the patterns are complete, but there are
remaining questions about the testcase style and if the instruction
counts are ideal. I trust that the instruction counts match the
behavior after the patch, but it seemed that Segher wanted to confirm
that the counts are the values desired / expected from optimal code
generation.  The counts are the total for the file, which doesn't
communicate if the sequences themselves are optimal.

Thanks, David

On Sun, Sep 5, 2021 at 8:54 PM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>
> Ping^2, thanks.
>
> https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html
>
>
> On 2021/6/30 09:47, Xionghu Luo via Gcc-patches wrote:
> > Gentle ping, thanks.
> >
> > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html
> >
> >
> > On 2021/6/9 16:03, Xionghu Luo via Gcc-patches wrote:
> >> Hi,
> >>
> >> On 2021/6/9 07:25, Segher Boessenkool wrote:
> >>> On Mon, May 24, 2021 at 04:02:13AM -0500, Xionghu Luo wrote:
> >>>> vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20,
> >>>> 5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for
> >>>> vmrghlb.
> >>>
> >>> (vmrglb)
> >>>
> >>>> +  if (BYTES_BIG_ENDIAN)
> >>>> +    emit_insn (
> >>>> +      gen_altivec_vmrghb_direct (operands[0], operands[1],
> >>>> operands[2]));
> >>>> +  else
> >>>> +    emit_insn (
> >>>> +      gen_altivec_vmrglb_direct (operands[0], operands[2],
> >>>> operands[1]));
> >>>
> >>> Please don't indent like that, it doesn't match what we do elsewhere.
> >>> For better or for worse (for worse imo), we use deep hanging indents.
> >>> If you have to, you can do something like
> >>>
> >>>    rtx insn;
> >>>    if (BYTES_BIG_ENDIAN)
> >>>      insn = gen_altivec_vmrghb_direct (operands[0], operands[1],
> >>> operands[2]);
> >>>    else
> >>>      insn = gen_altivec_vmrglb_direct (operands[0], operands[2],
> >>> operands[1]);
> >>>    emit_insn (insn);
> >>>
> >>> (this is better even, in that it has only one emit_insn), or even
> >>>
> >>>    rtx (*fun) () = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct
> >>>                    : gen_altivec_vmrglb_direct;
> >>>    if (!BYTES_BIG_ENDIAN)
> >>>      std::swap (operands[1], operands[2]);
> >>>    emit_insn (fun (operands[0], operands[1], operands[2]));
> >>>
> >>> Well, C++ does not allow that last example like that, sigh, so
> >>>    rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ?
> >>> gen_altivec_vmrghb_direct
> >>>                         : gen_altivec_vmrglb_direct;
> >>>
> >>> This is shorter than the other two options ;-)
> >>
> >> Changed.
> >>
> >>>
> >>>> +(define_insn "altivec_vmrghb_direct"
> >>>>     [(set (match_operand:V16QI 0 "register_operand" "=v")
> >>>> +    (vec_select:V16QI
> >>>
> >>> This should be indented one space more.
> >>>
> >>>>     "TARGET_ALTIVEC"
> >>>>     "@
> >>>> -   xxmrghw %x0,%x1,%x2
> >>>> -   vmrghw %0,%1,%2"
> >>>> +  xxmrghw %x0,%x1,%x2
> >>>> +  vmrghw %0,%1,%2"
> >>>
> >>> The original indent was correct, please restore.
> >>>
> >>>> -      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
> >>>> +      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve,
> >>>> vo));
> >>>
> >>> When you see a mode as part of a pattern name, chances are that it will
> >>> be a good candidate for using parameterized names with.  (But don't do
> >>> that now, just keep it in mind as a nice cleanup to do).
> >>
> >> OK.
> >>
> >>>
> >>>> @@ -23022,8 +23022,8 @@ altivec_expand_vec_perm_const (rtx target,
> >>>> rtx op0, rtx op1,
> >>>>          : CODE_FOR_altivec_vmrglh_direct),
> >>>>         {  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7,
> >>>> 22, 23 } },
> >>>>       { OPTION_MASK_ALTIVEC,
> >>>> -      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct
> >>>> -       : CODE_FOR_altivec_vmrglw_direct),
> >>>> +      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si
> >>>> +       : CODE_FOR_altivec_vmrglw_direct_v4si),
> >>>
> >>> The correct way is to align the ? and the : (or put everything on one
> >>> line of course, if that fits)
> >>>
> >>> The parens around this are not needed btw, and are a distraction.
> >>
> >> Changed.
> >>
> >>>
> >>>> --- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> >>>> +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
> >>>> @@ -317,10 +317,10 @@ int main ()
> >>>>   /* { dg-final { scan-assembler-times "vctuxs" 2 } } */
> >>>>   /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */
> >>>> -/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */
> >>>> +/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */
> >>>>   /* { dg-final { scan-assembler-times "vmrghh" 8 } } */
> >>>> -/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */
> >>>> -/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */
> >>>> +/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */
> >>>> +/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */
> >>>>   /* { dg-final { scan-assembler-times "vmrglh" 8 } } */
> >>>>   /* { dg-final { scan-assembler-times "xxlnor" 6 } } */
> >>>>   /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */
> >>>> @@ -347,7 +347,7 @@ int main ()
> >>>>   /* { dg-final { scan-assembler-times "vspltb" 6 } } */
> >>>>   /* { dg-final { scan-assembler-times "vspltw" 0 } } */
> >>>>   /* { dg-final { scan-assembler-times "vmrgow" 8 } } */
> >>>> -/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */
> >>>> +/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */
> >>>>   /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */
> >>>>   /* { dg-final { scan-assembler-times "vmrgew" 8 } } */
> >>>>   /* { dg-final { scan-assembler-times "vsplth" 8 } } */
> >>>
> >>> Are those changes correct?  It looks like a vmrglb became a vmrghb, and
> >>> that 4 each of xxmrghw and xxmrglw disappeared?  Both seem wrong?
> >>
> >>
> >> This case is built with "-mdejagnu-cpu=power8 -O0 -mno-fold-gimple -dp"
> >> and it also counted the generated instruction patterns.
> >>
> >> 1) "vsx_xxmrghw_v4si" is replaced by "altivec_vmrglw_direct_v4si/0",
> >> so it decreases from 8 to 4. (Likewise for vsx_xxmrglw_v4si.)
> >>
> >>          li 9,48          # 1282 [c=4 l=4]  *movdi_internal64/3
> >> -       lxvd2x 0,31,9    # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
> >> -       xxpermdi 0,0,0,2         # 32   [c=4 l=4]  xxswapd_v4si
> >> -       xxmrglw 0,0,12   # 33   [c=4 l=4]  vsx_xxmrghw_v4si
> >> +       lxvd2x 12,31,9   # 31   [c=8 l=4]  *vsx_lxvd2x4_le_v4si
> >> +       xxpermdi 12,12,12,2      # 32   [c=4 l=4]  xxswapd_v4si
> >> +       xxmrglw 0,12,0   # 33   [c=4 l=4]  altivec_vmrglw_direct_v4si/0
> >>          xxpermdi 0,0,0,2         # 35   [c=4 l=4]  xxswapd_v4sf
> >>
> >> Note that v0 and v12 is swapped in lxvd2x, these new 3 instructions
> >> produces same result than before.
> >>
> >> 2) "*altivec_vmrglb_internal" is replaced by "altivec_vmrghb_direct"
> >> with this patch, then vmrglb count decreases from 5 to 4 and vmrghb
> >> increases from 5 to 6. (BYTES_BIG_ENDIAN is checked early in RTL
> >> generation instead of final to remove the UNSPECs for potential
> >> optimization through backend.)
> >>
> >>          li 9,928                 # 1424 [c=4 l=4]  *movdi_internal64/3
> >>          lxvd2x 0,31,9    # 416  [c=8 l=4]  *vsx_lxvd2x16_le_V16QI
> >> -       xxpermdi 33,0,0,2        # 417  [c=4 l=4]  xxswapd_v16qi
> >> +       xxpermdi 32,0,0,2        # 417  [c=4 l=4]  xxswapd_v16qi
> >>          li 9,944                 # 1425 [c=4 l=4]  *movdi_internal64/3
> >>          lxvd2x 0,31,9    # 418  [c=8 l=4]  *vsx_lxvd2x16_le_V16QI
> >> -       xxpermdi 32,0,0,2        # 419  [c=4 l=4]  xxswapd_v16qi
> >> -       vmrghb 0,0,1     # 420  [c=4 l=4]  *altivec_vmrglb_internal
> >> +       xxpermdi 33,0,0,2        # 419  [c=4 l=4]  xxswapd_v16qi
> >> +       vmrghb 0,1,0     # 420  [c=4 l=4]  altivec_vmrghb_direct
> >>          xxpermdi 0,32,32,2       # 421  [c=4 l=4]  xxswapd_v16qi
> >>
> >> Seems not necessary to also use \m and \M here to count only ASM here?
> >> Update the patch as attached.
> >>
> >
>
> --
> Thanks,
> Xionghu
Xionghu Luo Oct. 13, 2021, 1:59 a.m. UTC | #8
Thanks David,

On 2021/10/13 06:51, David Edelsohn wrote:
> Hi, Xionghu
> 
> What's the status of the \M and \m testcase beautification requested
> by Segher?  Did you send an updated patch? Your messages ping the
> version prior to Segher's additional comments.

The pinged link already answered Segher's questions and included a patch
pasted in it.  To follow Segher's preference ;), I just post a v2 patch
here:

https://gcc.gnu.org/pipermail/gcc-patches/2021-October/581497.html

\M and \m are actually not quite necessary to the testcase
gcc.target/powerpc/builtins-1.c since it is built with
"-mdejagnu-cpu=power8 -O0 -mno-fold-gimple -dp", so the testcase also counts
the generated instruction patterns.

> 
> It seems that the changes to the patterns are complete, but there are
> remaining questions about the testcase style and if the instruction
> counts are ideal. I trust that the instruction counts match the
> behavior after the patch, but it seemed that Segher wanted to confirm
> that the counts are the values desired / expected from optimal code
> generation.  The counts are the total for the file, which doesn't
> communicate if the sequences themselves are optimal.

Will rebase and retest after Segher's review of the v2 patch.
diff mbox series

Patch

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 208d6343225..cae05be2c2d 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -143,8 +143,6 @@  (define_c_enum "unspec"
    UNSPEC_VUPKHU_V4SF
    UNSPEC_VUPKLU_V4SF
    UNSPEC_VGBBD
-   UNSPEC_VMRGH_DIRECT
-   UNSPEC_VMRGL_DIRECT
    UNSPEC_VSPLT_DIRECT
    UNSPEC_VMRGEW_DIRECT
    UNSPEC_VMRGOW_DIRECT
@@ -1291,44 +1289,29 @@  (define_expand "altivec_vmrghb"
    (use (match_operand:V16QI 2 "register_operand"))]
   "TARGET_ALTIVEC"
 {
-  rtvec v = gen_rtvec (16, GEN_INT (0), GEN_INT (16), GEN_INT (1), GEN_INT (17),
-		       GEN_INT (2), GEN_INT (18), GEN_INT (3), GEN_INT (19),
-		       GEN_INT (4), GEN_INT (20), GEN_INT (5), GEN_INT (21),
-		       GEN_INT (6), GEN_INT (22), GEN_INT (7), GEN_INT (23));
-  rtx x = gen_rtx_VEC_CONCAT (V32QImode, operands[1], operands[2]);
-  x = gen_rtx_VEC_SELECT (V16QImode, x, gen_rtx_PARALLEL (VOIDmode, v));
-  emit_insn (gen_rtx_SET (operands[0], x));
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (
+      gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (
+      gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1]));
   DONE;
 })
 
-(define_insn "*altivec_vmrghb_internal"
+(define_insn "altivec_vmrghb_direct"
   [(set (match_operand:V16QI 0 "register_operand" "=v")
-        (vec_select:V16QI
+    (vec_select:V16QI
 	  (vec_concat:V32QI
 	    (match_operand:V16QI 1 "register_operand" "v")
 	    (match_operand:V16QI 2 "register_operand" "v"))
-	  (parallel [(const_int 0) (const_int 16)
-		     (const_int 1) (const_int 17)
-		     (const_int 2) (const_int 18)
-		     (const_int 3) (const_int 19)
-		     (const_int 4) (const_int 20)
-		     (const_int 5) (const_int 21)
-		     (const_int 6) (const_int 22)
-		     (const_int 7) (const_int 23)])))]
-  "TARGET_ALTIVEC"
-{
-  if (BYTES_BIG_ENDIAN)
-    return "vmrghb %0,%1,%2";
-  else
-    return "vmrglb %0,%2,%1";
-}
-  [(set_attr "type" "vecperm")])
-
-(define_insn "altivec_vmrghb_direct"
-  [(set (match_operand:V16QI 0 "register_operand" "=v")
-	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
-		       (match_operand:V16QI 2 "register_operand" "v")]
-		      UNSPEC_VMRGH_DIRECT))]
+	  (parallel [(const_int  0) (const_int 16)
+		     (const_int  1) (const_int 17)
+		     (const_int  2) (const_int 18)
+		     (const_int  3) (const_int 19)
+		     (const_int  4) (const_int 20)
+		     (const_int  5) (const_int 21)
+		     (const_int  6) (const_int 22)
+		     (const_int  7) (const_int 23)])))]
   "TARGET_ALTIVEC"
   "vmrghb %0,%1,%2"
   [(set_attr "type" "vecperm")])
@@ -1339,16 +1322,16 @@  (define_expand "altivec_vmrghh"
    (use (match_operand:V8HI 2 "register_operand"))]
   "TARGET_ALTIVEC"
 {
-  rtvec v = gen_rtvec (8, GEN_INT (0), GEN_INT (8), GEN_INT (1), GEN_INT (9),
-		       GEN_INT (2), GEN_INT (10), GEN_INT (3), GEN_INT (11));
-  rtx x = gen_rtx_VEC_CONCAT (V16HImode, operands[1], operands[2]);
-
-  x = gen_rtx_VEC_SELECT (V8HImode, x, gen_rtx_PARALLEL (VOIDmode, v));
-  emit_insn (gen_rtx_SET (operands[0], x));
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (
+      gen_altivec_vmrghh_direct (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (
+      gen_altivec_vmrglh_direct (operands[0], operands[2], operands[1]));
   DONE;
 })
 
-(define_insn "*altivec_vmrghh_internal"
+(define_insn "altivec_vmrghh_direct"
   [(set (match_operand:V8HI 0 "register_operand" "=v")
         (vec_select:V8HI
 	  (vec_concat:V16HI
@@ -1359,20 +1342,6 @@  (define_insn "*altivec_vmrghh_internal"
 		     (const_int 2) (const_int 10)
 		     (const_int 3) (const_int 11)])))]
   "TARGET_ALTIVEC"
-{
-  if (BYTES_BIG_ENDIAN)
-    return "vmrghh %0,%1,%2";
-  else
-    return "vmrglh %0,%2,%1";
-}
-  [(set_attr "type" "vecperm")])
-
-(define_insn "altivec_vmrghh_direct"
-  [(set (match_operand:V8HI 0 "register_operand" "=v")
-        (unspec:V8HI [(match_operand:V8HI 1 "register_operand" "v")
-                      (match_operand:V8HI 2 "register_operand" "v")]
-                     UNSPEC_VMRGH_DIRECT))]
-  "TARGET_ALTIVEC"
   "vmrghh %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
@@ -1382,39 +1351,27 @@  (define_expand "altivec_vmrghw"
    (use (match_operand:V4SI 2 "register_operand"))]
   "VECTOR_MEM_ALTIVEC_P (V4SImode)"
 {
-  rtvec v = gen_rtvec (4, GEN_INT (0), GEN_INT (4), GEN_INT (1), GEN_INT (5));
-  rtx x = gen_rtx_VEC_CONCAT (V8SImode, operands[1], operands[2]);
-  x = gen_rtx_VEC_SELECT (V4SImode, x, gen_rtx_PARALLEL (VOIDmode, v));
-  emit_insn (gen_rtx_SET (operands[0], x));
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (
+      gen_altivec_vmrghw_direct_v4si (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (
+      gen_altivec_vmrglw_direct_v4si (operands[0], operands[2], operands[1]));
   DONE;
 })
 
-(define_insn "*altivec_vmrghw_internal"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (vec_select:V4SI
-	  (vec_concat:V8SI
-	    (match_operand:V4SI 1 "register_operand" "v")
-	    (match_operand:V4SI 2 "register_operand" "v"))
+(define_insn "altivec_vmrghw_direct_<mode>"
+  [(set (match_operand:VSX_W 0 "register_operand" "=wa,v")
+        (vec_select:VSX_W
+	  (vec_concat:<VS_double>
+	    (match_operand:VSX_W 1 "register_operand" "wa,v")
+	    (match_operand:VSX_W 2 "register_operand" "wa,v"))
 	  (parallel [(const_int 0) (const_int 4)
 		     (const_int 1) (const_int 5)])))]
-  "VECTOR_MEM_ALTIVEC_P (V4SImode)"
-{
-  if (BYTES_BIG_ENDIAN)
-    return "vmrghw %0,%1,%2";
-  else
-    return "vmrglw %0,%2,%1";
-}
-  [(set_attr "type" "vecperm")])
-
-(define_insn "altivec_vmrghw_direct"
-  [(set (match_operand:V4SI 0 "register_operand" "=wa,v")
-	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "wa,v")
-		      (match_operand:V4SI 2 "register_operand" "wa,v")]
-		     UNSPEC_VMRGH_DIRECT))]
   "TARGET_ALTIVEC"
   "@
-   xxmrghw %x0,%x1,%x2
-   vmrghw %0,%1,%2"
+  xxmrghw %x0,%x1,%x2
+  vmrghw %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
 (define_insn "*altivec_vmrghsf"
@@ -1440,19 +1397,18 @@  (define_expand "altivec_vmrglb"
    (use (match_operand:V16QI 2 "register_operand"))]
   "TARGET_ALTIVEC"
 {
-  rtvec v = gen_rtvec (16, GEN_INT (8), GEN_INT (24), GEN_INT (9), GEN_INT (25),
-		       GEN_INT (10), GEN_INT (26), GEN_INT (11), GEN_INT (27),
-		       GEN_INT (12), GEN_INT (28), GEN_INT (13), GEN_INT (29),
-		       GEN_INT (14), GEN_INT (30), GEN_INT (15), GEN_INT (31));
-  rtx x = gen_rtx_VEC_CONCAT (V32QImode, operands[1], operands[2]);
-  x = gen_rtx_VEC_SELECT (V16QImode, x, gen_rtx_PARALLEL (VOIDmode, v));
-  emit_insn (gen_rtx_SET (operands[0], x));
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (
+      gen_altivec_vmrglb_direct (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (
+      gen_altivec_vmrghb_direct (operands[0], operands[2], operands[1]));
   DONE;
 })
 
-(define_insn "*altivec_vmrglb_internal"
+(define_insn "altivec_vmrglb_direct"
   [(set (match_operand:V16QI 0 "register_operand" "=v")
-        (vec_select:V16QI
+    (vec_select:V16QI
 	  (vec_concat:V32QI
 	    (match_operand:V16QI 1 "register_operand" "v")
 	    (match_operand:V16QI 2 "register_operand" "v"))
@@ -1465,20 +1421,6 @@  (define_insn "*altivec_vmrglb_internal"
 		     (const_int 14) (const_int 30)
 		     (const_int 15) (const_int 31)])))]
   "TARGET_ALTIVEC"
-{
-  if (BYTES_BIG_ENDIAN)
-    return "vmrglb %0,%1,%2";
-  else
-    return "vmrghb %0,%2,%1";
-}
-  [(set_attr "type" "vecperm")])
-
-(define_insn "altivec_vmrglb_direct"
-  [(set (match_operand:V16QI 0 "register_operand" "=v")
-	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")
-		       (match_operand:V16QI 2 "register_operand" "v")]
-		      UNSPEC_VMRGL_DIRECT))]
-  "TARGET_ALTIVEC"
   "vmrglb %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
@@ -1488,15 +1430,16 @@  (define_expand "altivec_vmrglh"
    (use (match_operand:V8HI 2 "register_operand"))]
   "TARGET_ALTIVEC"
 {
-  rtvec v = gen_rtvec (8, GEN_INT (4), GEN_INT (12), GEN_INT (5), GEN_INT (13),
-		       GEN_INT (6), GEN_INT (14), GEN_INT (7), GEN_INT (15));
-  rtx x = gen_rtx_VEC_CONCAT (V16HImode, operands[1], operands[2]);
-  x = gen_rtx_VEC_SELECT (V8HImode, x, gen_rtx_PARALLEL (VOIDmode, v));
-  emit_insn (gen_rtx_SET (operands[0], x));
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (
+      gen_altivec_vmrglh_direct (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (
+      gen_altivec_vmrghh_direct (operands[0], operands[2], operands[1]));
   DONE;
 })
 
-(define_insn "*altivec_vmrglh_internal"
+(define_insn "altivec_vmrglh_direct"
   [(set (match_operand:V8HI 0 "register_operand" "=v")
         (vec_select:V8HI
 	  (vec_concat:V16HI
@@ -1507,20 +1450,6 @@  (define_insn "*altivec_vmrglh_internal"
 		     (const_int 6) (const_int 14)
 		     (const_int 7) (const_int 15)])))]
   "TARGET_ALTIVEC"
-{
-  if (BYTES_BIG_ENDIAN)
-    return "vmrglh %0,%1,%2";
-  else
-    return "vmrghh %0,%2,%1";
-}
-  [(set_attr "type" "vecperm")])
-
-(define_insn "altivec_vmrglh_direct"
-  [(set (match_operand:V8HI 0 "register_operand" "=v")
-        (unspec:V8HI [(match_operand:V8HI 1 "register_operand" "v")
-		      (match_operand:V8HI 2 "register_operand" "v")]
-		     UNSPEC_VMRGL_DIRECT))]
-  "TARGET_ALTIVEC"
   "vmrglh %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
@@ -1530,39 +1459,27 @@  (define_expand "altivec_vmrglw"
    (use (match_operand:V4SI 2 "register_operand"))]
   "VECTOR_MEM_ALTIVEC_P (V4SImode)"
 {
-  rtvec v = gen_rtvec (4, GEN_INT (2), GEN_INT (6), GEN_INT (3), GEN_INT (7));
-  rtx x = gen_rtx_VEC_CONCAT (V8SImode, operands[1], operands[2]);
-  x = gen_rtx_VEC_SELECT (V4SImode, x, gen_rtx_PARALLEL (VOIDmode, v));
-  emit_insn (gen_rtx_SET (operands[0], x));
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (
+      gen_altivec_vmrglw_direct_v4si (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (
+      gen_altivec_vmrghw_direct_v4si (operands[0], operands[2], operands[1]));
   DONE;
 })
 
-(define_insn "*altivec_vmrglw_internal"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (vec_select:V4SI
-	  (vec_concat:V8SI
-	    (match_operand:V4SI 1 "register_operand" "v")
-	    (match_operand:V4SI 2 "register_operand" "v"))
+(define_insn "altivec_vmrglw_direct_<mode>"
+  [(set (match_operand:VSX_W 0 "register_operand" "=wa,v")
+        (vec_select:VSX_W
+	  (vec_concat:<VS_double>
+	    (match_operand:VSX_W 1 "register_operand" "wa,v")
+	    (match_operand:VSX_W 2 "register_operand" "wa,v"))
 	  (parallel [(const_int 2) (const_int 6)
 		     (const_int 3) (const_int 7)])))]
-  "VECTOR_MEM_ALTIVEC_P (V4SImode)"
-{
-  if (BYTES_BIG_ENDIAN)
-    return "vmrglw %0,%1,%2";
-  else
-    return "vmrghw %0,%2,%1";
-}
-  [(set_attr "type" "vecperm")])
-
-(define_insn "altivec_vmrglw_direct"
-  [(set (match_operand:V4SI 0 "register_operand" "=wa,v")
-	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "wa,v")
-		      (match_operand:V4SI 2 "register_operand" "wa,v")]
-		     UNSPEC_VMRGL_DIRECT))]
   "TARGET_ALTIVEC"
   "@
-   xxmrglw %x0,%x1,%x2
-   vmrglw %0,%1,%2"
+  xxmrglw %x0,%x1,%x2
+  vmrglw %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
 (define_insn "*altivec_vmrglsf"
@@ -3929,13 +3846,13 @@  (define_expand "vec_widen_umult_hi_v8hi"
     {
       emit_insn (gen_altivec_vmuleuh (ve, operands[1], operands[2]));
       emit_insn (gen_altivec_vmulouh (vo, operands[1], operands[2]));
-      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
+      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));
     }
   else
     {
       emit_insn (gen_altivec_vmulouh (ve, operands[1], operands[2]));
       emit_insn (gen_altivec_vmuleuh (vo, operands[1], operands[2]));
-      emit_insn (gen_altivec_vmrghw_direct (operands[0], vo, ve));
+      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], vo, ve));
     }
   DONE;
 })
@@ -3954,13 +3871,13 @@  (define_expand "vec_widen_umult_lo_v8hi"
     {
       emit_insn (gen_altivec_vmuleuh (ve, operands[1], operands[2]));
       emit_insn (gen_altivec_vmulouh (vo, operands[1], operands[2]));
-      emit_insn (gen_altivec_vmrglw_direct (operands[0], ve, vo));
+      emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], ve, vo));
     }
   else
     {
       emit_insn (gen_altivec_vmulouh (ve, operands[1], operands[2]));
       emit_insn (gen_altivec_vmuleuh (vo, operands[1], operands[2]));
-      emit_insn (gen_altivec_vmrglw_direct (operands[0], vo, ve));
+      emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], vo, ve));
     }
   DONE;
 })
@@ -3979,13 +3896,13 @@  (define_expand "vec_widen_smult_hi_v8hi"
     {
       emit_insn (gen_altivec_vmulesh (ve, operands[1], operands[2]));
       emit_insn (gen_altivec_vmulosh (vo, operands[1], operands[2]));
-      emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo));
+      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));
     }
   else
     {
       emit_insn (gen_altivec_vmulosh (ve, operands[1], operands[2]));
       emit_insn (gen_altivec_vmulesh (vo, operands[1], operands[2]));
-      emit_insn (gen_altivec_vmrghw_direct (operands[0], vo, ve));
+      emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], vo, ve));
     }
   DONE;
 })
@@ -4004,13 +3921,13 @@  (define_expand "vec_widen_smult_lo_v8hi"
     {
       emit_insn (gen_altivec_vmulesh (ve, operands[1], operands[2]));
       emit_insn (gen_altivec_vmulosh (vo, operands[1], operands[2]));
-      emit_insn (gen_altivec_vmrglw_direct (operands[0], ve, vo));
+      emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], ve, vo));
     }
   else
     {
       emit_insn (gen_altivec_vmulosh (ve, operands[1], operands[2]));
       emit_insn (gen_altivec_vmulesh (vo, operands[1], operands[2]));
-      emit_insn (gen_altivec_vmrglw_direct (operands[0], vo, ve));
+      emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], vo, ve));
     }
   DONE;
 })
diff --git a/gcc/config/rs6000/rs6000-p8swap.c b/gcc/config/rs6000/rs6000-p8swap.c
index ad2b3023819..ec503ab742f 100644
--- a/gcc/config/rs6000/rs6000-p8swap.c
+++ b/gcc/config/rs6000/rs6000-p8swap.c
@@ -744,8 +744,6 @@  rtx_is_swappable_p (rtx op, unsigned int *special)
 	  default:
 	    break;
 	  case UNSPEC_VBPERMQ:
-	  case UNSPEC_VMRGH_DIRECT:
-	  case UNSPEC_VMRGL_DIRECT:
 	  case UNSPEC_VPACK_SIGN_SIGN_SAT:
 	  case UNSPEC_VPACK_SIGN_UNS_SAT:
 	  case UNSPEC_VPACK_UNS_UNS_MOD:
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 5a5202c455b..ad11b67b125 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -23013,7 +23013,7 @@  altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
       {  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
     { OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vpkuwum_direct,
       {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
-    { OPTION_MASK_ALTIVEC, 
+    { OPTION_MASK_ALTIVEC,
       (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghb_direct
        : CODE_FOR_altivec_vmrglb_direct),
       {  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23 } },
@@ -23022,8 +23022,8 @@  altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
        : CODE_FOR_altivec_vmrglh_direct),
       {  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23 } },
     { OPTION_MASK_ALTIVEC,
-      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct
-       : CODE_FOR_altivec_vmrglw_direct),
+      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si
+       : CODE_FOR_altivec_vmrglw_direct_v4si),
       {  0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23 } },
     { OPTION_MASK_ALTIVEC,
       (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglb_direct
@@ -23034,8 +23034,8 @@  altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
        : CODE_FOR_altivec_vmrghh_direct),
       {  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
     { OPTION_MASK_ALTIVEC,
-      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglw_direct
-       : CODE_FOR_altivec_vmrghw_direct),
+      (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglw_direct_v4si
+       : CODE_FOR_altivec_vmrghw_direct_v4si),
       {  8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
     { OPTION_MASK_P8_VECTOR,
       (BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgew_v4sf_direct
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 55d830d0f20..2214544c047 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -4568,7 +4568,7 @@  (define_insn "vsx_xxspltd_<mode>"
   [(set_attr "type" "vecperm")])
 
 ;; V4SF/V4SI interleave
-(define_insn "vsx_xxmrghw_<mode>"
+(define_expand "vsx_xxmrghw_<mode>"
   [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
         (vec_select:VSX_W
 	  (vec_concat:<VS_double>
@@ -4579,13 +4579,16 @@  (define_insn "vsx_xxmrghw_<mode>"
   "VECTOR_MEM_VSX_P (<MODE>mode)"
 {
   if (BYTES_BIG_ENDIAN)
-    return "xxmrghw %x0,%x1,%x2";
+    emit_insn (
+      gen_altivec_vmrghw_direct_<mode> (operands[0], operands[1], operands[2]));
   else
-    return "xxmrglw %x0,%x2,%x1";
+    emit_insn (
+      gen_altivec_vmrglw_direct_<mode> (operands[0], operands[2], operands[1]));
+  DONE;
 }
   [(set_attr "type" "vecperm")])
 
-(define_insn "vsx_xxmrglw_<mode>"
+(define_expand "vsx_xxmrglw_<mode>"
   [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa")
 	(vec_select:VSX_W
 	  (vec_concat:<VS_double>
@@ -4596,9 +4599,12 @@  (define_insn "vsx_xxmrglw_<mode>"
   "VECTOR_MEM_VSX_P (<MODE>mode)"
 {
   if (BYTES_BIG_ENDIAN)
-    return "xxmrglw %x0,%x1,%x2";
+    emit_insn (
+      gen_altivec_vmrglw_direct_<mode> (operands[0], operands[1], operands[2]));
   else
-    return "xxmrghw %x0,%x2,%x1";
+    emit_insn (
+      gen_altivec_vmrghw_direct_<mode> (operands[0], operands[2], operands[1]));
+  DONE;
 }
   [(set_attr "type" "vecperm")])
 
diff --git a/gcc/testsuite/gcc.target/powerpc/builtins-1.c b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
index 3ec1024a955..63fbd2e3be1 100644
--- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
@@ -317,10 +317,10 @@  int main ()
 /* { dg-final { scan-assembler-times "vctuxs" 2 } } */
 
 /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */
-/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */
+/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */
 /* { dg-final { scan-assembler-times "vmrghh" 8 } } */
-/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */
-/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */
+/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */
+/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */
 /* { dg-final { scan-assembler-times "vmrglh" 8 } } */
 /* { dg-final { scan-assembler-times "xxlnor" 6 } } */
 /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */
@@ -347,7 +347,7 @@  int main ()
 /* { dg-final { scan-assembler-times "vspltb" 6 } } */
 /* { dg-final { scan-assembler-times "vspltw" 0 } } */
 /* { dg-final { scan-assembler-times "vmrgow" 8 } } */
-/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */
+/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */
 /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */
 /* { dg-final { scan-assembler-times "vmrgew" 8 } } */
 /* { dg-final { scan-assembler-times "vsplth" 8 } } */