diff mbox

[4.8,rs6000] (Re: [PATCH, rs6000] More efficient vector permute for little endian)

Message ID 1396644313.5401.65.camel@gnopaine
State New
Headers show

Commit Message

Bill Schmidt April 4, 2014, 8:45 p.m. UTC
On Thu, 2014-03-20 at 20:38 -0500, Bill Schmidt wrote:
> The original workaround for vector permute on a little endian platform
> includes subtracting each element of the permute control vector from 31.
> Because the upper 3 bits of each element are unimportant, this was
> implemented as subtracting the whole vector from a splat of -1.  On
> reflection this can be done more efficiently with a vector nor
> operation.  This patch makes that change.

This patch was approved and committed to trunk and to the IBM 4.8
branch.  I would like approval to commit it to the FSF 4.8 branch as
well.  Per Richard Henderson's previous comment, I have changed the
patch slightly to avoid the use of emit_move_insn.

Bootstrapped and tested on powerpc64le-unknown-linux-gnu.  Previous
version burned in on trunk and IBM 4.8 branch for about two weeks.  Is
this ok for FSF 4.8?

Thanks,
Bill


2014-04-04  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* config/rs6000/rs6000.c (rs6000_expand_vector_set): Generate a
	pattern for vector nor instead of subtract from splat(-1).
	(altivec_expand_vec_perm_le): Likewise.

Comments

Richard Henderson April 4, 2014, 9:47 p.m. UTC | #1
On 04/04/2014 01:45 PM, Bill Schmidt wrote:
>  Per Richard Henderson's previous comment, I have changed the
> patch slightly to avoid the use of emit_move_insn.

Thanks.


r~
Bill Schmidt April 7, 2014, 11:14 p.m. UTC | #2
Hi,

I'm withdrawing this request: I just discovered that it will sometimes
be advantageous to use vnand rather than vnor.  I will rework the patch
and get back to you.

Thanks,
Bill

On Fri, 2014-04-04 at 15:45 -0500, Bill Schmidt wrote:
> On Thu, 2014-03-20 at 20:38 -0500, Bill Schmidt wrote:
> > The original workaround for vector permute on a little endian platform
> > includes subtracting each element of the permute control vector from 31.
> > Because the upper 3 bits of each element are unimportant, this was
> > implemented as subtracting the whole vector from a splat of -1.  On
> > reflection this can be done more efficiently with a vector nor
> > operation.  This patch makes that change.
> 
> This patch was approved and committed to trunk and to the IBM 4.8
> branch.  I would like approval to commit it to the FSF 4.8 branch as
> well.  Per Richard Henderson's previous comment, I have changed the
> patch slightly to avoid the use of emit_move_insn.
> 
> Bootstrapped and tested on powerpc64le-unknown-linux-gnu.  Previous
> version burned in on trunk and IBM 4.8 branch for about two weeks.  Is
> this ok for FSF 4.8?
> 
> Thanks,
> Bill
> 
> 
> 2014-04-04  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	* config/rs6000/rs6000.c (rs6000_expand_vector_set): Generate a
> 	pattern for vector nor instead of subtract from splat(-1).
> 	(altivec_expand_vec_perm_le): Likewise.
> 
> 
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c	(revision 209122)
> +++ gcc/config/rs6000/rs6000.c	(working copy)
> @@ -5621,12 +5621,10 @@ rs6000_expand_vector_set (rtx target, rtx val, int
>    else 
>      {
>        /* Invert selector.  */
> -      rtx splat = gen_rtx_VEC_DUPLICATE (V16QImode,
> -					 gen_rtx_CONST_INT (QImode, -1));
> +      rtx notx = gen_rtx_NOT (V16QImode, force_reg (V16QImode, x));
> +      rtx andx = gen_rtx_AND (V16QImode, notx, notx);
>        rtx tmp = gen_reg_rtx (V16QImode);
> -      emit_move_insn (tmp, splat);
> -      x = gen_rtx_MINUS (V16QImode, tmp, force_reg (V16QImode, x));
> -      emit_move_insn (tmp, x);
> +      emit_insn (gen_rtx_SET (VOIDmode, tmp, andx));
>  
>        /* Permute with operands reversed and adjusted selector.  */
>        x = gen_rtx_UNSPEC (mode, gen_rtvec (3, reg, target, tmp),
> @@ -30335,18 +30333,18 @@ altivec_expand_vec_perm_const_le (rtx operands[4])
>  
>  /* Similarly to altivec_expand_vec_perm_const_le, we must adjust the
>     permute control vector.  But here it's not a constant, so we must
> -   generate a vector splat/subtract to do the adjustment.  */
> +   generate a vector NOR to do the adjustment.  */
>  
>  void
>  altivec_expand_vec_perm_le (rtx operands[4])
>  {
> -  rtx splat, unspec;
> +  rtx notx, andx, unspec;
>    rtx target = operands[0];
>    rtx op0 = operands[1];
>    rtx op1 = operands[2];
>    rtx sel = operands[3];
>    rtx tmp = target;
> -  rtx splatreg = gen_reg_rtx (V16QImode);
> +  rtx norreg = gen_reg_rtx (V16QImode);
>    enum machine_mode mode = GET_MODE (target);
>  
>    /* Get everything in regs so the pattern matches.  */
> @@ -30359,18 +30357,14 @@ altivec_expand_vec_perm_le (rtx operands[4])
>    if (!REG_P (target))
>      tmp = gen_reg_rtx (mode);
>  
> -  /* SEL = splat(31) - SEL.  */
> -  /* We want to subtract from 31, but we can't vspltisb 31 since
> -     it's out of range.  -1 works as well because only the low-order
> -     five bits of the permute control vector elements are used.  */
> -  splat = gen_rtx_VEC_DUPLICATE (V16QImode,
> -				 gen_rtx_CONST_INT (QImode, -1));
> -  emit_move_insn (splatreg, splat);
> -  sel = gen_rtx_MINUS (V16QImode, splatreg, sel);
> -  emit_move_insn (splatreg, sel);
> +  /* Invert the selector with a VNOR.  */
> +  notx = gen_rtx_NOT (V16QImode, sel);
> +  andx = gen_rtx_AND (V16QImode, notx, notx);
> +  emit_insn (gen_rtx_SET (VOIDmode, norreg, andx));
>  
>    /* Permute with operands reversed and adjusted selector.  */
> -  unspec = gen_rtx_UNSPEC (mode, gen_rtvec (3, op1, op0, splatreg), UNSPEC_VPERM);
> +  unspec = gen_rtx_UNSPEC (mode, gen_rtvec (3, op1, op0, norreg),
> +			   UNSPEC_VPERM);
>  
>    /* Copy into target, possibly by way of a register.  */
>    if (!REG_P (target))
>
diff mbox

Patch

Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 209122)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -5621,12 +5621,10 @@  rs6000_expand_vector_set (rtx target, rtx val, int
   else 
     {
       /* Invert selector.  */
-      rtx splat = gen_rtx_VEC_DUPLICATE (V16QImode,
-					 gen_rtx_CONST_INT (QImode, -1));
+      rtx notx = gen_rtx_NOT (V16QImode, force_reg (V16QImode, x));
+      rtx andx = gen_rtx_AND (V16QImode, notx, notx);
       rtx tmp = gen_reg_rtx (V16QImode);
-      emit_move_insn (tmp, splat);
-      x = gen_rtx_MINUS (V16QImode, tmp, force_reg (V16QImode, x));
-      emit_move_insn (tmp, x);
+      emit_insn (gen_rtx_SET (VOIDmode, tmp, andx));
 
       /* Permute with operands reversed and adjusted selector.  */
       x = gen_rtx_UNSPEC (mode, gen_rtvec (3, reg, target, tmp),
@@ -30335,18 +30333,18 @@  altivec_expand_vec_perm_const_le (rtx operands[4])
 
 /* Similarly to altivec_expand_vec_perm_const_le, we must adjust the
    permute control vector.  But here it's not a constant, so we must
-   generate a vector splat/subtract to do the adjustment.  */
+   generate a vector NOR to do the adjustment.  */
 
 void
 altivec_expand_vec_perm_le (rtx operands[4])
 {
-  rtx splat, unspec;
+  rtx notx, andx, unspec;
   rtx target = operands[0];
   rtx op0 = operands[1];
   rtx op1 = operands[2];
   rtx sel = operands[3];
   rtx tmp = target;
-  rtx splatreg = gen_reg_rtx (V16QImode);
+  rtx norreg = gen_reg_rtx (V16QImode);
   enum machine_mode mode = GET_MODE (target);
 
   /* Get everything in regs so the pattern matches.  */
@@ -30359,18 +30357,14 @@  altivec_expand_vec_perm_le (rtx operands[4])
   if (!REG_P (target))
     tmp = gen_reg_rtx (mode);
 
-  /* SEL = splat(31) - SEL.  */
-  /* We want to subtract from 31, but we can't vspltisb 31 since
-     it's out of range.  -1 works as well because only the low-order
-     five bits of the permute control vector elements are used.  */
-  splat = gen_rtx_VEC_DUPLICATE (V16QImode,
-				 gen_rtx_CONST_INT (QImode, -1));
-  emit_move_insn (splatreg, splat);
-  sel = gen_rtx_MINUS (V16QImode, splatreg, sel);
-  emit_move_insn (splatreg, sel);
+  /* Invert the selector with a VNOR.  */
+  notx = gen_rtx_NOT (V16QImode, sel);
+  andx = gen_rtx_AND (V16QImode, notx, notx);
+  emit_insn (gen_rtx_SET (VOIDmode, norreg, andx));
 
   /* Permute with operands reversed and adjusted selector.  */
-  unspec = gen_rtx_UNSPEC (mode, gen_rtvec (3, op1, op0, splatreg), UNSPEC_VPERM);
+  unspec = gen_rtx_UNSPEC (mode, gen_rtvec (3, op1, op0, norreg),
+			   UNSPEC_VPERM);
 
   /* Copy into target, possibly by way of a register.  */
   if (!REG_P (target))