
[AArch64,2/5] Improve immediate generation

Message ID 000a01d0e57b$d435d420$7ca17c60$@com
State New

Commit Message

Wilco Sept. 2, 2015, 12:35 p.m. UTC
aarch64_internal_mov_immediate uses loops that iterate over all legal bitmask immediates to find
2-instruction immediate combinations. One loop is quadratic and, despite being extremely expensive,
very rarely finds a matching immediate (43 matches in all of SPEC2006, but none are emitted in the
final code), so it can be removed without any effect on code quality. The other loop can be
replaced by a constant-time search: rather than iterating over all legal bitmask values,
reconstruct a candidate bitmask directly and query the fast aarch64_bitmask_imm.

No change in generated code; passes the GCC regression tests and bootstrap.
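
For reference, here is a minimal standalone sketch of the constant-time
search (illustrative only, not the patch itself; is_bitmask_imm is a
hypothetical stand-in for GCC's aarch64_bitmask_imm):

    #include <stdint.h>
    #include <stdbool.h>

    /* Hypothetical stand-in for aarch64_bitmask_imm: accepts exactly the
       values encodable as 64-bit logical (bitmask) immediates.  */
    extern bool is_bitmask_imm (uint64_t val);

    /* Return the offset (0/16/32/48) of a 16-bit chunk that a MOVK can
       patch so that the remainder is a valid bitmask immediate, storing
       that bitmask in *bitmask; return -1 if no such chunk exists.  */
    static int
    find_bitmask_movk (uint64_t val, uint64_t *bitmask)
    {
      uint64_t mask = 0xffff;
      for (int i = 0; i < 64; i += 16, mask <<= 16)
        {
          /* Try the chunk as all zeroes.  */
          uint64_t val2 = val & ~mask;
          if (val2 != val && is_bitmask_imm (val2))
            { *bitmask = val2; return i; }
          /* Try the chunk as all ones.  */
          val2 = val | mask;
          if (val2 != val && is_bitmask_imm (val2))
            { *bitmask = val2; return i; }
          /* Try the 16 bits from the other 32-bit half, which handles
             bitmasks that repeat every 32 bits.  */
          val2 &= ~mask;
          val2 |= ((val2 >> 32) | (val2 << 32)) & mask;
          if (val2 != val && is_bitmask_imm (val2))
            { *bitmask = val2; return i; }
        }
      return -1;
    }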

ChangeLog:
2015-09-02  Wilco Dijkstra  <wdijkstr@arm.com>

	* gcc/config/aarch64/aarch64.c (aarch64_internal_mov_immediate):
	Replace slow immediate matching loops with a faster algorithm.

---
 gcc/config/aarch64/aarch64.c | 96 +++++++++++---------------------------------
 1 file changed, 23 insertions(+), 73 deletions(-)

Comments

James Greenhalgh Sept. 18, 2015, 2:26 p.m. UTC | #1
On Wed, Sep 02, 2015 at 01:35:19PM +0100, Wilco Dijkstra wrote:
> aarch64_internal_mov_immediate uses loops that iterate over all legal
> bitmask immediates to find 2-instruction immediate combinations. One loop
> is quadratic and, despite being extremely expensive, very rarely finds a
> matching immediate (43 matches in all of SPEC2006, but none are emitted in
> the final code), so it can be removed without any effect on code quality.
> The other loop can be replaced by a constant-time search: rather than
> iterating over all legal bitmask values, reconstruct a candidate bitmask
> directly and query the fast aarch64_bitmask_imm.
> 
> No change in generated code; passes the GCC regression tests and bootstrap.

Well, presumably those 43 cases in SPEC2006 changed...

The code looks OK, and I haven't seen any objections from Marcus or
Richard on the overall direction of the patch set.

OK for trunk.

Thanks,
James


Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c0280e6..d6f7cb0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1376,7 +1376,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
   unsigned HOST_WIDE_INT mask;
   int i;
   bool first;
-  unsigned HOST_WIDE_INT val;
+  unsigned HOST_WIDE_INT val, val2;
   bool subtargets;
   rtx subtarget;
   int one_match, zero_match, first_not_ffff_match;
@@ -1503,85 +1503,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 	}
     }
 
-  /* See if we can do it by arithmetically combining two
-     immediates.  */
-  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
+  if (zero_match != 2 && one_match != 2)
     {
-      int j;
-      mask = 0xffff;
+      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
+	 For a 64-bit bitmask try whether changing 16 bits to all ones or
+	 zeroes creates a valid bitmask.  To check any repeated bitmask,
+	 try using 16 bits from the other 32-bit half of val.  */
 
-      if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
-	  || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
+      for (i = 0; i < 64; i += 16, mask <<= 16)
 	{
-	  if (generate)
-	    {
-	      subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
-	      emit_insn (gen_rtx_SET (subtarget,
-				      GEN_INT (aarch64_bitmasks[i])));
-	      emit_insn (gen_adddi3 (dest, subtarget,
-				     GEN_INT (val - aarch64_bitmasks[i])));
-	    }
-	  num_insns += 2;
-	  return num_insns;
+	  val2 = val & ~mask;
+	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
+	    break;
+	  val2 = val | mask;
+	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
+	    break;
+	  val2 = val2 & ~mask;
+	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
+	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
+	    break;
 	}
-
-      for (j = 0; j < 64; j += 16, mask <<= 16)
+      if (i != 64)
 	{
-	  if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
+	  if (generate)
 	    {
-	      if (generate)
-		{
-		  emit_insn (gen_rtx_SET (dest,
-					  GEN_INT (aarch64_bitmasks[i])));
-		  emit_insn (gen_insv_immdi (dest, GEN_INT (j),
-					     GEN_INT ((val >> j) & 0xffff)));
-		}
-	      num_insns += 2;
-	      return num_insns;
+	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+			 GEN_INT ((val >> i) & 0xffff)));
 	    }
-	}
-    }
-
-  /* See if we can do it by logically combining two immediates.  */
-  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
-    {
-      if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
-	{
-	  int j;
-
-	  for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
-	    if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
-	      {
-		if (generate)
-		  {
-		    subtarget = subtargets ? gen_reg_rtx (mode) : dest;
-		    emit_insn (gen_rtx_SET (subtarget,
-					    GEN_INT (aarch64_bitmasks[i])));
-		    emit_insn (gen_iordi3 (dest, subtarget,
-					   GEN_INT (aarch64_bitmasks[j])));
-		  }
-		num_insns += 2;
-		return num_insns;
-	      }
-	}
-      else if ((val & aarch64_bitmasks[i]) == val)
-	{
-	  int j;
-
-	  for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
-	    if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
-	      {
-		if (generate)
-		  {
-		    subtarget = subtargets ? gen_reg_rtx (mode) : dest;
-		    emit_insn (gen_rtx_SET (subtarget,
-					    GEN_INT (aarch64_bitmasks[j])));
-		    emit_insn (gen_anddi3 (dest, subtarget,
-					   GEN_INT (aarch64_bitmasks[i])));
-		  }
-		num_insns += 2;
-		return num_insns;
-	      }
+	  return 2;
 	}
     }
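
As a worked example (illustrative, not from the patch): val =
0x1234cccccccccccc contains no 0x0000 or 0xffff chunk, so the new loop
runs. At i == 48 the third test succeeds: replacing the top chunk with
the corresponding 16 bits of the other 32-bit half yields
0xcccccccccccccccc, a valid repeating bitmask immediate. The emitted
sequence is therefore a MOV of 0xcccccccccccccccc followed by
MOVK #0x1234, lsl #48 -- two instructions found with at most twelve
aarch64_bitmask_imm queries instead of a scan over the whole immediate
table.

    /* In terms of the sketch above: find_bitmask_movk (0x1234cccccccccccc,
       &bm) returns 48 with bm == 0xcccccccccccccccc.  */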