Patchwork catch builtin_bswap16 construct

login
register
mail settings
Submitter Christophe Lyon
Date Sept. 19, 2012, 4:22 p.m.
Message ID <CAKdteOYFyVqWNdK78fS+xVqMBKgq4Cun0DUoE=vV=qpoOaH_qw@mail.gmail.com>
Download mbox | patch
Permalink /patch/185104/
State New
Headers show

Comments

Christophe Lyon - Sept. 19, 2012, 4:22 p.m.
Hi,

The attached patch catches C constructs:
(A << 8) | (A >> 8)
where A is unsigned 16 bits
and maps them to builtin_bswap16(A) which can provide more efficient
implementations on some targets.

The construct above is equivalent to the default bswap16 implementation.

I have added a testcase for ARM, and have found no regression with
qemu-arm on arm-none-linux-gnueabi.

OK?

Christophe
2012-09-19  Christophe Lyon <christophe.lyon@linaro.org>

	gcc/
	* fold-const.c (fold_binary_loc): call builtin_bswap16 when the
	equivalent construct is detected.

	gcc/testsuite/
	* gcc.target/arm/builtin-bswap-2.c: New testcase.
Georg-Johann Lay - Sept. 19, 2012, 4:44 p.m.
Christophe Lyon wrote:
> Hi,
> 
> The attached patch catches C constructs:
> (A << 8) | (A >> 8)
> where A is unsigned 16 bits
> and maps them to builtin_bswap16(A) which can provide more efficient
> implementations on some targets.
> 
> The construct above is equivalent to the default bswap16 implementation.
> 
> I have added a testcase for ARM, and have found no regression with
> qemu-arm on arm-none-linux-gnueabi.
> 
> OK?
> 
> Christophe
> 
> 2012-09-19  Christophe Lyon <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* fold-const.c (fold_binary_loc): call builtin_bswap16 when the
> 	equivalent construct is detected.
> 
> 	gcc/testsuite/
> 	* gcc.target/arm/builtin-bswap-2.c: New testcase.
> 
> diff --git a/gcc/fold-const.c b/gcc/fold-const.c
> index 2bf5179..0ff7e8b 100644
> --- a/gcc/fold-const.c
> +++ b/gcc/fold-const.c
> @@ -10326,6 +10326,99 @@ fold_binary_loc (location_t loc,
>  	  }
>        }
>  
> +      /* Catch bswap16 construct: (A << 8) | (A >> 8) where A is
> +         unsigned 16 bits.
> +         This has been expanded into:
> +	 (ior:SI (lshift:SI (nop:SI A:HI) 8)
> +	         (nop:SI (rshift:HI A:HI 8)))
> +      */

This seems overly complicated on 16-bit platforms where (A << 8) | (A >> 8)
is the same as rotate:HI (A 8).

Does your patch make sure that (A << 8) | (A >> 8) is still mapped to ROTATE
on these targets?

Johann



> +      {
> +	enum tree_code code0, code1;
> +	tree my_arg0 = arg0;
> +	tree my_arg1= arg1;
> +	tree rtype;
> +
> +	code0 = TREE_CODE (arg0);
> +	code1 = TREE_CODE (arg1);
> +	if (code1 == NOP_EXPR)
> +	  {
> +	    my_arg1 = TREE_OPERAND (arg1, 0);
> +	    code1 = TREE_CODE (my_arg1);
> +	  }
> +	else if (code0 == NOP_EXPR)
> +	  {
> +	    my_arg0 = TREE_OPERAND (arg0, 0);
> +	    code0 = TREE_CODE (my_arg0);
> +	  }
> +
> +	/* Handle (A << C1) + (A >> C1).  */
> +	if ((code1 == RSHIFT_EXPR && code0 == LSHIFT_EXPR)
> +	     && (TREE_CODE (TREE_OPERAND (my_arg0, 0)) == NOP_EXPR)
> +	     && operand_equal_p (TREE_OPERAND (TREE_OPERAND (my_arg0, 0), 0),
> +				 TREE_OPERAND (my_arg1, 0), 0)
> +	     && (rtype = TREE_TYPE (TREE_OPERAND (my_arg1, 0)),
> +		 TYPE_UNSIGNED (rtype)))
> +	  {
> +	    tree tree01, tree11;
> +	    enum tree_code code01, code11;
> +
> +	    tree01 = TREE_OPERAND (my_arg0, 1);
> +	    tree11 = TREE_OPERAND (my_arg1, 1);
> +	    STRIP_NOPS (tree01);
> +	    STRIP_NOPS (tree11);
> +	    code01 = TREE_CODE (tree01);
> +	    code11 = TREE_CODE (tree11);
> +
> +	    /* Check that shift amount is 8, and input 16 bits wide.  */
> +	    if (code01 == INTEGER_CST
> +		&& code11 == INTEGER_CST
> +		&& TREE_INT_CST_HIGH (tree01) == 0
> +		&& TREE_INT_CST_HIGH (tree11) == 0
> +		&& TREE_INT_CST_LOW (tree01) == TREE_INT_CST_LOW (tree11)
> +		&& TREE_INT_CST_LOW (tree01) == 8
> +		&& TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (my_arg1, 0))) == 16)
> +	      {
> +		tree bswapfn = builtin_decl_explicit (BUILT_IN_BSWAP16);
> +		return build_call_expr_loc (loc, bswapfn, 1,
> +					    TREE_OPERAND (my_arg1, 0));
> +	      }
> +	  }
> +
> +	/* Handle (A >> C1) + (A << C1).  */
> +	else if ((code0 == RSHIFT_EXPR && code1 == LSHIFT_EXPR)
> +		 && (TREE_CODE (TREE_OPERAND (my_arg1, 0)) == NOP_EXPR)
> +		 && operand_equal_p (TREE_OPERAND (TREE_OPERAND (my_arg1, 0),
> +						   0),
> +				     TREE_OPERAND (my_arg0, 0), 0)
> +		 && (rtype = TREE_TYPE (TREE_OPERAND (my_arg0, 0)),
> +		     TYPE_UNSIGNED (rtype)))
> +	  {
> +	    tree tree01, tree11;
> +	    enum tree_code code01, code11;
> +
> +	    tree01 = TREE_OPERAND (my_arg0, 1);
> +	    tree11 = TREE_OPERAND (my_arg1, 1);
> +	    STRIP_NOPS (tree01);
> +	    STRIP_NOPS (tree11);
> +	    code01 = TREE_CODE (tree01);
> +	    code11 = TREE_CODE (tree11);
> +
> +	    /* Check that shift amount is 8, and input 16 bits wide.  */
> +	    if (code01 == INTEGER_CST
> +		&& code11 == INTEGER_CST
> +		&& TREE_INT_CST_HIGH (tree01) == 0
> +		&& TREE_INT_CST_HIGH (tree11) == 0
> +		&& TREE_INT_CST_LOW (tree01) == TREE_INT_CST_LOW (tree11)
> +		&& TREE_INT_CST_LOW (tree01) == 8
> +		&& TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (my_arg0, 0))) == 16)
> +	      {
> +		tree bswapfn = builtin_decl_explicit (BUILT_IN_BSWAP16);
> +		return build_call_expr_loc (loc, bswapfn, 1,
> +					    TREE_OPERAND (my_arg0, 0));
> +	      }
> +	  }
> +      }
> +
>      associate:
>        /* In most languages, can't associate operations on floats through
>  	 parentheses.  Rather than remember where the parentheses were, we
> diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c b/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c
> new file mode 100644
> index 0000000..93dbb35
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-require-effective-target arm_arch_v6_ok } */
> +/* { dg-add-options arm_arch_v6 } */
> +/* { dg-final { scan-assembler-not "orr\[ \t\]" } } */
> +
> +unsigned short swapu16_1 (unsigned short x)
> +{
> +  return (x << 8) | (x >> 8);
> +}
> +
> +unsigned short swapu16_2 (unsigned short x)
> +{
> +  return (x >> 8) | (x << 8);
> +}
> +
> +unsigned int swapu32_1 (unsigned int x)
> +{
> +  return (x << 16) | (x >> 16);
> +}
> +
> +unsigned int swapu32_2 (unsigned int x)
> +{
> +  return (x >> 16) | (x << 16);
> +}
Oleg Endo - Sept. 19, 2012, 5:28 p.m.
On Wed, 2012-09-19 at 18:44 +0200, Georg-Johann Lay wrote:

> This seems overly complicated on 16-bit platforms where (A << 8) | (A >> 8)
> is the same as rotate:HI (A 8).
> 
> Does your patch make sure that (A << 8) | (A >> 8) is still mapped to ROTATE
> on these targets?
> 

If I remember correctly, the bswap16 builtin expansion will try to use a
rotate pattern, if no HImode bswap is present.

Cheers,
Oleg
Eric Botcazou - Sept. 20, 2012, 7:12 a.m.
> The attached patch catches C constructs:
> (A << 8) | (A >> 8)
> where A is unsigned 16 bits
> and maps them to builtin_bswap16(A) which can provide more efficient
> implementations on some targets.

This belongs in tree-ssa-math-opts.c:execute_optimize_bswap instead.

When I implemented __builtin_bswap16, I didn't add this because I thought this 
would be overkill since the RTL combiner should be able to catch the pattern.
Have you investigated on this front?  But I don't have a strong opinion.
Christophe Lyon - Sept. 20, 2012, 7:55 a.m.
On 20 September 2012 09:12, Eric Botcazou <ebotcazou@adacore.com> wrote:
>> The attached patch catches C constructs:
>> (A << 8) | (A >> 8)
>> where A is unsigned 16 bits
>> and maps them to builtin_bswap16(A) which can provide more efficient
>> implementations on some targets.
>
> This belongs in tree-ssa-math-opts.c:execute_optimize_bswap instead.
>
OK I'll have a look at that. Actually I modified fold-const.c because
it's here that the similar 32-bit pattern is turned into a rotate.

> When I implemented __builtin_bswap16, I didn't add this because I thought this
> would be overkill since the RTL combiner should be able to catch the pattern.
> Have you investigated on this front?  But I don't have a strong opinion.
>
No I didn't. As I said above, I looked at where the 32-bit pattern
was handled and added the 16-bit one.

Christophe.
Oleg Endo - Sept. 23, 2012, 10:49 p.m.
On Thu, 2012-09-20 at 09:12 +0200, Eric Botcazou wrote:
> > The attached patch catches C constructs:
> > (A << 8) | (A >> 8)
> > where A is unsigned 16 bits
> > and maps them to builtin_bswap16(A) which can provide more efficient
> > implementations on some targets.
> 
> This belongs in tree-ssa-math-opts.c:execute_optimize_bswap instead.
> 
> When I implemented __builtin_bswap16, I didn't add this because I thought this 
> would be overkill since the RTL combiner should be able to catch the pattern.
> Have you investigated on this front?  But I don't have a strong opinion.
> 

A while ago I've tried doing that for SH (implementing bswap16 with RTL
combine).  It was like an explosion of patterns, because combine would
try out a lot of things depending on the surrounding code around the
actual bswap16.  In the end I decided to drop that stuff for the most
part.

BTW, the built-in documentation says:

Built-in Function: int16_t __builtin_bswap16 (int16_t x)
Built-in Function: int32_t __builtin_bswap32 (int32_t x)

However, it seems the result is always unsigned for those.
At least on SH I get the following:

int test (short x)
{
  return __builtin_bswap16 (x);
}

swap.b  r4,r4	! 8	*rotlhi3_8
rts		! 24	*return_i
extu.w	r4,r0	! 9	*zero_extendhisi2_compact

... and similarly for int32.
Can anyone else confirm this?

Cheers,
Oleg
Eric Botcazou - Sept. 24, 2012, 7:15 a.m.
> A while ago I've tried doing that for SH (implementing bswap16 with RTL
> combine).  It was like an explosion of patterns, because combine would
> try out a lot of things depending on the surrounding code around the
> actual bswap16.  In the end I decided to drop that stuff for the most
> part.

OK, thanks for your feedback.  Note that you won't get the benefit of the 
optimization for SH if you don't define a bswaphi2 pattern.

> BTW, the built-in documentation says:
> 
> Built-in Function: int16_t __builtin_bswap16 (int16_t x)
> Built-in Function: int32_t __builtin_bswap32 (int32_t x)
> 
> However, it seems the result is always unsigned for those.

Yes, builtins.def contains:

DEF_GCC_BUILTIN        (BUILT_IN_BSWAP16, "bswap16", BT_FN_UINT16_UINT16, 
ATTR_CONST_NOTHROW_LEAF_LIST)
DEF_GCC_BUILTIN        (BUILT_IN_BSWAP32, "bswap32", BT_FN_UINT32_UINT32, 
ATTR_CONST_NOTHROW_LEAF_LIST)
DEF_GCC_BUILTIN        (BUILT_IN_BSWAP64, "bswap64", BT_FN_UINT64_UINT64, 
ATTR_CONST_NOTHROW_LEAF_LIST)

The documentation indeed needs to be fixed.
Oleg Endo - Sept. 24, 2012, 7:47 a.m.
On Mon, 2012-09-24 at 09:15 +0200, Eric Botcazou wrote:
> > A while ago I've tried doing that for SH (implementing bswap16 with RTL
> > combine).  It was like an explosion of patterns, because combine would
> > try out a lot of things depending on the surrounding code around the
> > actual bswap16.  In the end I decided to drop that stuff for the most
> > part.
> 
> OK, thanks for your feedback.  Note that you won't get the benefit of the 
> optimization for SH if you don't define a bswaphi2 pattern.

Sure sure.  I'll resume my efforts after Christophe's patch is in.

> 
> > BTW, the built-in documentation says:
> > 
> > Built-in Function: int16_t __builtin_bswap16 (int16_t x)
> > Built-in Function: int32_t __builtin_bswap32 (int32_t x)
> > 
> > However, it seems the result is always unsigned for those.
> 
> Yes, builtins.def contains:
> 
> DEF_GCC_BUILTIN        (BUILT_IN_BSWAP16, "bswap16", BT_FN_UINT16_UINT16, 
> ATTR_CONST_NOTHROW_LEAF_LIST)
> DEF_GCC_BUILTIN        (BUILT_IN_BSWAP32, "bswap32", BT_FN_UINT32_UINT32, 
> ATTR_CONST_NOTHROW_LEAF_LIST)
> DEF_GCC_BUILTIN        (BUILT_IN_BSWAP64, "bswap64", BT_FN_UINT64_UINT64, 
> ATTR_CONST_NOTHROW_LEAF_LIST)
> 
> The documentation indeed needs to be fixed.
> 

OK, I'll check it out.

Cheers,
Oleg

Patch

diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index 2bf5179..0ff7e8b 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -10326,6 +10326,99 @@  fold_binary_loc (location_t loc,
 	  }
       }
 
+      /* Catch bswap16 construct: (A << 8) | (A >> 8) where A is
+         unsigned 16 bits.
+         This has been expanded into:
+	 (ior:SI (lshift:SI (nop:SI A:HI) 8)
+	         (nop:SI (rshift:HI A:HI 8)))
+      */
+      {
+	enum tree_code code0, code1;
+	tree my_arg0 = arg0;
+	tree my_arg1= arg1;
+	tree rtype;
+
+	code0 = TREE_CODE (arg0);
+	code1 = TREE_CODE (arg1);
+	if (code1 == NOP_EXPR)
+	  {
+	    my_arg1 = TREE_OPERAND (arg1, 0);
+	    code1 = TREE_CODE (my_arg1);
+	  }
+	else if (code0 == NOP_EXPR)
+	  {
+	    my_arg0 = TREE_OPERAND (arg0, 0);
+	    code0 = TREE_CODE (my_arg0);
+	  }
+
+	/* Handle (A << C1) + (A >> C1).  */
+	if ((code1 == RSHIFT_EXPR && code0 == LSHIFT_EXPR)
+	     && (TREE_CODE (TREE_OPERAND (my_arg0, 0)) == NOP_EXPR)
+	     && operand_equal_p (TREE_OPERAND (TREE_OPERAND (my_arg0, 0), 0),
+				 TREE_OPERAND (my_arg1, 0), 0)
+	     && (rtype = TREE_TYPE (TREE_OPERAND (my_arg1, 0)),
+		 TYPE_UNSIGNED (rtype)))
+	  {
+	    tree tree01, tree11;
+	    enum tree_code code01, code11;
+
+	    tree01 = TREE_OPERAND (my_arg0, 1);
+	    tree11 = TREE_OPERAND (my_arg1, 1);
+	    STRIP_NOPS (tree01);
+	    STRIP_NOPS (tree11);
+	    code01 = TREE_CODE (tree01);
+	    code11 = TREE_CODE (tree11);
+
+	    /* Check that shift amount is 8, and input 16 bits wide.  */
+	    if (code01 == INTEGER_CST
+		&& code11 == INTEGER_CST
+		&& TREE_INT_CST_HIGH (tree01) == 0
+		&& TREE_INT_CST_HIGH (tree11) == 0
+		&& TREE_INT_CST_LOW (tree01) == TREE_INT_CST_LOW (tree11)
+		&& TREE_INT_CST_LOW (tree01) == 8
+		&& TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (my_arg1, 0))) == 16)
+	      {
+		tree bswapfn = builtin_decl_explicit (BUILT_IN_BSWAP16);
+		return build_call_expr_loc (loc, bswapfn, 1,
+					    TREE_OPERAND (my_arg1, 0));
+	      }
+	  }
+
+	/* Handle (A >> C1) + (A << C1).  */
+	else if ((code0 == RSHIFT_EXPR && code1 == LSHIFT_EXPR)
+		 && (TREE_CODE (TREE_OPERAND (my_arg1, 0)) == NOP_EXPR)
+		 && operand_equal_p (TREE_OPERAND (TREE_OPERAND (my_arg1, 0),
+						   0),
+				     TREE_OPERAND (my_arg0, 0), 0)
+		 && (rtype = TREE_TYPE (TREE_OPERAND (my_arg0, 0)),
+		     TYPE_UNSIGNED (rtype)))
+	  {
+	    tree tree01, tree11;
+	    enum tree_code code01, code11;
+
+	    tree01 = TREE_OPERAND (my_arg0, 1);
+	    tree11 = TREE_OPERAND (my_arg1, 1);
+	    STRIP_NOPS (tree01);
+	    STRIP_NOPS (tree11);
+	    code01 = TREE_CODE (tree01);
+	    code11 = TREE_CODE (tree11);
+
+	    /* Check that shift amount is 8, and input 16 bits wide.  */
+	    if (code01 == INTEGER_CST
+		&& code11 == INTEGER_CST
+		&& TREE_INT_CST_HIGH (tree01) == 0
+		&& TREE_INT_CST_HIGH (tree11) == 0
+		&& TREE_INT_CST_LOW (tree01) == TREE_INT_CST_LOW (tree11)
+		&& TREE_INT_CST_LOW (tree01) == 8
+		&& TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (my_arg0, 0))) == 16)
+	      {
+		tree bswapfn = builtin_decl_explicit (BUILT_IN_BSWAP16);
+		return build_call_expr_loc (loc, bswapfn, 1,
+					    TREE_OPERAND (my_arg0, 0));
+	      }
+	  }
+      }
+
     associate:
       /* In most languages, can't associate operations on floats through
 	 parentheses.  Rather than remember where the parentheses were, we
diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c b/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c
new file mode 100644
index 0000000..93dbb35
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target arm_arch_v6_ok } */
+/* { dg-add-options arm_arch_v6 } */
+/* { dg-final { scan-assembler-not "orr\[ \t\]" } } */
+
+unsigned short swapu16_1 (unsigned short x)
+{
+  return (x << 8) | (x >> 8);
+}
+
+unsigned short swapu16_2 (unsigned short x)
+{
+  return (x >> 8) | (x << 8);
+}
+
+unsigned int swapu32_1 (unsigned int x)
+{
+  return (x << 16) | (x >> 16);
+}
+
+unsigned int swapu32_2 (unsigned int x)
+{
+  return (x >> 16) | (x << 16);
+}