Message ID | CAKdteOYFyVqWNdK78fS+xVqMBKgq4Cun0DUoE=vV=qpoOaH_qw@mail.gmail.com |
---|---|
State | New |
Headers | show |
Christophe Lyon wrote: > Hi, > > The attached patch catches C constructs: > (A << 8) | (A >> 8) > where A is unsigned 16 bits > and maps them to builtin_bswap16(A) which can provide more efficient > implementations on some targets. > > The construct above is equivalent to the default bswap16 implementation. > > I have added a testcase for ARM, and have found no regression with > qemu-arm on arm-none-linux-gnueabi. > > OK? > > Christophe > > 2012-09-19 Christophe Lyon <christophe.lyon@linaro.org> > > gcc/ > * fold-const.c (fold_binary_loc): call builtin_bswap16 when the > equivalent construct is detected. > > gcc/testsuite/ > * gcc.target/arm/builtin-bswap-2.c: New testcase. > > diff --git a/gcc/fold-const.c b/gcc/fold-const.c > index 2bf5179..0ff7e8b 100644 > --- a/gcc/fold-const.c > +++ b/gcc/fold-const.c > @@ -10326,6 +10326,99 @@ fold_binary_loc (location_t loc, > } > } > > + /* Catch bswap16 construct: (A << 8) | (A >> 8) where A is > + unsigned 16 bits. > + This has been expanded into: > + (ior:SI (lshift:SI (nop:SI A:HI) 8) > + (nop:SI (rshift:HI A:HI 8))) > + */ This seems overly complicated on 16-bit platforms where (A << 8) | (A >> 8) is the same as rotate:HI (A 8). Does your patch make sure that (A << 8) | (A >> 8) is still mapped to ROTATE on these targets? Johann > + { > + enum tree_code code0, code1; > + tree my_arg0 = arg0; > + tree my_arg1= arg1; > + tree rtype; > + > + code0 = TREE_CODE (arg0); > + code1 = TREE_CODE (arg1); > + if (code1 == NOP_EXPR) > + { > + my_arg1 = TREE_OPERAND (arg1, 0); > + code1 = TREE_CODE (my_arg1); > + } > + else if (code0 == NOP_EXPR) > + { > + my_arg0 = TREE_OPERAND (arg0, 0); > + code0 = TREE_CODE (my_arg0); > + } > + > + /* Handle (A << C1) + (A >> C1). */ > + if ((code1 == RSHIFT_EXPR && code0 == LSHIFT_EXPR) > + && (TREE_CODE (TREE_OPERAND (my_arg0, 0)) == NOP_EXPR) > + && operand_equal_p (TREE_OPERAND (TREE_OPERAND (my_arg0, 0), 0), > + TREE_OPERAND (my_arg1, 0), 0) > + && (rtype = TREE_TYPE (TREE_OPERAND (my_arg1, 0)), > + TYPE_UNSIGNED (rtype))) > + { > + tree tree01, tree11; > + enum tree_code code01, code11; > + > + tree01 = TREE_OPERAND (my_arg0, 1); > + tree11 = TREE_OPERAND (my_arg1, 1); > + STRIP_NOPS (tree01); > + STRIP_NOPS (tree11); > + code01 = TREE_CODE (tree01); > + code11 = TREE_CODE (tree11); > + > + /* Check that shift amount is 8, and input 16 bits wide. */ > + if (code01 == INTEGER_CST > + && code11 == INTEGER_CST > + && TREE_INT_CST_HIGH (tree01) == 0 > + && TREE_INT_CST_HIGH (tree11) == 0 > + && TREE_INT_CST_LOW (tree01) == TREE_INT_CST_LOW (tree11) > + && TREE_INT_CST_LOW (tree01) == 8 > + && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (my_arg1, 0))) == 16) > + { > + tree bswapfn = builtin_decl_explicit (BUILT_IN_BSWAP16); > + return build_call_expr_loc (loc, bswapfn, 1, > + TREE_OPERAND (my_arg1, 0)); > + } > + } > + > + /* Handle (A >> C1) + (A << C1). */ > + else if ((code0 == RSHIFT_EXPR && code1 == LSHIFT_EXPR) > + && (TREE_CODE (TREE_OPERAND (my_arg1, 0)) == NOP_EXPR) > + && operand_equal_p (TREE_OPERAND (TREE_OPERAND (my_arg1, 0), > + 0), > + TREE_OPERAND (my_arg0, 0), 0) > + && (rtype = TREE_TYPE (TREE_OPERAND (my_arg0, 0)), > + TYPE_UNSIGNED (rtype))) > + { > + tree tree01, tree11; > + enum tree_code code01, code11; > + > + tree01 = TREE_OPERAND (my_arg0, 1); > + tree11 = TREE_OPERAND (my_arg1, 1); > + STRIP_NOPS (tree01); > + STRIP_NOPS (tree11); > + code01 = TREE_CODE (tree01); > + code11 = TREE_CODE (tree11); > + > + /* Check that shift amount is 8, and input 16 bits wide. */ > + if (code01 == INTEGER_CST > + && code11 == INTEGER_CST > + && TREE_INT_CST_HIGH (tree01) == 0 > + && TREE_INT_CST_HIGH (tree11) == 0 > + && TREE_INT_CST_LOW (tree01) == TREE_INT_CST_LOW (tree11) > + && TREE_INT_CST_LOW (tree01) == 8 > + && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (my_arg0, 0))) == 16) > + { > + tree bswapfn = builtin_decl_explicit (BUILT_IN_BSWAP16); > + return build_call_expr_loc (loc, bswapfn, 1, > + TREE_OPERAND (my_arg0, 0)); > + } > + } > + } > + > associate: > /* In most languages, can't associate operations on floats through > parentheses. Rather than remember where the parentheses were, we > diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c b/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c > new file mode 100644 > index 0000000..93dbb35 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-require-effective-target arm_arch_v6_ok } */ > +/* { dg-add-options arm_arch_v6 } */ > +/* { dg-final { scan-assembler-not "orr\[ \t\]" } } */ > + > +unsigned short swapu16_1 (unsigned short x) > +{ > + return (x << 8) | (x >> 8); > +} > + > +unsigned short swapu16_2 (unsigned short x) > +{ > + return (x >> 8) | (x << 8); > +} > + > +unsigned int swapu32_1 (unsigned int x) > +{ > + return (x << 16) | (x >> 16); > +} > + > +unsigned int swapu32_2 (unsigned int x) > +{ > + return (x >> 16) | (x << 16); > +}
On Wed, 2012-09-19 at 18:44 +0200, Georg-Johann Lay wrote: > This seems overly complicated on 16-bit platforms where (A << 8) | (A >> 8) > is the same as rotate:HI (A 8). > > Does your patch make sure that (A << 8) | (A >> 8) is still mapped to ROTATE > on these targets? > If I remember correctly, the bswap16 builtin expansion will try to use a rotate pattern, if no HImode bswap is present. Cheers, Oleg
> The attached patch catches C constructs: > (A << 8) | (A >> 8) > where A is unsigned 16 bits > and maps them to builtin_bswap16(A) which can provide more efficient > implementations on some targets. This belongs in tree-ssa-math-opts.c:execute_optimize_bswap instead. When I implemented __builtin_bswap16, I didn't add this because I thought this would be overkill since the RTL combiner should be able to catch the pattern. Have you investigated on this front? But I don't have a strong opinion.
On 20 September 2012 09:12, Eric Botcazou <ebotcazou@adacore.com> wrote: >> The attached patch catches C constructs: >> (A << 8) | (A >> 8) >> where A is unsigned 16 bits >> and maps them to builtin_bswap16(A) which can provide more efficient >> implementations on some targets. > > This belongs in tree-ssa-math-opts.c:execute_optimize_bswap instead. > OK I'll have a look at that. Actually I modified fold-const.c because it's here that the similar 32 bits pattern is turned into a rotate. > When I implemented __builtin_bswap16, I didn't add this because I thought this > would be overkill since the RTL combiner should be able to catch the pattern. > Have you investigated on this front? But I don't have a strong opinion. > No I didn't. As I said above, I looked at where the 32 bits pattern was handled and added the 16 bits one. Christophe.
On Thu, 2012-09-20 at 09:12 +0200, Eric Botcazou wrote: > > The attached patch catches C constructs: > > (A << 8) | (A >> 8) > > where A is unsigned 16 bits > > and maps them to builtin_bswap16(A) which can provide more efficient > > implementations on some targets. > > This belongs in tree-ssa-math-opts.c:execute_optimize_bswap instead. > > When I implemented __builtin_bswap16, I didn't add this because I thought this > would be overkill since the RTL combiner should be able to catch the pattern. > Have you investigated on this front? But I don't have a strong opinion. > A while ago I've tried doing that for SH (implementing bswap16 with RTL combine). It was like an explosion of patterns, because combine would try out a lot of things depending on the surrounding code around the actual bswap16. In the end I decided to drop that stuff for the most part. BTW, the built-in documentation says: Built-in Function: int16_t __builtin_bswap16 (int16_t x) Built-in Function: int32_t __builtin_bswap32 (int32_t x) However, it seems the result is always unsigned for those. At least on SH I get the following: int test (short x) { return __builtin_bswap16 (x); } swap.b r4,r4 ! 8 *rotlhi3_8 rts ! 24 *return_i extu.w r4,r0 ! 9 *zero_extendhisi2_compact ... and similarly for int32. Can anyone else confirm this? Cheers, Oleg
> A while ago I've tried doing that for SH (implementing bswap16 with RTL > combine). It was like an explosion of patterns, because combine would > try out a lot of things depending on the surrounding code around the > actual bswap16. In the end I decided to drop that stuff for the most > part. OK, thanks for your feedback. Note that you won't get the benefit of the optimization for SH if you don't define a bswaphi2 pattern. > BTW, the built-in documentation says: > > Built-in Function: int16_t __builtin_bswap16 (int16_t x) > Built-in Function: int32_t __builtin_bswap32 (int32_t x) > > However, it seems the result is always unsigned for those. Yes, builtins.def contains: DEF_GCC_BUILTIN (BUILT_IN_BSWAP16, "bswap16", BT_FN_UINT16_UINT16, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_BSWAP32, "bswap32", BT_FN_UINT32_UINT32, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_BSWAP64, "bswap64", BT_FN_UINT64_UINT64, ATTR_CONST_NOTHROW_LEAF_LIST) The documentation indeed needs to be fixed.
On Mon, 2012-09-24 at 09:15 +0200, Eric Botcazou wrote: > > A while ago I've tried doing that for SH (implementing bswap16 with RTL > > combine). It was like an explosion of patterns, because combine would > > try out a lot of things depending on the surrounding code around the > > actual bswap16. In the end I decided to drop that stuff for the most > > part. > > OK, thanks for your feedback. Note that you won't get the benefit of the > optimization for SH if you don't define a bswaphi2 pattern. Sure sure. I'll resume my efforts after Christophe's patch is in. > > > BTW, the built-in documentation says: > > > > Built-in Function: int16_t __builtin_bswap16 (int16_t x) > > Built-in Function: int32_t __builtin_bswap32 (int32_t x) > > > > However, it seems the result is always unsigned for those. > > Yes, builtins.def contains: > > DEF_GCC_BUILTIN (BUILT_IN_BSWAP16, "bswap16", BT_FN_UINT16_UINT16, > ATTR_CONST_NOTHROW_LEAF_LIST) > DEF_GCC_BUILTIN (BUILT_IN_BSWAP32, "bswap32", BT_FN_UINT32_UINT32, > ATTR_CONST_NOTHROW_LEAF_LIST) > DEF_GCC_BUILTIN (BUILT_IN_BSWAP64, "bswap64", BT_FN_UINT64_UINT64, > ATTR_CONST_NOTHROW_LEAF_LIST) > > The documentation indeed needs to be fixed. > OK, I'll check it out. Cheers, Oleg
diff --git a/gcc/fold-const.c b/gcc/fold-const.c index 2bf5179..0ff7e8b 100644 --- a/gcc/fold-const.c +++ b/gcc/fold-const.c @@ -10326,6 +10326,99 @@ fold_binary_loc (location_t loc, } } + /* Catch bswap16 construct: (A << 8) | (A >> 8) where A is + unsigned 16 bits. + This has been expanded into: + (ior:SI (lshift:SI (nop:SI A:HI) 8) + (nop:SI (rshift:HI A:HI 8))) + */ + { + enum tree_code code0, code1; + tree my_arg0 = arg0; + tree my_arg1= arg1; + tree rtype; + + code0 = TREE_CODE (arg0); + code1 = TREE_CODE (arg1); + if (code1 == NOP_EXPR) + { + my_arg1 = TREE_OPERAND (arg1, 0); + code1 = TREE_CODE (my_arg1); + } + else if (code0 == NOP_EXPR) + { + my_arg0 = TREE_OPERAND (arg0, 0); + code0 = TREE_CODE (my_arg0); + } + + /* Handle (A << C1) + (A >> C1). */ + if ((code1 == RSHIFT_EXPR && code0 == LSHIFT_EXPR) + && (TREE_CODE (TREE_OPERAND (my_arg0, 0)) == NOP_EXPR) + && operand_equal_p (TREE_OPERAND (TREE_OPERAND (my_arg0, 0), 0), + TREE_OPERAND (my_arg1, 0), 0) + && (rtype = TREE_TYPE (TREE_OPERAND (my_arg1, 0)), + TYPE_UNSIGNED (rtype))) + { + tree tree01, tree11; + enum tree_code code01, code11; + + tree01 = TREE_OPERAND (my_arg0, 1); + tree11 = TREE_OPERAND (my_arg1, 1); + STRIP_NOPS (tree01); + STRIP_NOPS (tree11); + code01 = TREE_CODE (tree01); + code11 = TREE_CODE (tree11); + + /* Check that shift amount is 8, and input 16 bits wide. */ + if (code01 == INTEGER_CST + && code11 == INTEGER_CST + && TREE_INT_CST_HIGH (tree01) == 0 + && TREE_INT_CST_HIGH (tree11) == 0 + && TREE_INT_CST_LOW (tree01) == TREE_INT_CST_LOW (tree11) + && TREE_INT_CST_LOW (tree01) == 8 + && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (my_arg1, 0))) == 16) + { + tree bswapfn = builtin_decl_explicit (BUILT_IN_BSWAP16); + return build_call_expr_loc (loc, bswapfn, 1, + TREE_OPERAND (my_arg1, 0)); + } + } + + /* Handle (A >> C1) + (A << C1). */ + else if ((code0 == RSHIFT_EXPR && code1 == LSHIFT_EXPR) + && (TREE_CODE (TREE_OPERAND (my_arg1, 0)) == NOP_EXPR) + && operand_equal_p (TREE_OPERAND (TREE_OPERAND (my_arg1, 0), + 0), + TREE_OPERAND (my_arg0, 0), 0) + && (rtype = TREE_TYPE (TREE_OPERAND (my_arg0, 0)), + TYPE_UNSIGNED (rtype))) + { + tree tree01, tree11; + enum tree_code code01, code11; + + tree01 = TREE_OPERAND (my_arg0, 1); + tree11 = TREE_OPERAND (my_arg1, 1); + STRIP_NOPS (tree01); + STRIP_NOPS (tree11); + code01 = TREE_CODE (tree01); + code11 = TREE_CODE (tree11); + + /* Check that shift amount is 8, and input 16 bits wide. */ + if (code01 == INTEGER_CST + && code11 == INTEGER_CST + && TREE_INT_CST_HIGH (tree01) == 0 + && TREE_INT_CST_HIGH (tree11) == 0 + && TREE_INT_CST_LOW (tree01) == TREE_INT_CST_LOW (tree11) + && TREE_INT_CST_LOW (tree01) == 8 + && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (my_arg0, 0))) == 16) + { + tree bswapfn = builtin_decl_explicit (BUILT_IN_BSWAP16); + return build_call_expr_loc (loc, bswapfn, 1, + TREE_OPERAND (my_arg0, 0)); + } + } + } + associate: /* In most languages, can't associate operations on floats through parentheses. Rather than remember where the parentheses were, we diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c b/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c new file mode 100644 index 0000000..93dbb35 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/builtin-bswap-2.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-require-effective-target arm_arch_v6_ok } */ +/* { dg-add-options arm_arch_v6 } */ +/* { dg-final { scan-assembler-not "orr\[ \t\]" } } */ + +unsigned short swapu16_1 (unsigned short x) +{ + return (x << 8) | (x >> 8); +} + +unsigned short swapu16_2 (unsigned short x) +{ + return (x >> 8) | (x << 8); +} + +unsigned int swapu32_1 (unsigned int x) +{ + return (x << 16) | (x >> 16); +} + +unsigned int swapu32_2 (unsigned int x) +{ + return (x >> 16) | (x << 16); +}