diff mbox

Remove redundant AND from count reduction loop

Message ID 87pp4m8mkp.fsf@e105548-lin.cambridge.arm.com
State New
Headers show

Commit Message

Richard Sandiford June 23, 2015, 3:35 p.m. UTC
We vectorise:

int
f (int *a, int n)
{
  int count = 0;
  for (int i = 0; i < n; ++i)
    if (a[i] < 255)
      count += 1;
  return count;
}

using an add reduction of a VEC_COND_EXPR <COND, {1, 1, ...}, {0, 0, ...}>.
This leads to the main loop having an AND with a loop invariant {1, 1, ...}.
E.g. on aarch64:

        movi    v2.4s, 0x1
.L4:
        lsl     x5, x4, 4
        add     x4, x4, 1
        cmp     w2, w4
        ldr     q1, [x0, x5]
        cmge    v1.4s, v3.4s, v1.4s
        and     v1.16b, v2.16b, v1.16b
        add     v0.4s, v0.4s, v1.4s
        bhi     .L4

This patch converts an ADD of that VEC_COND_EXPR into a SUB of COND:

.L4:
        lsl     x5, x4, 4
        add     x4, x4, 1
        cmp     w2, w4
        ldr     q1, [x0, x5]
        cmge    v1.4s, v2.4s, v1.4s
        sub     v0.4s, v0.4s, v1.4s
        bhi     .L4

At the moment the simplification is done during forwprop4, after the last
dce pass, and so the VEC_COND_EXPR survives until expand.  Richi says
this is a known problem.  Of course, the expression gets deleted by
rtl dce, but it means that a scan-tree-dump of the *.optimized output
can't easily tell that the optimisation has triggered.  I've therefore
added a scan-assembler test instead.

Bootstrapped & regression-tested on x86_64-linux-gnu.  Also tested
on aarch64-elf.  OK to install?

Thanks,
Richard


gcc/
	* match.pd: Add patterns for vec_conds between -1 and 0, and
	between 1 and 0.

gcc/testsuite/
	* gcc.target/aarch64/vect-add-sub-cond.c: New test.

Comments

Marc Glisse June 23, 2015, 9:27 p.m. UTC | #1
On Tue, 23 Jun 2015, Richard Sandiford wrote:

> +/* Vector comparisons are defined to produce all-one or all-zero results.  */
> +(simplify
> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
> +   (convert @0)))

I am trying to understand why the test tree_nop_conversion_p is the right 
one (at least for the transformations not using VIEW_CONVERT_EXPR). By 
definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer 
vector types of the same size and number of elements. It thus seems like a 
conversion is always fine. For vectors, tree_nop_conversion_p apparently 
only checks that they have the same mode (quite often VOIDmode I guess).

> +/* We could instead convert all instances of the vec_cond to negate,
> +   but that isn't necessarily a win on its own.  */
> +(simplify
> + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
> +  (minus @3 (convert @0))))
> +
> +(simplify
> + (plus:c @3 (view_convert_expr

Aren't we supposed to drop _expr in match.pd?

> + 	     (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
> +  (minus @3 (convert @0))))
> +
> +(simplify
> + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
> +  (plus @3 (convert @0))))
> +
> +(simplify
> + (minus @3 (view_convert_expr
> + 	    (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
> +  (plus @3 (convert @0))))
> +
> /* Simplifications of comparisons.  */
>
> /* We can simplify a logical negation of a comparison to the
> Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c
> ===================================================================
> --- /dev/null	2015-06-02 17:27:28.541944012 +0100
> +++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c	2015-06-23 12:06:27.120203685 +0100
> @@ -0,0 +1,94 @@
> +/* Make sure that vector comparison results are not unnecessarily ANDed
> +   with vectors of 1.  */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#define COUNT1(X) if (X) count += 1
> +#define COUNT2(X) if (X) count -= 1
> +#define COUNT3(X) count += (X)
> +#define COUNT4(X) count -= (X)
> +
> +#define COND1(X) (X)
> +#define COND2(X) ((X) ? 1 : 0)
> +#define COND3(X) ((X) ? -1 : 0)
> +#define COND4(X) ((X) ? 0 : 1)
> +#define COND5(X) ((X) ? 0 : -1)
> +
> +#define TEST_LT(X, Y) ((X) < (Y))
> +#define TEST_LE(X, Y) ((X) <= (Y))
> +#define TEST_GT(X, Y) ((X) > (Y))
> +#define TEST_GE(X, Y) ((X) >= (Y))
> +#define TEST_EQ(X, Y) ((X) == (Y))
> +#define TEST_NE(X, Y) ((X) != (Y))
> +
> +#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \
> +  TYPE \
> +  reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \
> +  { \
> +    TYPE count = 0; \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      COUNT (TEST (CMP_ARRAY[i], x)); \
> +    return count; \
> +  }
> +
> +#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \
> +  void \
> +  plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
> +  { \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \
> +  } \
> +  void \
> +  plusc_##ID (void) \
> +  { \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \
> +  } \
> +  void \
> +  minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
> +  { \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \
> +  } \
> +  void \
> +  minusc_##ID (void) \
> +  { \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 1)); \
> +  }
> +
> +#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \
> +  typedef __typeof__(ARRAY[0]) ID##_type; \
> +  COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \
> +  COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \
> +  COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \
> +  COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \
> +  COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \
> +  COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \
> +  COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \
> +  COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \
> +  COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5)
> +
> +signed int asi[1024] __attribute__ ((aligned (16)));
> +unsigned int aui[1024] __attribute__ ((aligned (16)));
> +signed long long asl[1024] __attribute__ ((aligned (16)));
> +unsigned long long aul[1024] __attribute__ ((aligned (16)));
> +float af[1024] __attribute__ ((aligned (16)));
> +double ad[1024] __attribute__ ((aligned (16)));
> +
> +ALL_LOOPS (si_si, aui, asi, TEST_LT)
> +ALL_LOOPS (ui_si, aui, asi, TEST_LE)
> +ALL_LOOPS (si_ui, aui, asi, TEST_GT)
> +ALL_LOOPS (ui_ui, aui, asi, TEST_GE)
> +ALL_LOOPS (sl_sl, asl, asl, TEST_NE)
> +ALL_LOOPS (ul_ul, aul, aul, TEST_NE)
> +ALL_LOOPS (si_f, asi, af, TEST_LE)
> +ALL_LOOPS (ui_f, aui, af, TEST_GT)
> +ALL_LOOPS (sl_d, asl, ad, TEST_GE)
> +ALL_LOOPS (ul_d, aul, ad, TEST_GT)
> +
> +/* { dg-final { scan-assembler-not "\tand\t" } } */
> +/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */
> +/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */
> +/* { dg-final { scan-assembler "\tldr\tq" } } */
> +/* { dg-final { scan-assembler "\tstr\tq" } } */
Richard Biener June 24, 2015, 9:22 a.m. UTC | #2
On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Tue, 23 Jun 2015, Richard Sandiford wrote:
>
>> +/* Vector comparisons are defined to produce all-one or all-zero results.
>> */
>> +(simplify
>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>> +   (convert @0)))
>
>
> I am trying to understand why the test tree_nop_conversion_p is the right
> one (at least for the transformations not using VIEW_CONVERT_EXPR). By
> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector
> types of the same size and number of elements. It thus seems like a
> conversion is always fine. For vectors, tree_nop_conversion_p apparently
> only checks that they have the same mode (quite often VOIDmode I guess).

The only conversion we seem to allow is changing the signed vector from
the comparison result to an unsigned vector (same number of elements
and same mode of the elements).  That is, a check using
TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably
be better (well, technically a TYPE_VECTOR_SUBPARTS && element
mode compare should be better as generic vectors might not have a vector mode).

I'm fine with using tree_nop_conversion_p for now.

>> +/* We could instead convert all instances of the vec_cond to negate,
>> +   but that isn't necessarily a win on its own.  */

so p ? 1 : 0 -> -p?  Why isn't that a win on its own?  It looks more compact
at least ;)  It would also simplify the patterns below.

I'm missing a comment on the transform done by the following patterns.

>> +(simplify
>> + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>> +  (minus @3 (convert @0))))
>> +
>> +(simplify
>> + (plus:c @3 (view_convert_expr
>
>
> Aren't we supposed to drop _expr in match.pd?

Yes.  I probably should adjust genmatch.c to reject the _expr variants ;)

>
>> +            (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>> +  (minus @3 (convert @0))))
>> +
>> +(simplify
>> + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>> +  (plus @3 (convert @0))))
>> +
>> +(simplify
>> + (minus @3 (view_convert_expr
>> +           (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>> +  (plus @3 (convert @0))))
>> +

Generally for sign-conversions of vectors you should use view_convert.

The above also hints at missing conditional view_convert support
and a way to iterate over commutative vs. non-commutative ops so
we could write

(for op (plus:c minus)
     rop (minus plus)
  (op @3 (view_convert? (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
  (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
   (rop @3 (view_convert @0)))))

I'll see implementing that.

Richard.


>> /* Simplifications of comparisons.  */
>>
>> /* We can simplify a logical negation of a comparison to the
>> Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c
>> ===================================================================
>> --- /dev/null   2015-06-02 17:27:28.541944012 +0100
>> +++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c        2015-06-23
>> 12:06:27.120203685 +0100
>> @@ -0,0 +1,94 @@
>> +/* Make sure that vector comparison results are not unnecessarily ANDed
>> +   with vectors of 1.  */
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -ftree-vectorize" } */
>> +
>> +#define COUNT1(X) if (X) count += 1
>> +#define COUNT2(X) if (X) count -= 1
>> +#define COUNT3(X) count += (X)
>> +#define COUNT4(X) count -= (X)
>> +
>> +#define COND1(X) (X)
>> +#define COND2(X) ((X) ? 1 : 0)
>> +#define COND3(X) ((X) ? -1 : 0)
>> +#define COND4(X) ((X) ? 0 : 1)
>> +#define COND5(X) ((X) ? 0 : -1)
>> +
>> +#define TEST_LT(X, Y) ((X) < (Y))
>> +#define TEST_LE(X, Y) ((X) <= (Y))
>> +#define TEST_GT(X, Y) ((X) > (Y))
>> +#define TEST_GE(X, Y) ((X) >= (Y))
>> +#define TEST_EQ(X, Y) ((X) == (Y))
>> +#define TEST_NE(X, Y) ((X) != (Y))
>> +
>> +#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \
>> +  TYPE \
>> +  reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \
>> +  { \
>> +    TYPE count = 0; \
>> +    for (unsigned int i = 0; i < 1024; ++i) \
>> +      COUNT (TEST (CMP_ARRAY[i], x)); \
>> +    return count; \
>> +  }
>> +
>> +#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \
>> +  void \
>> +  plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
>> +  { \
>> +    for (unsigned int i = 0; i < 1024; ++i) \
>> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \
>> +  } \
>> +  void \
>> +  plusc_##ID (void) \
>> +  { \
>> +    for (unsigned int i = 0; i < 1024; ++i) \
>> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \
>> +  } \
>> +  void \
>> +  minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
>> +  { \
>> +    for (unsigned int i = 0; i < 1024; ++i) \
>> +      ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \
>> +  } \
>> +  void \
>> +  minusc_##ID (void) \
>> +  { \
>> +    for (unsigned int i = 0; i < 1024; ++i) \
>> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 1)); \
>> +  }
>> +
>> +#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \
>> +  typedef __typeof__(ARRAY[0]) ID##_type; \
>> +  COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \
>> +  COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \
>> +  COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \
>> +  COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \
>> +  COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \
>> +  COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \
>> +  COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \
>> +  COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \
>> +  COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5)
>> +
>> +signed int asi[1024] __attribute__ ((aligned (16)));
>> +unsigned int aui[1024] __attribute__ ((aligned (16)));
>> +signed long long asl[1024] __attribute__ ((aligned (16)));
>> +unsigned long long aul[1024] __attribute__ ((aligned (16)));
>> +float af[1024] __attribute__ ((aligned (16)));
>> +double ad[1024] __attribute__ ((aligned (16)));
>> +
>> +ALL_LOOPS (si_si, aui, asi, TEST_LT)
>> +ALL_LOOPS (ui_si, aui, asi, TEST_LE)
>> +ALL_LOOPS (si_ui, aui, asi, TEST_GT)
>> +ALL_LOOPS (ui_ui, aui, asi, TEST_GE)
>> +ALL_LOOPS (sl_sl, asl, asl, TEST_NE)
>> +ALL_LOOPS (ul_ul, aul, aul, TEST_NE)
>> +ALL_LOOPS (si_f, asi, af, TEST_LE)
>> +ALL_LOOPS (ui_f, aui, af, TEST_GT)
>> +ALL_LOOPS (sl_d, asl, ad, TEST_GE)
>> +ALL_LOOPS (ul_d, aul, ad, TEST_GT)
>> +
>> +/* { dg-final { scan-assembler-not "\tand\t" } } */
>> +/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */
>> +/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */
>> +/* { dg-final { scan-assembler "\tldr\tq" } } */
>> +/* { dg-final { scan-assembler "\tstr\tq" } } */
>
>
> --
> Marc Glisse
Richard Sandiford June 24, 2015, 9:57 a.m. UTC | #3
Richard Biener <richard.guenther@gmail.com> writes:
> On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>> On Tue, 23 Jun 2015, Richard Sandiford wrote:
>>
>>> +/* Vector comparisons are defined to produce all-one or all-zero results.
>>> */
>>> +(simplify
>>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>> +   (convert @0)))
>>
>>
>> I am trying to understand why the test tree_nop_conversion_p is the right
>> one (at least for the transformations not using VIEW_CONVERT_EXPR). By
>> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector
>> types of the same size and number of elements. It thus seems like a
>> conversion is always fine. For vectors, tree_nop_conversion_p apparently
>> only checks that they have the same mode (quite often VOIDmode I guess).
>
> The only conversion we seem to allow is changing the signed vector from
> the comparison result to an unsigned vector (same number of elements
> and same mode of the elements).  That is, a check using
> TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably
> be better (well, technically a TYPE_VECTOR_SUBPARTS && element
> mode compare should be better as generic vectors might not have a vector mode).

OK.  The reason I was being paranoid was that I couldn't see anywhere
where we enforced that the vector condition in a VEC_COND had to have
the same element width as the values being selected.  tree-cfg.c
only checks that rhs2 and rhs3 are compatible with the result.
There doesn't seem to be any checking of rhs1 vs. the other types.
So I wasn't sure whether anything stopped us from, e.g., comparing two
V4HIs and using the result to select between two V4SIs.

> I'm fine with using tree_nop_conversion_p for now.

I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element
mode.  How about:

 (if (VECTOR_INTEGER_TYPE_P (type)
      && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))
      && (TYPE_MODE (TREE_TYPE (type))
          == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0)))))

(But is it really OK to be adding more mode-based compatibility checks?
I thought you were hoping to move away from modes in the middle end.)

>>> +/* We could instead convert all instances of the vec_cond to negate,
>>> +   but that isn't necessarily a win on its own.  */
>
> so p ? 1 : 0 -> -p?  Why isn't that a win on its own?  It looks more compact
> at least ;)  It would also simplify the patterns below.

In the past I've dealt with processors where arithmetic wasn't handled
as efficiently as logical ops.  Seems like an especial risk for 64-bit
elements, from a quick scan of the i386 scheduling models.

> I'm missing a comment on the transform done by the following patterns.

Heh.  The comment was supposed to be describing all four at once.
I originally had them bunched together without whitespace, but it
looked bad.

>>> +(simplify
>>> + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>> +  (minus @3 (convert @0))))
>>> +
>>> +(simplify
>>> + (plus:c @3 (view_convert_expr
>>
>>
>> Aren't we supposed to drop _expr in match.pd?
>
> Yes.  I probably should adjust genmatch.c to reject the _expr variants ;)

OK.

>>> +            (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>> +  (minus @3 (convert @0))))
>>> +
>>> +(simplify
>>> + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>> +  (plus @3 (convert @0))))
>>> +
>>> +(simplify
>>> + (minus @3 (view_convert_expr
>>> +           (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>> +  (plus @3 (convert @0))))
>>> +
>
> Generally for sign-conversions of vectors you should use view_convert.

OK.

> The above also hints at missing conditional view_convert support
> and a way to iterate over commutative vs. non-commutative ops so
> we could write
>
> (for op (plus:c minus)
>      rop (minus plus)
>   (op @3 (view_convert? (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>   (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>    (rop @3 (view_convert @0)))))
>
> I'll see implementing that.

Looks good. :-)

I also realised later that:

/* Vector comparisons are defined to produce all-one or all-zero results.  */
(simplify
 (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
 (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
   (convert @0)))

is redundant with some fold-const.c code.

Thanks,
Richard
Richard Biener June 24, 2015, 10:21 a.m. UTC | #4
On Wed, Jun 24, 2015 at 11:57 AM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
> Richard Biener <richard.guenther@gmail.com> writes:
>> On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>>> On Tue, 23 Jun 2015, Richard Sandiford wrote:
>>>
>>>> +/* Vector comparisons are defined to produce all-one or all-zero results.
>>>> */
>>>> +(simplify
>>>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>>> +   (convert @0)))
>>>
>>>
>>> I am trying to understand why the test tree_nop_conversion_p is the right
>>> one (at least for the transformations not using VIEW_CONVERT_EXPR). By
>>> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector
>>> types of the same size and number of elements. It thus seems like a
>>> conversion is always fine. For vectors, tree_nop_conversion_p apparently
>>> only checks that they have the same mode (quite often VOIDmode I guess).
>>
>> The only conversion we seem to allow is changing the signed vector from
>> the comparison result to an unsigned vector (same number of elements
>> and same mode of the elements).  That is, a check using
>> TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably
>> be better (well, technically a TYPE_VECTOR_SUBPARTS && element
>> mode compare should be better as generic vectors might not have a vector mode).
>
> OK.  The reason I was being paranoid was that I couldn't see anywhere
> where we enforced that the vector condition in a VEC_COND had to have
> the same element width as the values being selected.

We don't require that indeed.

>  tree-cfg.c
> only checks that rhs2 and rhs3 are compatible with the result.
> There doesn't seem to be any checking of rhs1 vs. the other types.
> So I wasn't sure whether anything stopped us from, e.g., comparing two
> V4HIs and using the result to select between two V4SIs.

Nothing does (or should).

>> I'm fine with using tree_nop_conversion_p for now.
>
> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element
> mode.  How about:
>
>  (if (VECTOR_INTEGER_TYPE_P (type)
>       && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))
>       && (TYPE_MODE (TREE_TYPE (type))
>           == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0)))))
>
> (But is it really OK to be adding more mode-based compatibility checks?
> I thought you were hoping to move away from modes in the middle end.)

The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant
(the type of a comparison is always a signed vector integer type).
Yes, mode-based
checks are ok.  I don't see us moving away from them.

>>>> +/* We could instead convert all instances of the vec_cond to negate,
>>>> +   but that isn't necessarily a win on its own.  */
>>
>> so p ? 1 : 0 -> -p?  Why isn't that a win on its own?  It looks more compact
>> at least ;)  It would also simplify the patterns below.
>
> In the past I've dealt with processors where arithmetic wasn't handled
> as efficiently as logical ops.  Seems like an especial risk for 64-bit
> elements, from a quick scan of the i386 scheduling models.

But then expansion could undo this ...

>> I'm missing a comment on the transform done by the following patterns.
>
> Heh.  The comment was supposed to be describing all four at once.
> I originally had them bunched together without whitespace, but it
> looked bad.
>
>>>> +(simplify
>>>> + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>>> +  (minus @3 (convert @0))))
>>>> +
>>>> +(simplify
>>>> + (plus:c @3 (view_convert_expr
>>>
>>>
>>> Aren't we supposed to drop _expr in match.pd?
>>
>> Yes.  I probably should adjust genmatch.c to reject the _expr variants ;)
>
> OK.
>
>>>> +            (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>>> +  (minus @3 (convert @0))))
>>>> +
>>>> +(simplify
>>>> + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>>> +  (plus @3 (convert @0))))
>>>> +
>>>> +(simplify
>>>> + (minus @3 (view_convert_expr
>>>> +           (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>>> +  (plus @3 (convert @0))))
>>>> +
>>
>> Generally for sign-conversions of vectors you should use view_convert.
>
> OK.
>
>> The above also hints at missing conditional view_convert support
>> and a way to iterate over commutative vs. non-commutative ops so
>> we could write
>>
>> (for op (plus:c minus)
>>      rop (minus plus)
>>   (op @3 (view_convert? (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>>   (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>    (rop @3 (view_convert @0)))))
>>
>> I'll see implementing that.
>
> Looks good. :-)
>
> I also realised later that:
>
> /* Vector comparisons are defined to produce all-one or all-zero results.  */
> (simplify
>  (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
>  (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>    (convert @0)))
>
> is redundant with some fold-const.c code.

If so then you should remove the fold-const.c at the time you add the pattern.

Note that ISTR code performing exactly the opposite transform in
fold-const.c ...

Richard.

> Thanks,
> Richard
>
Richard Sandiford June 24, 2015, 11:10 a.m. UTC | #5
Richard Biener <richard.guenther@gmail.com> writes:
>>> I'm fine with using tree_nop_conversion_p for now.
>>
>> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element
>> mode.  How about:
>>
>>  (if (VECTOR_INTEGER_TYPE_P (type)
>>       && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))
>>       && (TYPE_MODE (TREE_TYPE (type))
>>           == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0)))))
>>
>> (But is it really OK to be adding more mode-based compatibility checks?
>> I thought you were hoping to move away from modes in the middle end.)
>
> The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant
> (the type of a comparison is always a signed vector integer type).

OK, will just use VECTOR_TYPE_P then.

>>>>> +/* We could instead convert all instances of the vec_cond to negate,
>>>>> +   but that isn't necessarily a win on its own.  */
>>>
>>> so p ? 1 : 0 -> -p?  Why isn't that a win on its own?  It looks more compact
>>> at least ;)  It would also simplify the patterns below.
>>
>> In the past I've dealt with processors where arithmetic wasn't handled
>> as efficiently as logical ops.  Seems like an especial risk for 64-bit
>> elements, from a quick scan of the i386 scheduling models.
>
> But then expansion could undo this ...

So do the inverse fold and convert (neg (cond)) to (vec_cond cond 1 0)?
Is there precedent for doing that kind of thing?

>> I also realised later that:
>>
>> /* Vector comparisons are defined to produce all-one or all-zero results.  */
>> (simplify
>>  (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
>>  (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>    (convert @0)))
>>
>> is redundant with some fold-const.c code.
>
> If so then you should remove the fold-const.c at the time you add the pattern.

Can I just drop that part of the patch instead?  The fold-const.c
code handles COND_EXPR and VEC_COND_EXPR analogously, so I'd have
to move COND_EXPR at the same time.  And then the natural follow-on
would be: why not move the other COND_EXPR and VEC_COND_EXPR folds too? :-)

> Note that ISTR code performing exactly the opposite transform in
> fold-const.c ...

That's another reason why I'm worried about just doing the (negate ...)
thing without knowing whether the negate can be folded into anything else.

Thanks,
Richard
Richard Biener June 24, 2015, 11:43 a.m. UTC | #6
On Wed, Jun 24, 2015 at 1:10 PM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
> Richard Biener <richard.guenther@gmail.com> writes:
>>>> I'm fine with using tree_nop_conversion_p for now.
>>>
>>> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element
>>> mode.  How about:
>>>
>>>  (if (VECTOR_INTEGER_TYPE_P (type)
>>>       && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))
>>>       && (TYPE_MODE (TREE_TYPE (type))
>>>           == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0)))))
>>>
>>> (But is it really OK to be adding more mode-based compatibility checks?
>>> I thought you were hoping to move away from modes in the middle end.)
>>
>> The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant
>> (the type of a comparison is always a signed vector integer type).
>
> OK, will just use VECTOR_TYPE_P then.

Given we're in a VEC_COND_EXPR that's redundant as well.

>>>>>> +/* We could instead convert all instances of the vec_cond to negate,
>>>>>> +   but that isn't necessarily a win on its own.  */
>>>>
>>>> so p ? 1 : 0 -> -p?  Why isn't that a win on its own?  It looks more compact
>>>> at least ;)  It would also simplify the patterns below.
>>>
>>> In the past I've dealt with processors where arithmetic wasn't handled
>>> as efficiently as logical ops.  Seems like an especial risk for 64-bit
>>> elements, from a quick scan of the i386 scheduling models.
>>
>> But then expansion could undo this ...
>
> So do the inverse fold and convert (neg (cond)) to (vec_cond cond 1 0)?
> Is there precendent for doing that kind of thing?

Expanding it as this, yes.  Whether there is precedence no idea, but
surely the expand_unop path could, if there is no optab for neg:vector_mode,
try expanding as vec_cond .. 1 0.  There is precedence for different
expansion paths dependent on optabs (or even rtx cost?).  Of course
expand_unop doesn't get the original tree ops (expand_expr.c does,
where some special-casing using get_gimple_for_expr is).  Not sure
if expand_unop would get 'cond' in a form where it can recognize
the result is either -1 or 0.

>>> I also realised later that:
>>>
>>> /* Vector comparisons are defined to produce all-one or all-zero results.  */
>>> (simplify
>>>  (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
>>>  (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>>    (convert @0)))
>>>
>>> is redundant with some fold-const.c code.
>>
>> If so then you should remove the fold-const.c at the time you add the pattern.
>
> Can I just drop that part of the patch instead?  The fold-const.c
> code handles COND_EXPR and VEC_COND_EXPR analogously, so I'd have
> to move COND_EXPR at the same time.  And then the natural follow-on
> would be: why not move the other COND_EXPR and VEC_COND_EXPR folds too? :-)

Yes, why not? ;)  But sure, you can also drop the case for now.

>> Note that ISTR code performing exactly the opposite transform in
>> fold-const.c ...
>
> That's another reason why I'm worried about just doing the (negate ...)
> thing without knowing whether the negate can be folded into anything else.

I'm not aware of anything here.

Richard.

> Thanks,
> Richard
>
Richard Sandiford June 24, 2015, 12:28 p.m. UTC | #7
Richard Biener <richard.guenther@gmail.com> writes:
> On Wed, Jun 24, 2015 at 1:10 PM, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>> Richard Biener <richard.guenther@gmail.com> writes:
>>>>> I'm fine with using tree_nop_conversion_p for now.
>>>>
>>>> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element
>>>> mode.  How about:
>>>>
>>>>  (if (VECTOR_INTEGER_TYPE_P (type)
>>>>       && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))
>>>>       && (TYPE_MODE (TREE_TYPE (type))
>>>>           == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0)))))
>>>>
>>>> (But is it really OK to be adding more mode-based compatibility checks?
>>>> I thought you were hoping to move away from modes in the middle end.)
>>>
>>> The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant
>>> (the type of a comparison is always a signed vector integer type).
>>
>> OK, will just use VECTOR_TYPE_P then.
>
> Given we're in a VEC_COND_EXPR that's redundant as well.

Hmm, but is it really guaranteed in:

 (plus:c @3 (view_convert (vec_cond @0 integer_each_onep@1 integer_zerop@2)))

that the @3 and the view_convert are also vectors?  I thought we allowed
view_converts from vector to non-vector types.

>>>>>>> +/* We could instead convert all instances of the vec_cond to negate,
>>>>>>> +   but that isn't necessarily a win on its own.  */
>>>>>
>>>>> so p ? 1 : 0 -> -p?  Why isn't that a win on its own?  It looks
>>>>> more compact
>>>>> at least ;)  It would also simplify the patterns below.
>>>>
>>>> In the past I've dealt with processors where arithmetic wasn't handled
>>>> as efficiently as logical ops.  Seems like an especial risk for 64-bit
>>>> elements, from a quick scan of the i386 scheduling models.
>>>
>>> But then expansion could undo this ...
>>
>> So do the inverse fold and convert (neg (cond)) to (vec_cond cond 1 0)?
>> Is there precendent for doing that kind of thing?
>
> Expanding it as this, yes.  Whether there is precedence no idea, but
> surely the expand_unop path could, if there is no optab for neg:vector_mode,
> try expanding as vec_cond .. 1 0.

Yeah, that part isn't the problem.  It's when there is an implementation
of (neg ...) (which I'd hope all real integer vector architectures would
support) but it's not as efficient as the (and ...) that most targets
would use for a (vec_cond ... 0).

> There is precedence for different
> expansion paths dependent on optabs (or even rtx cost?).  Of course
> expand_unop doesn't get the original tree ops (expand_expr.c does,
> where some special-casing using get_gimple_for_expr is).  Not sure
> if expand_unop would get 'cond' in a form where it can recognize
> the result is either -1 or 0.

It just seems inconsistent to have the optabs machinery try to detect
this ad-hoc combination opportunity while still leaving the vcond optab
to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0).
The vcond optabs would still have the logic needed to produce the
right code, but we'd be circumventing it and trying to reimplement
one particular case in a different way.

Thanks,
Richard
Richard Biener June 24, 2015, 12:58 p.m. UTC | #8
On Wed, Jun 24, 2015 at 2:28 PM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
> Richard Biener <richard.guenther@gmail.com> writes:
>> On Wed, Jun 24, 2015 at 1:10 PM, Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>>> Richard Biener <richard.guenther@gmail.com> writes:
>>>>>> I'm fine with using tree_nop_conversion_p for now.
>>>>>
>>>>> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element
>>>>> mode.  How about:
>>>>>
>>>>>  (if (VECTOR_INTEGER_TYPE_P (type)
>>>>>       && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))
>>>>>       && (TYPE_MODE (TREE_TYPE (type))
>>>>>           == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0)))))
>>>>>
>>>>> (But is it really OK to be adding more mode-based compatibility checks?
>>>>> I thought you were hoping to move away from modes in the middle end.)
>>>>
>>>> The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant
>>>> (the type of a comparison is always a signed vector integer type).
>>>
>>> OK, will just use VECTOR_TYPE_P then.
>>
>> Given we're in a VEC_COND_EXPR that's redundant as well.
>
> Hmm, but is it really guaranteed in:
>
>  (plus:c @3 (view_convert (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
>
> that the @3 and the view_convert are also vectors?  I thought we allowed
> view_converts from vector to non-vector types.

Hmm, true.

>>>>>>>> +/* We could instead convert all instances of the vec_cond to negate,
>>>>>>>> +   but that isn't necessarily a win on its own.  */
>>>>>>
>>>>>> so p ? 1 : 0 -> -p?  Why isn't that a win on its own?  It looks
>>>>>> more compact
>>>>>> at least ;)  It would also simplify the patterns below.
>>>>>
>>>>> In the past I've dealt with processors where arithmetic wasn't handled
>>>>> as efficiently as logical ops.  Seems like an especial risk for 64-bit
>>>>> elements, from a quick scan of the i386 scheduling models.
>>>>
>>>> But then expansion could undo this ...
>>>
>>> So do the inverse fold and convert (neg (cond)) to (vec_cond cond 1 0)?
>>> Is there precedent for doing that kind of thing?
>>
>> Expanding it as this, yes.  Whether there is precedence no idea, but
>> surely the expand_unop path could, if there is no optab for neg:vector_mode,
>> try expanding as vec_cond .. 1 0.
>
> Yeah, that part isn't the problem.  It's when there is an implementation
> of (neg ...) (which I'd hope all real integer vector architectures would
> support) but it's not as efficient as the (and ...) that most targets
> would use for a (vec_cond ... 0).

I would suppose that a single-operand op (neg) is always bettter than a
two-operand (and) one.  But you of course never know...

>> There is precedence for different
>> expansion paths dependent on optabs (or even rtx cost?).  Of course
>> expand_unop doesn't get the original tree ops (expand_expr.c does,
>> where some special-casing using get_gimple_for_expr is).  Not sure
>> if expand_unop would get 'cond' in a form where it can recognize
>> the result is either -1 or 0.
>
> It just seems inconsistent to have the optabs machinery try to detect
> this ad-hoc combination opportunity while still leaving the vcond optab
> to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0).
> The vcond optabs would still have the logic needed to produce the
> right code, but we'd be circumventing it and trying to reimplement
> one particular case in a different way.

That's true.  One could also leave it to combine / simplify_rtx and
thus rtx_cost.  But that's true of all of the match.pd stuff you add, no?

Richard.

> Thanks,
> Richard
>
Richard Sandiford June 24, 2015, 1:37 p.m. UTC | #9
>>> There is precedence for different
>>> expansion paths dependent on optabs (or even rtx cost?).  Of course
>>> expand_unop doesn't get the original tree ops (expand_expr.c does,
>>> where some special-casing using get_gimple_for_expr is).  Not sure
>>> if expand_unop would get 'cond' in a form where it can recognize
>>> the result is either -1 or 0.
>>
>> It just seems inconsistent to have the optabs machinery try to detect
>> this ad-hoc combination opportunity while still leaving the vcond optab
>> to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0).
>> The vcond optabs would still have the logic needed to produce the
>> right code, but we'd be circumventing it and trying to reimplement
>> one particular case in a different way.
>
> That's true.  One could also leave it to combine / simplify_rtx and
> thus rtx_cost.  But that's true of all of the match.pd stuff you add, no?

It's probably true of most match.pd stuff in general though :-)
One advantage of match.pd of course is that it works across
block boundaries.

The difference between the stuff I added and converting vec_cond_expr
to negate is that the stuff I added avoids the vec_cond_expr altogether
and so ought to be an unequivocal win.  Replacing vec_cond_expr with
negate just rewrites it into another (arguably more surprising) form.

Thanks,
Richard
Richard Biener June 24, 2015, 2:08 p.m. UTC | #10
On Wed, Jun 24, 2015 at 3:37 PM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>>>> There is precedence for different
>>>> expansion paths dependent on optabs (or even rtx cost?).  Of course
>>>> expand_unop doesn't get the original tree ops (expand_expr.c does,
>>>> where some special-casing using get_gimple_for_expr is).  Not sure
>>>> if expand_unop would get 'cond' in a form where it can recognize
>>>> the result is either -1 or 0.
>>>
>>> It just seems inconsistent to have the optabs machinery try to detect
>>> this ad-hoc combination opportunity while still leaving the vcond optab
>>> to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0).
>>> The vcond optabs would still have the logic needed to produce the
>>> right code, but we'd be circumventing it and trying to reimplement
>>> one particular case in a different way.
>>
>> That's true.  One could also leave it to combine / simplify_rtx and
>> thus rtx_cost.  But that's true of all of the match.pd stuff you add, no?
>
> It's probably true of most match.pd stuff in general though :-)
> One advantage of match.pd of course is that it works across
> block boundaries.
>
> The difference between the stuff I added and converting vec_cond_expr
> to negate is that the stuff I added avoids the vec_cond_expr altogether
> and so ought to be an unequivocal win.  Replacing vec_cond_expr with
> negate just rewrites it into another (arguably more surprising) form.

True.  Btw, conditional view_convert is now in trunk so you can at least
merge both plus:c patterns and both minus patterns.

Richard.

> Thanks,
> Richard
>
Jeff Law June 24, 2015, 4:05 p.m. UTC | #11
On 06/24/2015 05:43 AM, Richard Biener wrote:

>>> Note that ISTR code performing exactly the opposite transform in
>>> fold-const.c ...
>>
>> That's another reason why I'm worried about just doing the (negate ...)
>> thing without knowing whether the negate can be folded into anything else.
>
> I'm not aware of anything here.
It's worth looking at -- I've certainly seen cases where we end up in 
infinite recursion because we've got a transformation in one place (say 
match.pd) and its inverse elsewhere (fold-const.c).

Jeff
Marc Glisse June 25, 2015, 10:39 p.m. UTC | #12
On Wed, 24 Jun 2015, Richard Biener wrote:

> On Wed, Jun 24, 2015 at 11:57 AM, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>> Richard Biener <richard.guenther@gmail.com> writes:
>>> On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>>>> On Tue, 23 Jun 2015, Richard Sandiford wrote:
>>>>
>>>>> +/* Vector comparisons are defined to produce all-one or all-zero results.
>>>>> */
>>>>> +(simplify
>>>>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
>>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>>>> +   (convert @0)))
>>>>
>>>>
>>>> I am trying to understand why the test tree_nop_conversion_p is the right
>>>> one (at least for the transformations not using VIEW_CONVERT_EXPR). By
>>>> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector
>>>> types of the same size and number of elements. It thus seems like a
>>>> conversion is always fine. For vectors, tree_nop_conversion_p apparently
>>>> only checks that they have the same mode (quite often VOIDmode I guess).
>>>
>>> The only conversion we seem to allow is changing the signed vector from
>>> the comparison result to an unsigned vector (same number of elements
>>> and same mode of the elements).  That is, a check using
>>> TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably
>>> be better (well, technically a TYPE_VECTOR_SUBPARTS && element
>>> mode compare should be better as generic vectors might not have a vector mode).
>>
>> OK.  The reason I was being paranoid was that I couldn't see anywhere
>> where we enforced that the vector condition in a VEC_COND had to have
>> the same element width as the values being selected.
>
> We don't require that indeed.
>
>>  tree-cfg.c
>> only checks that rhs2 and rhs3 are compatible with the result.
>> There doesn't seem to be any checking of rhs1 vs. the other types.
>> So I wasn't sure whether anything stopped us from, e.g., comparing two
>> V4HIs and using the result to select between two V4SIs.
>
> Nothing does (or should).

The documentation patch you approved in 
https://gcc.gnu.org/ml/gcc-patches/2012-10/msg01109.html says something 
different. If it is really wrong, could you fix it?
Richard Biener June 26, 2015, 9:44 a.m. UTC | #13
On Fri, Jun 26, 2015 at 12:39 AM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Wed, 24 Jun 2015, Richard Biener wrote:
>
>> On Wed, Jun 24, 2015 at 11:57 AM, Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>>>
>>> Richard Biener <richard.guenther@gmail.com> writes:
>>>>
>>>> On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr>
>>>> wrote:
>>>>>
>>>>> On Tue, 23 Jun 2015, Richard Sandiford wrote:
>>>>>
>>>>>> +/* Vector comparisons are defined to produce all-one or all-zero
>>>>>> results.
>>>>>> */
>>>>>> +(simplify
>>>>>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
>>>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>>>>>> +   (convert @0)))
>>>>>
>>>>>
>>>>>
>>>>> I am trying to understand why the test tree_nop_conversion_p is the
>>>>> right
>>>>> one (at least for the transformations not using VIEW_CONVERT_EXPR). By
>>>>> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer
>>>>> vector
>>>>> types of the same size and number of elements. It thus seems like a
>>>>> conversion is always fine. For vectors, tree_nop_conversion_p
>>>>> apparently
>>>>> only checks that they have the same mode (quite often VOIDmode I
>>>>> guess).
>>>>
>>>>
>>>> The only conversion we seem to allow is changing the signed vector from
>>>> the comparison result to an unsigned vector (same number of elements
>>>> and same mode of the elements).  That is, a check using
>>>> TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably
>>>> be better (well, technically a TYPE_VECTOR_SUBPARTS && element
>>>> mode compare should be better as generic vectors might not have a vector
>>>> mode).
>>>
>>>
>>> OK.  The reason I was being paranoid was that I couldn't see anywhere
>>> where we enforced that the vector condition in a VEC_COND had to have
>>> the same element width as the values being selected.
>>
>>
>> We don't require that indeed.
>>
>>>  tree-cfg.c
>>> only checks that rhs2 and rhs3 are compatible with the result.
>>> There doesn't seem to be any checking of rhs1 vs. the other types.
>>> So I wasn't sure whether anything stopped us from, e.g., comparing two
>>> V4HIs and using the result to select between two V4SIs.
>>
>>
>> Nothing does (or should).
>
>
> The documentation patch you approved in
> https://gcc.gnu.org/ml/gcc-patches/2012-10/msg01109.html says something
> different. If it is really wrong, could you fix it?

Hmm, that simplifies things.  It would be nice if these constraints
would also be
checked in the gimple verifier...

Richard.

> --
> Marc Glisse
diff mbox

Patch

Index: gcc/match.pd
===================================================================
--- gcc/match.pd	2015-06-23 11:42:23.644645975 +0100
+++ gcc/match.pd	2015-06-23 11:42:23.760644655 +0100
@@ -973,6 +973,36 @@  along with GCC; see the file COPYING3.
   (cnd @0 @2 @1)))
 
 
+/* Vector comparisons are defined to produce all-one or all-zero results.  */
+(simplify
+ (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+   (convert @0)))
+
+/* We could instead convert all instances of the vec_cond to negate,
+   but that isn't necessarily a win on its own.  */
+(simplify
+ (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (minus @3 (convert @0))))
+
+(simplify
+ (plus:c @3 (view_convert_expr
+ 	     (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (minus @3 (convert @0))))
+
+(simplify
+ (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (plus @3 (convert @0))))
+
+(simplify
+ (minus @3 (view_convert_expr
+ 	    (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (plus @3 (convert @0))))
+
 /* Simplifications of comparisons.  */
 
 /* We can simplify a logical negation of a comparison to the
Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c
===================================================================
--- /dev/null	2015-06-02 17:27:28.541944012 +0100
+++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c	2015-06-23 12:06:27.120203685 +0100
@@ -0,0 +1,94 @@ 
+/* Make sure that vector comparison results are not unnecessarily ANDed
+   with vectors of 1.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define COUNT1(X) if (X) count += 1
+#define COUNT2(X) if (X) count -= 1
+#define COUNT3(X) count += (X)
+#define COUNT4(X) count -= (X)
+
+#define COND1(X) (X)
+#define COND2(X) ((X) ? 1 : 0)
+#define COND3(X) ((X) ? -1 : 0)
+#define COND4(X) ((X) ? 0 : 1)
+#define COND5(X) ((X) ? 0 : -1)
+
+#define TEST_LT(X, Y) ((X) < (Y))
+#define TEST_LE(X, Y) ((X) <= (Y))
+#define TEST_GT(X, Y) ((X) > (Y))
+#define TEST_GE(X, Y) ((X) >= (Y))
+#define TEST_EQ(X, Y) ((X) == (Y))
+#define TEST_NE(X, Y) ((X) != (Y))
+
+#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \
+  TYPE \
+  reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    TYPE count = 0; \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      COUNT (TEST (CMP_ARRAY[i], x)); \
+    return count; \
+  }
+
+#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \
+  void \
+  plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \
+  } \
+  void \
+  plusc_##ID (void) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \
+  } \
+  void \
+  minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \
+  } \
+  void \
+  minusc_##ID (void) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 1)); \
+  }
+
+#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \
+  typedef __typeof__(ARRAY[0]) ID##_type; \
+  COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \
+  COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \
+  COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \
+  COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \
+  COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \
+  COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \
+  COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \
+  COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \
+  COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5)
+
+signed int asi[1024] __attribute__ ((aligned (16)));
+unsigned int aui[1024] __attribute__ ((aligned (16)));
+signed long long asl[1024] __attribute__ ((aligned (16)));
+unsigned long long aul[1024] __attribute__ ((aligned (16)));
+float af[1024] __attribute__ ((aligned (16)));
+double ad[1024] __attribute__ ((aligned (16)));
+
+ALL_LOOPS (si_si, aui, asi, TEST_LT)
+ALL_LOOPS (ui_si, aui, asi, TEST_LE)
+ALL_LOOPS (si_ui, aui, asi, TEST_GT)
+ALL_LOOPS (ui_ui, aui, asi, TEST_GE)
+ALL_LOOPS (sl_sl, asl, asl, TEST_NE)
+ALL_LOOPS (ul_ul, aul, aul, TEST_NE)
+ALL_LOOPS (si_f, asi, af, TEST_LE)
+ALL_LOOPS (ui_f, aui, af, TEST_GT)
+ALL_LOOPS (sl_d, asl, ad, TEST_GE)
+ALL_LOOPS (ul_d, aul, ad, TEST_GT)
+
+/* { dg-final { scan-assembler-not "\tand\t" } } */
+/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */
+/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */
+/* { dg-final { scan-assembler "\tldr\tq" } } */
+/* { dg-final { scan-assembler "\tstr\tq" } } */