Message ID | ZD5Z3iovGmuMQ3C9@tucnak |
---|---|
State | New |
Headers | show |
Series | match.pd: Improve fneg/fadd optimization [PR109240] | expand |
On Tue, 18 Apr 2023, Jakub Jelinek wrote: > Hi! > > match.pd has mostly for AArch64 an optimization in which it optimizes > certain forms of __builtin_shuffle of x + y and x - y vectors into > fneg using twice as wide element type so that every other sign is changed, > followed by fadd. > > The following patch extends that optimization, so that it can handle > other forms as well, using the same fneg but fsub instead of fadd. > > As the plus is commutative and minus is not and I want to handle > vec_perm with plus minus and minus plus order preferably in one > pattern, I had to do the matching operand checks by hand. > > Bootstrapped/regtested on aarch64-linux, x86_64-linux and i686-linux, > ok for trunk? OK. Thanks, Richard. > 2023-04-18 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/109240 > * match.pd (fneg/fadd): Rewrite such that it handles both plus as > first vec_perm operand and minus as second using fneg/fadd and > minus as first vec_perm operand and plus as second using fneg/fsub. > > * gcc.target/aarch64/simd/addsub_2.c: New test. > * gcc.target/aarch64/sve/addsub_2.c: New test. > > --- gcc/match.pd.jj 2023-03-21 19:59:40.209634256 +0100 > +++ gcc/match.pd 2023-03-22 10:17:25.344772636 +0100 > @@ -8074,63 +8074,76 @@ and, > under IEEE 754 the fneg of the wider type will negate every even entry > and when doing an add we get a sub of the even and add of every odd > elements. */ > -(simplify > - (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2) > - (if (!VECTOR_INTEGER_TYPE_P (type) > - && !FLOAT_WORDS_BIG_ENDIAN) > - (with > - { > - /* Build a vector of integers from the tree mask. */ > - vec_perm_builder builder; > - } > - (if (tree_to_vec_perm_builder (&builder, @2)) > - (with > - { > - /* Create a vec_perm_indices for the integer vector. 
*/ > - poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); > - vec_perm_indices sel (builder, 2, nelts); > - machine_mode vec_mode = TYPE_MODE (type); > - machine_mode wide_mode; > - scalar_mode wide_elt_mode; > - poly_uint64 wide_nunits; > - scalar_mode inner_mode = GET_MODE_INNER (vec_mode); > - } > - (if (sel.series_p (0, 2, 0, 2) > - && sel.series_p (1, 2, nelts + 1, 2) > - && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode) > - && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits) > - && related_vector_mode (vec_mode, wide_elt_mode, > - wide_nunits).exists (&wide_mode)) > - (with > - { > - tree stype > - = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode), > - TYPE_UNSIGNED (type)); > - tree ntype = build_vector_type_for_mode (stype, wide_mode); > +(for plusminus (plus minus) > + minusplus (minus plus) > + (simplify > + (vec_perm (plusminus @0 @1) (minusplus @2 @3) VECTOR_CST@4) > + (if (!VECTOR_INTEGER_TYPE_P (type) > + && !FLOAT_WORDS_BIG_ENDIAN > + /* plus is commutative, while minus is not, so :c can't be used. > + Do equality comparisons by hand and at the end pick the operands > + from the minus. */ > + && (operand_equal_p (@0, @2, 0) > + ? operand_equal_p (@1, @3, 0) > + : operand_equal_p (@0, @3, 0) && operand_equal_p (@1, @2, 0))) > + (with > + { > + /* Build a vector of integers from the tree mask. */ > + vec_perm_builder builder; > + } > + (if (tree_to_vec_perm_builder (&builder, @4)) > + (with > + { > + /* Create a vec_perm_indices for the integer vector. 
*/ > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); > + vec_perm_indices sel (builder, 2, nelts); > + machine_mode vec_mode = TYPE_MODE (type); > + machine_mode wide_mode; > + scalar_mode wide_elt_mode; > + poly_uint64 wide_nunits; > + scalar_mode inner_mode = GET_MODE_INNER (vec_mode); > + } > + (if (sel.series_p (0, 2, 0, 2) > + && sel.series_p (1, 2, nelts + 1, 2) > + && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode) > + && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits) > + && related_vector_mode (vec_mode, wide_elt_mode, > + wide_nunits).exists (&wide_mode)) > + (with > + { > + tree stype > + = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode), > + TYPE_UNSIGNED (type)); > + tree ntype = build_vector_type_for_mode (stype, wide_mode); > > - /* The format has to be a non-extended ieee format. */ > - const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode); > - const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode); > - } > - (if (TYPE_MODE (stype) != BLKmode > - && VECTOR_TYPE_P (ntype) > - && fmt_old != NULL > - && fmt_new != NULL) > - (with > - { > - /* If the target doesn't support v1xx vectors, try using > - scalar mode xx instead. */ > + /* The format has to be a non-extended ieee format. */ > + const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode); > + const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode); > + } > + (if (TYPE_MODE (stype) != BLKmode > + && VECTOR_TYPE_P (ntype) > + && fmt_old != NULL > + && fmt_new != NULL) > + (with > + { > + /* If the target doesn't support v1xx vectors, try using > + scalar mode xx instead. 
*/ > if (known_eq (GET_MODE_NUNITS (wide_mode), 1) > && !target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)) > ntype = stype; > - } > - (if (fmt_new->signbit_rw > - == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode) > - && fmt_new->signbit_rw == fmt_new->signbit_ro > - && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS) > - && ((optimize_vectors_before_lowering_p () && VECTOR_TYPE_P (ntype)) > - || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))) > - (plus (view_convert:type (negate (view_convert:ntype @1))) @0))))))))))) > + } > + (if (fmt_new->signbit_rw > + == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode) > + && fmt_new->signbit_rw == fmt_new->signbit_ro > + && targetm.can_change_mode_class (TYPE_MODE (ntype), > + TYPE_MODE (type), ALL_REGS) > + && ((optimize_vectors_before_lowering_p () > + && VECTOR_TYPE_P (ntype)) > + || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))) > + (if (plusminus == PLUS_EXPR) > + (plus (view_convert:type (negate (view_convert:ntype @3))) @2) > + (minus @0 (view_convert:type > + (negate (view_convert:ntype @1)))))))))))))))) > > (simplify > (vec_perm @0 @1 VECTOR_CST@2) > --- gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c.jj 2023-03-22 10:22:57.324017790 +0100 > +++ gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c 2023-03-22 10:23:54.482199126 +0100 > @@ -0,0 +1,56 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */ > +/* { dg-options "-Ofast" } */ > +/* { dg-add-options arm_v8_2a_fp16_neon } */ > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ > + > +#pragma GCC target "+nosve" > + > +/* > +** f1: > +** ... > +** fneg v[0-9]+.2d, v[0-9]+.2d > +** fsub v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** ... 
> +*/ > +void f1 (float *restrict a, float *restrict b, float *res, int n) > +{ > + for (int i = 0; i < (n & -4); i+=2) > + { > + res[i+0] = a[i+0] - b[i+0]; > + res[i+1] = a[i+1] + b[i+1]; > + } > +} > + > +/* > +** d1: > +** ... > +** fneg v[0-9]+.4s, v[0-9]+.4s > +** fsub v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h > +** ... > +*/ > +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n) > +{ > + for (int i = 0; i < (n & -8); i+=2) > + { > + res[i+0] = a[i+0] - b[i+0]; > + res[i+1] = a[i+1] + b[i+1]; > + } > +} > + > +/* > +** e1: > +** ... > +** fsub v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d > +** fadd v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d > +** ins v[0-9]+.d\[1\], v[0-9]+.d\[1\] > +** ... > +*/ > +void e1 (double *restrict a, double *restrict b, double *res, int n) > +{ > + for (int i = 0; i < (n & -4); i+=2) > + { > + res[i+0] = a[i+0] - b[i+0]; > + res[i+1] = a[i+1] + b[i+1]; > + } > +} > --- gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c.jj 2023-03-22 10:24:14.169917153 +0100 > +++ gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c 2023-03-22 10:25:05.414183194 +0100 > @@ -0,0 +1,52 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast" } */ > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ > + > +/* > +** f1: > +** ... > +** fneg z[0-9]+.d, p[0-9]+/m, z[0-9]+.d > +** fsub z[0-9]+.s, z[0-9]+.s, z[0-9]+.s > +** ... > +*/ > +void f1 (float *restrict a, float *restrict b, float *res, int n) > +{ > + for (int i = 0; i < (n & -4); i+=2) > + { > + res[i+0] = a[i+0] - b[i+0]; > + res[i+1] = a[i+1] + b[i+1]; > + } > +} > + > +/* > +** d1: > +** ... > +** fneg z[0-9]+.s, p[0-9]+/m, z[0-9]+.s > +** fsub z[0-9]+.h, z[0-9]+.h, z[0-9]+.h > +** ... > +*/ > +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n) > +{ > + for (int i = 0; i < (n & -8); i+=2) > + { > + res[i+0] = a[i+0] - b[i+0]; > + res[i+1] = a[i+1] + b[i+1]; > + } > +} > + > +/* > +** e1: > +** ... 
> +** fadd z[0-9]+.d, z[0-9]+.d, z[0-9]+.d > +** movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d > +** fsub z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d > +** ... > +*/ > +void e1 (double *restrict a, double *restrict b, double *res, int n) > +{ > + for (int i = 0; i < (n & -4); i+=2) > + { > + res[i+0] = a[i+0] - b[i+0]; > + res[i+1] = a[i+1] + b[i+1]; > + } > +} > > Jakub > >
--- gcc/match.pd.jj 2023-03-21 19:59:40.209634256 +0100 +++ gcc/match.pd 2023-03-22 10:17:25.344772636 +0100 @@ -8074,63 +8074,76 @@ and, under IEEE 754 the fneg of the wider type will negate every even entry and when doing an add we get a sub of the even and add of every odd elements. */ -(simplify - (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2) - (if (!VECTOR_INTEGER_TYPE_P (type) - && !FLOAT_WORDS_BIG_ENDIAN) - (with - { - /* Build a vector of integers from the tree mask. */ - vec_perm_builder builder; - } - (if (tree_to_vec_perm_builder (&builder, @2)) - (with - { - /* Create a vec_perm_indices for the integer vector. */ - poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); - vec_perm_indices sel (builder, 2, nelts); - machine_mode vec_mode = TYPE_MODE (type); - machine_mode wide_mode; - scalar_mode wide_elt_mode; - poly_uint64 wide_nunits; - scalar_mode inner_mode = GET_MODE_INNER (vec_mode); - } - (if (sel.series_p (0, 2, 0, 2) - && sel.series_p (1, 2, nelts + 1, 2) - && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode) - && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits) - && related_vector_mode (vec_mode, wide_elt_mode, - wide_nunits).exists (&wide_mode)) - (with - { - tree stype - = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode), - TYPE_UNSIGNED (type)); - tree ntype = build_vector_type_for_mode (stype, wide_mode); +(for plusminus (plus minus) + minusplus (minus plus) + (simplify + (vec_perm (plusminus @0 @1) (minusplus @2 @3) VECTOR_CST@4) + (if (!VECTOR_INTEGER_TYPE_P (type) + && !FLOAT_WORDS_BIG_ENDIAN + /* plus is commutative, while minus is not, so :c can't be used. + Do equality comparisons by hand and at the end pick the operands + from the minus. */ + && (operand_equal_p (@0, @2, 0) + ? operand_equal_p (@1, @3, 0) + : operand_equal_p (@0, @3, 0) && operand_equal_p (@1, @2, 0))) + (with + { + /* Build a vector of integers from the tree mask. 
*/ + vec_perm_builder builder; + } + (if (tree_to_vec_perm_builder (&builder, @4)) + (with + { + /* Create a vec_perm_indices for the integer vector. */ + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); + vec_perm_indices sel (builder, 2, nelts); + machine_mode vec_mode = TYPE_MODE (type); + machine_mode wide_mode; + scalar_mode wide_elt_mode; + poly_uint64 wide_nunits; + scalar_mode inner_mode = GET_MODE_INNER (vec_mode); + } + (if (sel.series_p (0, 2, 0, 2) + && sel.series_p (1, 2, nelts + 1, 2) + && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode) + && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits) + && related_vector_mode (vec_mode, wide_elt_mode, + wide_nunits).exists (&wide_mode)) + (with + { + tree stype + = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode), + TYPE_UNSIGNED (type)); + tree ntype = build_vector_type_for_mode (stype, wide_mode); - /* The format has to be a non-extended ieee format. */ - const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode); - const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode); - } - (if (TYPE_MODE (stype) != BLKmode - && VECTOR_TYPE_P (ntype) - && fmt_old != NULL - && fmt_new != NULL) - (with - { - /* If the target doesn't support v1xx vectors, try using - scalar mode xx instead. */ + /* The format has to be a non-extended ieee format. */ + const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode); + const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode); + } + (if (TYPE_MODE (stype) != BLKmode + && VECTOR_TYPE_P (ntype) + && fmt_old != NULL + && fmt_new != NULL) + (with + { + /* If the target doesn't support v1xx vectors, try using + scalar mode xx instead. 
*/ if (known_eq (GET_MODE_NUNITS (wide_mode), 1) && !target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)) ntype = stype; - } - (if (fmt_new->signbit_rw - == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode) - && fmt_new->signbit_rw == fmt_new->signbit_ro - && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS) - && ((optimize_vectors_before_lowering_p () && VECTOR_TYPE_P (ntype)) - || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))) - (plus (view_convert:type (negate (view_convert:ntype @1))) @0))))))))))) + } + (if (fmt_new->signbit_rw + == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode) + && fmt_new->signbit_rw == fmt_new->signbit_ro + && targetm.can_change_mode_class (TYPE_MODE (ntype), + TYPE_MODE (type), ALL_REGS) + && ((optimize_vectors_before_lowering_p () + && VECTOR_TYPE_P (ntype)) + || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))) + (if (plusminus == PLUS_EXPR) + (plus (view_convert:type (negate (view_convert:ntype @3))) @2) + (minus @0 (view_convert:type + (negate (view_convert:ntype @1)))))))))))))))) (simplify (vec_perm @0 @1 VECTOR_CST@2) --- gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c.jj 2023-03-22 10:22:57.324017790 +0100 +++ gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c 2023-03-22 10:23:54.482199126 +0100 @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */ +/* { dg-options "-Ofast" } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#pragma GCC target "+nosve" + +/* +** f1: +** ... +** fneg v[0-9]+.2d, v[0-9]+.2d +** fsub v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** ... +*/ +void f1 (float *restrict a, float *restrict b, float *res, int n) +{ + for (int i = 0; i < (n & -4); i+=2) + { + res[i+0] = a[i+0] - b[i+0]; + res[i+1] = a[i+1] + b[i+1]; + } +} + +/* +** d1: +** ... +** fneg v[0-9]+.4s, v[0-9]+.4s +** fsub v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** ... 
+*/ +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n) +{ + for (int i = 0; i < (n & -8); i+=2) + { + res[i+0] = a[i+0] - b[i+0]; + res[i+1] = a[i+1] + b[i+1]; + } +} + +/* +** e1: +** ... +** fsub v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** fadd v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** ins v[0-9]+.d\[1\], v[0-9]+.d\[1\] +** ... +*/ +void e1 (double *restrict a, double *restrict b, double *res, int n) +{ + for (int i = 0; i < (n & -4); i+=2) + { + res[i+0] = a[i+0] - b[i+0]; + res[i+1] = a[i+1] + b[i+1]; + } +} --- gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c.jj 2023-03-22 10:24:14.169917153 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c 2023-03-22 10:25:05.414183194 +0100 @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +/* +** f1: +** ... +** fneg z[0-9]+.d, p[0-9]+/m, z[0-9]+.d +** fsub z[0-9]+.s, z[0-9]+.s, z[0-9]+.s +** ... +*/ +void f1 (float *restrict a, float *restrict b, float *res, int n) +{ + for (int i = 0; i < (n & -4); i+=2) + { + res[i+0] = a[i+0] - b[i+0]; + res[i+1] = a[i+1] + b[i+1]; + } +} + +/* +** d1: +** ... +** fneg z[0-9]+.s, p[0-9]+/m, z[0-9]+.s +** fsub z[0-9]+.h, z[0-9]+.h, z[0-9]+.h +** ... +*/ +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n) +{ + for (int i = 0; i < (n & -8); i+=2) + { + res[i+0] = a[i+0] - b[i+0]; + res[i+1] = a[i+1] + b[i+1]; + } +} + +/* +** e1: +** ... +** fadd z[0-9]+.d, z[0-9]+.d, z[0-9]+.d +** movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d +** fsub z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d +** ... +*/ +void e1 (double *restrict a, double *restrict b, double *res, int n) +{ + for (int i = 0; i < (n & -4); i+=2) + { + res[i+0] = a[i+0] - b[i+0]; + res[i+1] = a[i+1] + b[i+1]; + } +}