@@ -1334,15 +1334,56 @@ lower_vec_perm (gimple_stmt_iterator *gs
update_stmt (gsi_stmt (*gsi));
}
+/* Return the type in which the CODE operation with optab OP on
+ vectors of type TYPE can be computed. */
+
+static tree
+get_compute_type (enum tree_code code, optab op, tree type)
+{
+ /* For very wide vectors, try using a smaller vector mode. */
+ tree compute_type = type;
+ if (!VECTOR_MODE_P (TYPE_MODE (type)) && op)
+ {
+ tree vector_compute_type
+ = type_for_widest_vector_mode (TREE_TYPE (type), op);
+ if (vector_compute_type != NULL_TREE
+ && (TYPE_VECTOR_SUBPARTS (vector_compute_type)
+ < TYPE_VECTOR_SUBPARTS (compute_type))
+ && (optab_handler (op, TYPE_MODE (vector_compute_type))
+ != CODE_FOR_nothing))
+ compute_type = vector_compute_type;
+ }
+
+ /* If we are breaking a BLKmode vector into smaller pieces,
+ type_for_widest_vector_mode has already looked into the optab,
+ so skip these checks. */
+ if (compute_type == type)
+ {
+ enum machine_mode compute_mode = TYPE_MODE (compute_type);
+ if (VECTOR_MODE_P (compute_mode))
+ {
+ if (op && optab_handler (op, compute_mode) != CODE_FOR_nothing)
+ return compute_type;
+ if (code == MULT_HIGHPART_EXPR
+ && can_mult_highpart_p (compute_mode,
+ TYPE_UNSIGNED (compute_type)))
+ return compute_type;
+ }
+ /* There is no operation in hardware, so fall back to scalars. */
+ compute_type = TREE_TYPE (type);
+ }
+
+ return compute_type;
+}
+
/* Process one statement. If we identify a vector operation, expand it. */
static void
expand_vector_operations_1 (gimple_stmt_iterator *gsi)
{
gimple stmt = gsi_stmt (*gsi);
- tree lhs, rhs1, rhs2 = NULL, type, compute_type;
+ tree lhs, rhs1, rhs2 = NULL, type, compute_type = NULL_TREE;
enum tree_code code;
- enum machine_mode compute_mode;
optab op = unknown_optab;
enum gimple_rhs_class rhs_class;
tree new_rhs;
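For reference, the situation get_compute_type handles, sketched in
user-level C (an illustration only, not part of the patch; the typedef
name W16 is made up): a 16-element generic vector operation on a target
whose widest supported integer vector holds 4 elements is computed
piecewise in that narrower vector mode, and only falls back to one
scalar operation per element when no vector mode supports the operation
at all.

  typedef unsigned W16 __attribute__((vector_size (16 * sizeof (unsigned))));

  W16
  add16 (W16 x, W16 y)
  {
    /* With only a 128-bit vector add available this is lowered to four
       128-bit additions; with no vector add at all, to 16 scalar ones.  */
    return x + y;
  }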
@@ -1461,6 +1502,41 @@ expand_vector_operations_1 (gimple_stmt_
&& optab_handler (opv, TYPE_MODE (type)) != CODE_FOR_nothing)
return;
}
+
+ if (code == LROTATE_EXPR || code == RROTATE_EXPR)
+ {
+ compute_type = get_compute_type (code, op, type);
+ if (compute_type == type)
+ return;
+ /* Before splitting vector rotates into scalar rotates,
+ see if we can use vector shifts and BIT_IOR_EXPR
+ instead. For vector-by-vector rotates we would also need
+ to check BIT_AND_EXPR and NEGATE_EXPR; punt on those for
+ now, as fold doesn't seem to create such rotates anyway. */
+ if (compute_type == TREE_TYPE (type)
+ && !VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
+ {
+ optab oplv, opl, oprv, opr, opo;
+ oplv = optab_for_tree_code (LSHIFT_EXPR, type, optab_vector);
+ oprv = optab_for_tree_code (RSHIFT_EXPR, type, optab_vector);
+ opo = optab_for_tree_code (BIT_IOR_EXPR, type, optab_default);
+ opl = optab_for_tree_code (LSHIFT_EXPR, type, optab_scalar);
+ opr = optab_for_tree_code (RSHIFT_EXPR, type, optab_scalar);
+ if (optab_handler (oplv, TYPE_MODE (type)) != CODE_FOR_nothing
+ && optab_handler (opl, TYPE_MODE (type)) == CODE_FOR_nothing
+ && optab_handler (oprv, TYPE_MODE (type)) != CODE_FOR_nothing
+ && optab_handler (opr, TYPE_MODE (type)) == CODE_FOR_nothing)
+ {
+ opl = oplv;
+ opr = oprv;
+ }
+ compute_type = get_compute_type (LSHIFT_EXPR, opl, type);
+ if (compute_type == TREE_TYPE (type)
+ || compute_type != get_compute_type (RSHIFT_EXPR, opr, type)
+ || compute_type != get_compute_type (BIT_IOR_EXPR, opo, type))
+ compute_type = TREE_TYPE (type);
+ }
+ }
}
else
op = optab_for_tree_code (code, type, optab_default);
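At the source level the new rotate path corresponds to the following
(a sketch, not GCC's actual output): on a target with vector shifts but
no vector rotate, a V4SI rotate by 2 is kept as two vector shifts plus
a vector OR instead of being split into four scalar rotates.

  typedef unsigned V4 __attribute__((vector_size (4 * sizeof (unsigned))));

  V4
  rotl2 (V4 a)
  {
    /* fold recognizes this as a single LROTATE_EXPR by 2; the lowering
       above re-expands it as vector << 2, vector >> 30 and a vector OR.  */
    return (a << 2) | (a >> 30);
  }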
@@ -1473,38 +1549,10 @@ expand_vector_operations_1 (gimple_stmt_
&& INTEGRAL_TYPE_P (TREE_TYPE (type)))
op = optab_for_tree_code (MINUS_EXPR, type, optab_default);
- /* For very wide vectors, try using a smaller vector mode. */
- compute_type = type;
- if (!VECTOR_MODE_P (TYPE_MODE (type)) && op)
- {
- tree vector_compute_type
- = type_for_widest_vector_mode (TREE_TYPE (type), op);
- if (vector_compute_type != NULL_TREE
- && (TYPE_VECTOR_SUBPARTS (vector_compute_type)
- < TYPE_VECTOR_SUBPARTS (compute_type))
- && (optab_handler (op, TYPE_MODE (vector_compute_type))
- != CODE_FOR_nothing))
- compute_type = vector_compute_type;
- }
-
- /* If we are breaking a BLKmode vector into smaller pieces,
- type_for_widest_vector_mode has already looked into the optab,
- so skip these checks. */
+ if (compute_type == NULL_TREE)
+ compute_type = get_compute_type (code, op, type);
if (compute_type == type)
- {
- compute_mode = TYPE_MODE (compute_type);
- if (VECTOR_MODE_P (compute_mode))
- {
- if (op && optab_handler (op, compute_mode) != CODE_FOR_nothing)
- return;
- if (code == MULT_HIGHPART_EXPR
- && can_mult_highpart_p (compute_mode,
- TYPE_UNSIGNED (compute_type)))
- return;
- }
- /* There is no operation in hardware, so fall back to scalars. */
- compute_type = TREE_TYPE (type);
- }
+ return;
gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
new_rhs = expand_vector_operation (gsi, type, compute_type, stmt, code);
@@ -2128,9 +2128,12 @@ expand_shift_1 (enum tree_code code, enu
optab lrotate_optab = rotl_optab;
optab rrotate_optab = rotr_optab;
enum machine_mode op1_mode;
+ enum machine_mode scalar_mode = mode;
int attempt;
bool speed = optimize_insn_for_speed_p ();
+ if (VECTOR_MODE_P (mode))
+ scalar_mode = GET_MODE_INNER (mode);
op1 = amount;
op1_mode = GET_MODE (op1);
@@ -2153,9 +2156,9 @@ expand_shift_1 (enum tree_code code, enu
{
if (CONST_INT_P (op1)
&& ((unsigned HOST_WIDE_INT) INTVAL (op1) >=
- (unsigned HOST_WIDE_INT) GET_MODE_BITSIZE (mode)))
+ (unsigned HOST_WIDE_INT) GET_MODE_BITSIZE (scalar_mode)))
op1 = GEN_INT ((unsigned HOST_WIDE_INT) INTVAL (op1)
- % GET_MODE_BITSIZE (mode));
+ % GET_MODE_BITSIZE (scalar_mode));
else if (GET_CODE (op1) == SUBREG
&& subreg_lowpart_p (op1)
&& SCALAR_INT_MODE_P (GET_MODE (SUBREG_REG (op1)))
@@ -2169,10 +2172,10 @@ expand_shift_1 (enum tree_code code, enu
amount instead. */
if (rotate
&& CONST_INT_P (op1)
- && IN_RANGE (INTVAL (op1), GET_MODE_BITSIZE (mode) / 2 + left,
- GET_MODE_BITSIZE (mode) - 1))
+ && IN_RANGE (INTVAL (op1), GET_MODE_BITSIZE (scalar_mode) / 2 + left,
+ GET_MODE_BITSIZE (scalar_mode) - 1))
{
- op1 = GEN_INT (GET_MODE_BITSIZE (mode) - INTVAL (op1));
+ op1 = GEN_INT (GET_MODE_BITSIZE (scalar_mode) - INTVAL (op1));
left = !left;
code = left ? LROTATE_EXPR : RROTATE_EXPR;
}
@@ -2185,7 +2188,7 @@ expand_shift_1 (enum tree_code code, enu
if (code == LSHIFT_EXPR
&& CONST_INT_P (op1)
&& INTVAL (op1) > 0
- && INTVAL (op1) < GET_MODE_PRECISION (mode)
+ && INTVAL (op1) < GET_MODE_PRECISION (scalar_mode)
&& INTVAL (op1) < MAX_BITS_PER_WORD
&& (shift_cost (speed, mode, INTVAL (op1))
> INTVAL (op1) * add_cost (speed, mode))
@@ -2240,14 +2243,14 @@ expand_shift_1 (enum tree_code code, enu
if (op1 == const0_rtx)
return shifted;
else if (CONST_INT_P (op1))
- other_amount = GEN_INT (GET_MODE_BITSIZE (mode)
+ other_amount = GEN_INT (GET_MODE_BITSIZE (scalar_mode)
- INTVAL (op1));
else
{
other_amount
= simplify_gen_unary (NEG, GET_MODE (op1),
op1, GET_MODE (op1));
- HOST_WIDE_INT mask = GET_MODE_PRECISION (mode) - 1;
+ HOST_WIDE_INT mask = GET_MODE_PRECISION (scalar_mode) - 1;
other_amount
= simplify_gen_binary (AND, GET_MODE (op1), other_amount,
gen_int_mode (mask, GET_MODE (op1)));
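A worked example of the rotate canonicalization above (a scalar sketch;
the helper names rotl32/rotr32 are made up for illustration): with
32-bit elements a rotate left by 30 is rewritten as a rotate right by
32 - 30 = 2, so the element width, not the bit width of the whole
vector mode, has to be used when reducing or flipping the amount.

  #include <assert.h>

  static unsigned rotl32 (unsigned x, unsigned n) { return (x << n) | (x >> (32 - n)); }
  static unsigned rotr32 (unsigned x, unsigned n) { return (x >> n) | (x << (32 - n)); }

  int
  main (void)
  {
    assert (rotl32 (0xdeadbeefU, 30) == rotr32 (0xdeadbeefU, 2));
    return 0;
  }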
@@ -0,0 +1,171 @@
+/* PR tree-optimization/57233 */
+/* { dg-do run { target { ilp32 || lp64 } } } */
+/* { dg-options "-O2" } */
+
+typedef unsigned V4 __attribute__((vector_size(4 * sizeof (int))));
+typedef unsigned V8 __attribute__((vector_size(8 * sizeof (int))));
+typedef unsigned V16 __attribute__((vector_size(16 * sizeof (int))));
+V4 a, b, g;
+V8 c, d, h;
+V16 e, f, j;
+
+__attribute__((noinline)) void
+f1 (void)
+{
+ a = (a << 2) | (a >> 30);
+}
+
+__attribute__((noinline)) void
+f2 (void)
+{
+ a = (a << 30) | (a >> 2);
+}
+
+__attribute__((noinline)) void
+f3 (void)
+{
+ a = (a << b) | (a >> (32 - b));
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+ a = (a << x) | (a >> (32 - x));
+}
+
+__attribute__((noinline)) void
+f5 (void)
+{
+ c = (c << 2) | (c >> 30);
+}
+
+__attribute__((noinline)) void
+f6 (void)
+{
+ c = (c << 30) | (c >> 2);
+}
+
+__attribute__((noinline)) void
+f7 (void)
+{
+ c = (c << d) | (c >> (32 - d));
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+ c = (c << x) | (c >> (32 - x));
+}
+
+__attribute__((noinline)) void
+f9 (void)
+{
+ e = (e << 2) | (e >> 30);
+}
+
+__attribute__((noinline)) void
+f10 (void)
+{
+ e = (e << 30) | (e >> 2);
+}
+
+__attribute__((noinline)) void
+f11 (void)
+{
+ e = (e << f) | (e >> (32 - f));
+}
+
+__attribute__((noinline, noclone)) void
+f12 (int x)
+{
+ e = (e << x) | (e >> (32 - x));
+}
+
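+/* Simple xorshift-style pseudo-random number generator used to fill
+ the test vectors.  */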
+unsigned
+r (void)
+{
+ static unsigned x = 0xdeadbeefU;
+ static unsigned y = 0x12347654U;
+ static unsigned z = 0x1a2b3c4dU;
+ static unsigned w = 0x87654321U;
+ unsigned t = x ^ (x << 11);
+ x = y;
+ y = z;
+ z = w;
+ w = w ^ (w >> 19) ^ t ^ (t >> 8);
+ return w;
+}
+
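+/* Fill COUNT words at P with random values; if MOD is nonzero, restrict
+ them to 1..31 so they are valid per-element rotate counts.  */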
+void
+init (unsigned int *p, int count, int mod)
+{
+ int i;
+ for (i = 0; i < count; i++)
+ {
+ unsigned int v = r ();
+ if (mod)
+ v = (v % 31) + 1;
+ p[i] = v;
+ }
+}
+
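+/* Verify that each P[i] equals Q[i] rotated left by S[i] (or by SS when
+ S is a null pointer).  */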
+void
+check (unsigned int *p, unsigned int *q, int count, unsigned int *s, int ss)
+{
+ int i;
+ for (i = 0; i < count; i++)
+ {
+ if (s)
+ ss = s[i];
+ if (p[i] != ((q[i] << ss) | (q[i] >> (32 - ss))))
+ __builtin_abort ();
+ }
+}
+
+int
+main ()
+{
+ init ((unsigned int *) &a, 4, 0);
+ init ((unsigned int *) &b, 4, 1);
+ init ((unsigned int *) &c, 8, 0);
+ init ((unsigned int *) &d, 8, 1);
+ init ((unsigned int *) &e, 16, 0);
+ init ((unsigned int *) &f, 16, 1);
+ g = a;
+ h = c;
+ j = e;
+ f1 ();
+ f5 ();
+ f9 ();
+ check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 2);
+ check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 2);
+ check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 2);
+ g = a;
+ h = c;
+ j = e;
+ f2 ();
+ f6 ();
+ f10 ();
+ check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 30);
+ check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 30);
+ check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 30);
+ g = a;
+ h = c;
+ j = e;
+ f3 ();
+ f7 ();
+ f11 ();
+ check ((unsigned int *) &a, (unsigned int *) &g, 4, (unsigned int *) &b, 0);
+ check ((unsigned int *) &c, (unsigned int *) &h, 8, (unsigned int *) &d, 0);
+ check ((unsigned int *) &e, (unsigned int *) &j, 16, (unsigned int *) &f, 0);
+ g = a;
+ h = c;
+ j = e;
+ f4 (5);
+ f8 (5);
+ f12 (5);
+ check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 5);
+ check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 5);
+ check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 5);
+ return 0;
+}
@@ -0,0 +1,15 @@
+/* PR tree-optimization/57233 */
+/* { dg-do compile { target avx } } */
+/* { dg-options "-O2 -mavx -mno-xop" } */
+
+typedef unsigned V4 __attribute__((vector_size(4 * sizeof (int))));
+V4 a;
+
+__attribute__((noinline)) void
+foo (void)
+{
+ a = (a << 2) | (a >> 30);
+}
+
+/* { dg-final { scan-assembler "vpsrld\[^\n\r]*30" } } */
+/* { dg-final { scan-assembler "vpslld\[^\n\r]*2" } } */
@@ -0,0 +1,16 @@
+/* PR tree-optimization/57233 */
+/* { dg-do run { target sse2 } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "sse2-check.h"
+
+static void
+sse2_test (void)
+{
+ do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"
@@ -0,0 +1,16 @@
+/* PR tree-optimization/57233 */
+/* { dg-do run { target avx } } */
+/* { dg-options "-O2 -mavx" } */
+
+#include "avx-check.h"
+
+static void
+avx_test (void)
+{
+ do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"
@@ -0,0 +1,16 @@
+/* PR tree-optimization/57233 */
+/* { dg-do run { target avx2 } } */
+/* { dg-options "-O2 -mavx2" } */
+
+#include "avx2-check.h"
+
+static void
+avx2_test (void)
+{
+ do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"
@@ -0,0 +1,16 @@
+/* PR tree-optimization/57233 */
+/* { dg-do run { target avx512f } } */
+/* { dg-options "-O2 -mavx512f" } */
+
+#include "avx512f-check.h"
+
+static void
+avx512f_test (void)
+{
+ do_main ();
+}
+
+#undef main
+#define main() do_main ()
+
+#include "../../gcc.dg/pr57233.c"