diff mbox

Fix dr_explicit_realign vectorization (PR tree-optimization/65369)

Message ID 20150314090453.GJ1746@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek March 14, 2015, 9:04 a.m. UTC
Hi!

This issue is practically the same as PR63341, except in this case it is for
dr_explicit_realign rather than dr_explicit_realign_optimized, and the bump
isn't passed through multiple functions and thus is easier to fix.

Without the patch we use (dataptr & -16) for the first load and
((dataptr + 12) & -16) for the second load, which works just fine if the
elements are properly aligned (4 byte at least), but in this case we have
underaligned accesses (coming from folding of memcpy in this testcase, and
from 4 byte loads combined together recognized by bswap pass in the original
source), and so we really want to use ((dataptr + 15) & -16), otherwise
if we are unlucky we might read the same memory twice even when dataptr
is not 16 byte aligned.

Bootstrapped/regtested on
{x86_64,i686,aarch64,powerpc64{,le},s390{,x}}-linux, ok for trunk?

2015-03-14  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/65369
	* tree-vect-stmts.c (vectorizable_load) <case dr_explicit_realign>:
	Set bump to vs * TYPE_SIZE_UNIT (elem_type) - 1 instead of
	(vs - 1) * TYPE_SIZE_UNIT (elem_type).

	* gcc.c-torture/execute/pr65369.c: New test.


	Jakub

Comments

Richard Biener March 14, 2015, 9:49 a.m. UTC | #1
On March 14, 2015 10:04:53 AM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>Hi!
>
>This issue is practically the same as PR63341, except in this case it
>is for
>dr_explicit_realign rather than dr_explicit_realign_optimized, and the
>bump
>isn't passed through multiple functions and thus is easier to fix.
>
>Without the patch we use (dataptr & -16) for the first load and
>((dataptr + 12) & -16) for the second load, which works just fine if
>the
>elements are properly aligned (4 byte at least), but in this case we
>have
>underaligned accesses (coming from folding of memcpy in this testcase,
>and
>from 4 byte loads combined together recognized by bswap pass in the
>original
>source), and so we really want to use ((dataptr + 15) & -16), otherwise
>if we are unlucky we might read the same memory twice even when dataptr
>is not 16 byte aligned.
>
>Bootstrapped/regtested on
>{x86_64,i686,aarch64,powerpc64{,le},s390{,x}}-linux, ok for trunk?

OK.

Thanks,
Richard.

>2015-03-14  Jakub Jelinek  <jakub@redhat.com>
>
>	PR tree-optimization/65369
>	* tree-vect-stmts.c (vectorizable_load) <case dr_explicit_realign>:
>	Set bump to vs * TYPE_SIZE_UNIT (elem_type) - 1 instead of
>	(vs - 1) * TYPE_SIZE_UNIT (elem_type).
>
>	* gcc.c-torture/execute/pr65369.c: New test.
>
>--- gcc/tree-vect-stmts.c.jj	2015-03-09 08:05:13.000000000 +0100
>+++ gcc/tree-vect-stmts.c	2015-03-13 17:27:30.613529768 +0100
>@@ -6468,9 +6468,8 @@ vectorizable_load (gimple stmt, gimple_s
> 		case dr_explicit_realign:
> 		  {
> 		    tree ptr, bump;
>-		    tree vs_minus_1;
> 
>-		    vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
>+		    tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> 
> 		    if (compute_in_loop)
> 		      msq = vect_setup_realignment (first_stmt, gsi,
>@@ -6499,8 +6498,9 @@ vectorizable_load (gimple stmt, gimple_s
> 		    vect_finish_stmt_generation (stmt, new_stmt, gsi);
> 		    msq = new_temp;
> 
>-		    bump = size_binop (MULT_EXPR, vs_minus_1,
>+		    bump = size_binop (MULT_EXPR, vs,
> 				       TYPE_SIZE_UNIT (elem_type));
>+		    bump = size_binop (MINUS_EXPR, bump, size_one_node);
> 		    ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
> 		    new_stmt = gimple_build_assign
> 				 (NULL_TREE, BIT_AND_EXPR, ptr,
>--- gcc/testsuite/gcc.c-torture/execute/pr65369.c.jj	2015-03-13
>17:37:10.926175685 +0100
>+++ gcc/testsuite/gcc.c-torture/execute/pr65369.c	2015-03-13
>17:35:40.000000000 +0100
>@@ -0,0 +1,45 @@
>+/* PR tree-optimization/65369 */
>+
>+static const char data[] =
>+  "12345678901234567890123456789012345678901234567890"
>+  "123456789012345678901234567890";
>+
>+__attribute__ ((noinline))
>+static void foo (const unsigned int *buf)
>+{
>+  if (__builtin_memcmp (buf, data, 64))
>+    __builtin_abort ();
>+}
>+
>+__attribute__ ((noinline))
>+static void bar (const unsigned char *block)
>+{
>+  unsigned int buf[16];
>+  __builtin_memcpy (buf +  0, block +  0, 4);
>+  __builtin_memcpy (buf +  1, block +  4, 4);
>+  __builtin_memcpy (buf +  2, block +  8, 4);
>+  __builtin_memcpy (buf +  3, block + 12, 4);
>+  __builtin_memcpy (buf +  4, block + 16, 4);
>+  __builtin_memcpy (buf +  5, block + 20, 4);
>+  __builtin_memcpy (buf +  6, block + 24, 4);
>+  __builtin_memcpy (buf +  7, block + 28, 4);
>+  __builtin_memcpy (buf +  8, block + 32, 4);
>+  __builtin_memcpy (buf +  9, block + 36, 4);
>+  __builtin_memcpy (buf + 10, block + 40, 4);
>+  __builtin_memcpy (buf + 11, block + 44, 4);
>+  __builtin_memcpy (buf + 12, block + 48, 4);
>+  __builtin_memcpy (buf + 13, block + 52, 4);
>+  __builtin_memcpy (buf + 14, block + 56, 4);
>+  __builtin_memcpy (buf + 15, block + 60, 4);
>+  foo (buf);
>+}
>+
>+int
>+main ()
>+{
>+  unsigned char input[sizeof data + 16] __attribute__((aligned (16)));
>+  __builtin_memset (input, 0, sizeof input);
>+  __builtin_memcpy (input + 1, data, sizeof data);
>+  bar (input + 1);
>+  return 0;
>+}
>
>	Jakub
diff mbox

Patch

--- gcc/tree-vect-stmts.c.jj	2015-03-09 08:05:13.000000000 +0100
+++ gcc/tree-vect-stmts.c	2015-03-13 17:27:30.613529768 +0100
@@ -6468,9 +6468,8 @@  vectorizable_load (gimple stmt, gimple_s
 		case dr_explicit_realign:
 		  {
 		    tree ptr, bump;
-		    tree vs_minus_1;
 
-		    vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
+		    tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
 
 		    if (compute_in_loop)
 		      msq = vect_setup_realignment (first_stmt, gsi,
@@ -6499,8 +6498,9 @@  vectorizable_load (gimple stmt, gimple_s
 		    vect_finish_stmt_generation (stmt, new_stmt, gsi);
 		    msq = new_temp;
 
-		    bump = size_binop (MULT_EXPR, vs_minus_1,
+		    bump = size_binop (MULT_EXPR, vs,
 				       TYPE_SIZE_UNIT (elem_type));
+		    bump = size_binop (MINUS_EXPR, bump, size_one_node);
 		    ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
 		    new_stmt = gimple_build_assign
 				 (NULL_TREE, BIT_AND_EXPR, ptr,
--- gcc/testsuite/gcc.c-torture/execute/pr65369.c.jj	2015-03-13 17:37:10.926175685 +0100
+++ gcc/testsuite/gcc.c-torture/execute/pr65369.c	2015-03-13 17:35:40.000000000 +0100
@@ -0,0 +1,45 @@ 
+/* PR tree-optimization/65369 */
+
+static const char data[] =
+  "12345678901234567890123456789012345678901234567890"
+  "123456789012345678901234567890";
+
+__attribute__ ((noinline))
+static void foo (const unsigned int *buf)
+{
+  if (__builtin_memcmp (buf, data, 64))
+    __builtin_abort ();
+}
+
+__attribute__ ((noinline))
+static void bar (const unsigned char *block)
+{
+  unsigned int buf[16];
+  __builtin_memcpy (buf +  0, block +  0, 4);
+  __builtin_memcpy (buf +  1, block +  4, 4);
+  __builtin_memcpy (buf +  2, block +  8, 4);
+  __builtin_memcpy (buf +  3, block + 12, 4);
+  __builtin_memcpy (buf +  4, block + 16, 4);
+  __builtin_memcpy (buf +  5, block + 20, 4);
+  __builtin_memcpy (buf +  6, block + 24, 4);
+  __builtin_memcpy (buf +  7, block + 28, 4);
+  __builtin_memcpy (buf +  8, block + 32, 4);
+  __builtin_memcpy (buf +  9, block + 36, 4);
+  __builtin_memcpy (buf + 10, block + 40, 4);
+  __builtin_memcpy (buf + 11, block + 44, 4);
+  __builtin_memcpy (buf + 12, block + 48, 4);
+  __builtin_memcpy (buf + 13, block + 52, 4);
+  __builtin_memcpy (buf + 14, block + 56, 4);
+  __builtin_memcpy (buf + 15, block + 60, 4);
+  foo (buf);
+}
+
+int
+main ()
+{
+  unsigned char input[sizeof data + 16] __attribute__((aligned (16)));
+  __builtin_memset (input, 0, sizeof input);
+  __builtin_memcpy (input + 1, data, sizeof data);
+  bar (input + 1);
+  return 0;
+}