diff mbox

PATCH: middle-end/58981: movmem/setmem use mode wider than Pmode for size

Message ID 20131104103216.GA13798@lucon.org
State New
Headers show

Commit Message

H.J. Lu Nov. 4, 2013, 10:32 a.m. UTC
emit_block_move_via_movmem and set_storage_via_setmem have

  for (mode = GET_CLASS_NARROWEST_MODE (MODE_INT); mode != VOIDmode;
       mode = GET_MODE_WIDER_MODE (mode))
    {
      enum insn_code code = direct_optab_handler (movmem_optab, mode);

      if (code != CODE_FOR_nothing
          /* We don't need MODE to be narrower than BITS_PER_HOST_WIDE_INT
             here because if SIZE is less than the mode mask, as it is
             returned by the macro, it will definitely be less than the
             actual mode mask.  */
          && ((CONST_INT_P (size)
               && ((unsigned HOST_WIDE_INT) INTVAL (size)
                   <= (GET_MODE_MASK (mode) >> 1)))
              || GET_MODE_BITSIZE (mode) >= BITS_PER_WORD))
        {

Backend may assume mode of size in movmem and setmem expanders is no
widder than Pmode since size is within the Pmode address space.  X86
backend expand_set_or_movmem_prologue_epilogue_by_misaligned has

      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
                                      GEN_INT (prolog_size),
                                      NULL_RTX, 1, OPTAB_DIRECT);
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
                                      GEN_INT (-desired_align),
                                      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
                                       *destptr,
                                       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
        *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
                                        *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                    saveddest, *count, 1, OPTAB_DIRECT);

saveddest is a negative number in Pmode and *count is in word_mode.  For
x32, when Pmode is SImode and word_mode is DImode, saveddest + *count
leads to overflow.  We could fix it by using mode of saveddest to compute
saveddest + *count.  But it leads to extra conversions and other backends
may run into the same problem.  A better fix is to limit mode of size in
movmem and setmem expanders to Pmode.  It generates better and correct
memcpy and memset for x32.

There is also a typo in comments.  It should be BITS_PER_WORD, not
BITS_PER_HOST_WIDE_INT. 

Tested on x32.  OK to install?

Thanks.


H.J.
---
gcc/

2013-11-04  H.J. Lu  <hongjiu.lu@intel.com>

	PR middle-end/58981
	* expr.c (emit_block_move_via_movmem): Don't use mode wider than
	Pmode for size.
	(set_storage_via_setmem): Likewise.

gcc/testsuite/

2013-11-04  H.J. Lu  <hongjiu.lu@intel.com>

	PR middle-end/58981
	* gcc.dg/pr58981.c: New test.

Comments

Richard Sandiford Nov. 4, 2013, 11:11 a.m. UTC | #1
"H.J. Lu" <hongjiu.lu@intel.com> writes:
> emit_block_move_via_movmem and set_storage_via_setmem have
>
>   for (mode = GET_CLASS_NARROWEST_MODE (MODE_INT); mode != VOIDmode;
>        mode = GET_MODE_WIDER_MODE (mode))
>     {
>       enum insn_code code = direct_optab_handler (movmem_optab, mode);
>
>       if (code != CODE_FOR_nothing
>           /* We don't need MODE to be narrower than BITS_PER_HOST_WIDE_INT
>              here because if SIZE is less than the mode mask, as it is
>              returned by the macro, it will definitely be less than the
>              actual mode mask.  */
>           && ((CONST_INT_P (size)
>                && ((unsigned HOST_WIDE_INT) INTVAL (size)
>                    <= (GET_MODE_MASK (mode) >> 1)))
>               || GET_MODE_BITSIZE (mode) >= BITS_PER_WORD))
>         {
>
> Backend may assume mode of size in movmem and setmem expanders is no
> widder than Pmode since size is within the Pmode address space.  X86
> backend expand_set_or_movmem_prologue_epilogue_by_misaligned has
>
>       rtx saveddest = *destptr;
>
>       gcc_assert (desired_align <= size);
>       /* Align destptr up, place it to new register.  */
>       *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
>                                       GEN_INT (prolog_size),
>                                       NULL_RTX, 1, OPTAB_DIRECT);
>       *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
>                                       GEN_INT (-desired_align),
>                                       *destptr, 1, OPTAB_DIRECT);
>       /* See how many bytes we skipped.  */
>       saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
>                                        *destptr,
>                                        saveddest, 1, OPTAB_DIRECT);
>       /* Adjust srcptr and count.  */
>       if (!issetmem)
>         *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
>                                         *srcptr, 1, OPTAB_DIRECT);
>       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
>                                     saveddest, *count, 1, OPTAB_DIRECT);
>
> saveddest is a negative number in Pmode and *count is in word_mode.  For
> x32, when Pmode is SImode and word_mode is DImode, saveddest + *count
> leads to overflow.  We could fix it by using mode of saveddest to compute
> saveddest + *count.  But it leads to extra conversions and other backends
> may run into the same problem.  A better fix is to limit mode of size in
> movmem and setmem expanders to Pmode.  It generates better and correct
> memcpy and memset for x32.
>
> There is also a typo in comments.  It should be BITS_PER_WORD, not
> BITS_PER_HOST_WIDE_INT. 

I don't think it's a typo.  It's explaining why we don't have to worry about:

    GET_MODE_BITSIZE (mode) > BITS_PER_HOST_WIDE_INT

in the CONST_INT_P test (because in that case the GET_MODE_MASK macro
will be an all-1 HOST_WIDE_INT, even though that's narrower than the
real mask).

I don't think the current comment covers the BITS_PER_WORD test at all.
AIUI it's there because the pattern is defined as taking a length of
at most word_mode, so we should stop once we reach it.

FWIW, I agree Pmode makes more sense on face value.  But shouldn't
we replace the BITS_PER_WORD test instead of adding to it?  Having both
would only make a difference for Pmode > word_mode targets, which might
be able to handle full Pmode lengths.

Either way, the md.texi documentation should be updated too.

Thanks,
Richard
H.J. Lu Nov. 4, 2013, 11:34 a.m. UTC | #2
Y
On Mon, Nov 4, 2013 at 3:11 AM, Richard Sandiford
<rsandifo@linux.vnet.ibm.com> wrote:
> "H.J. Lu" <hongjiu.lu@intel.com> writes:
>> emit_block_move_via_movmem and set_storage_via_setmem have
>>
>>   for (mode = GET_CLASS_NARROWEST_MODE (MODE_INT); mode != VOIDmode;
>>        mode = GET_MODE_WIDER_MODE (mode))
>>     {
>>       enum insn_code code = direct_optab_handler (movmem_optab, mode);
>>
>>       if (code != CODE_FOR_nothing
>>           /* We don't need MODE to be narrower than BITS_PER_HOST_WIDE_INT
>>              here because if SIZE is less than the mode mask, as it is
>>              returned by the macro, it will definitely be less than the
>>              actual mode mask.  */
>>           && ((CONST_INT_P (size)
>>                && ((unsigned HOST_WIDE_INT) INTVAL (size)
>>                    <= (GET_MODE_MASK (mode) >> 1)))
>>               || GET_MODE_BITSIZE (mode) >= BITS_PER_WORD))
>>         {
>>
>> Backend may assume mode of size in movmem and setmem expanders is no
>> widder than Pmode since size is within the Pmode address space.  X86
>> backend expand_set_or_movmem_prologue_epilogue_by_misaligned has
>>
>>       rtx saveddest = *destptr;
>>
>>       gcc_assert (desired_align <= size);
>>       /* Align destptr up, place it to new register.  */
>>       *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
>>                                       GEN_INT (prolog_size),
>>                                       NULL_RTX, 1, OPTAB_DIRECT);
>>       *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
>>                                       GEN_INT (-desired_align),
>>                                       *destptr, 1, OPTAB_DIRECT);
>>       /* See how many bytes we skipped.  */
>>       saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
>>                                        *destptr,
>>                                        saveddest, 1, OPTAB_DIRECT);
>>       /* Adjust srcptr and count.  */
>>       if (!issetmem)
>>         *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
>>                                         *srcptr, 1, OPTAB_DIRECT);
>>       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
>>                                     saveddest, *count, 1, OPTAB_DIRECT);
>>
>> saveddest is a negative number in Pmode and *count is in word_mode.  For
>> x32, when Pmode is SImode and word_mode is DImode, saveddest + *count
>> leads to overflow.  We could fix it by using mode of saveddest to compute
>> saveddest + *count.  But it leads to extra conversions and other backends
>> may run into the same problem.  A better fix is to limit mode of size in
>> movmem and setmem expanders to Pmode.  It generates better and correct
>> memcpy and memset for x32.
>>
>> There is also a typo in comments.  It should be BITS_PER_WORD, not
>> BITS_PER_HOST_WIDE_INT.
>
> I don't think it's a typo.  It's explaining why we don't have to worry about:
>
>     GET_MODE_BITSIZE (mode) > BITS_PER_HOST_WIDE_INT
>
> in the CONST_INT_P test (because in that case the GET_MODE_MASK macro
> will be an all-1 HOST_WIDE_INT, even though that's narrower than the
> real mask).

Thanks for explanation.

> I don't think the current comment covers the BITS_PER_WORD test at all.
> AIUI it's there because the pattern is defined as taking a length of
> at most word_mode, so we should stop once we reach it.

I see.

> FWIW, I agree Pmode makes more sense on face value.  But shouldn't
> we replace the BITS_PER_WORD test instead of adding to it?  Having both
> would only make a difference for Pmode > word_mode targets, which might
> be able to handle full Pmode lengths.

Do we ever have a target with Pmode is wider than
word_mode? If not, we can check Pmode instead.

> Either way, the md.texi documentation should be updated too.
>
> Thanks,
> Richard
>
diff mbox

Patch

diff --git a/gcc/expr.c b/gcc/expr.c
index 551a660..1a869650 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1294,13 +1294,16 @@  emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align,
       enum insn_code code = direct_optab_handler (movmem_optab, mode);
 
       if (code != CODE_FOR_nothing
-	  /* We don't need MODE to be narrower than BITS_PER_HOST_WIDE_INT
-	     here because if SIZE is less than the mode mask, as it is
+	  /* We don't need MODE to be narrower than BITS_PER_WORD here
+	     because if SIZE is less than the mode mask, as it is
 	     returned by the macro, it will definitely be less than the
-	     actual mode mask.  */
+	     actual mode mask unless MODE is wider than Pmode.  Since
+	     SIZE is within the Pmode address space, we should use
+	     Pmode in this case.  */
 	  && ((CONST_INT_P (size)
 	       && ((unsigned HOST_WIDE_INT) INTVAL (size)
 		   <= (GET_MODE_MASK (mode) >> 1)))
+	      || GET_MODE_BITSIZE (mode) >= GET_MODE_BITSIZE (Pmode)
 	      || GET_MODE_BITSIZE (mode) >= BITS_PER_WORD))
 	{
 	  struct expand_operand ops[6];
@@ -2879,13 +2882,16 @@  set_storage_via_setmem (rtx object, rtx size, rtx val, unsigned int align,
       enum insn_code code = direct_optab_handler (setmem_optab, mode);
 
       if (code != CODE_FOR_nothing
-	  /* We don't need MODE to be narrower than
-	     BITS_PER_HOST_WIDE_INT here because if SIZE is less than
-	     the mode mask, as it is returned by the macro, it will
-	     definitely be less than the actual mode mask.  */
+	  /* We don't need MODE to be narrower than BITS_PER_WORD here
+	     because if SIZE is less than the mode mask, as it is
+	     returned by the macro, it will definitely be less than the
+	     actual mode mask unless MODE is wider than Pmode.  Since
+	     SIZE is within the Pmode address space, we should use
+	     Pmode in this case.  */
 	  && ((CONST_INT_P (size)
 	       && ((unsigned HOST_WIDE_INT) INTVAL (size)
 		   <= (GET_MODE_MASK (mode) >> 1)))
+	      || GET_MODE_BITSIZE (mode) >= GET_MODE_BITSIZE (Pmode)
 	      || GET_MODE_BITSIZE (mode) >= BITS_PER_WORD))
 	{
 	  struct expand_operand ops[6];
diff --git a/gcc/testsuite/gcc.dg/pr58981.c b/gcc/testsuite/gcc.dg/pr58981.c
new file mode 100644
index 0000000..1c8293e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr58981.c
@@ -0,0 +1,55 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-minline-all-stringops" { target { i?86-*-* x86_64-*-* } } } */
+
+extern void abort (void);
+
+#define MAX_OFFSET (sizeof (long long))
+#define MAX_COPY (8 * sizeof (long long))
+#define MAX_EXTRA (sizeof (long long))
+
+#define MAX_LENGTH (MAX_OFFSET + MAX_COPY + MAX_EXTRA)
+
+static union {
+  char buf[MAX_LENGTH];
+  long long align_int;
+  long double align_fp;
+} u;
+
+char A[MAX_LENGTH];
+
+int
+main ()
+{
+  int off, len, i;
+  char *p, *q;
+
+  for (i = 0; i < MAX_LENGTH; i++)
+    A[i] = 'A';
+
+  for (off = 0; off < MAX_OFFSET; off++)
+    for (len = 1; len < MAX_COPY; len++)
+      {
+	for (i = 0; i < MAX_LENGTH; i++)
+	  u.buf[i] = 'a';
+
+	p = __builtin_memcpy (u.buf + off, A, len);
+	if (p != u.buf + off)
+	  abort ();
+
+	q = u.buf;
+	for (i = 0; i < off; i++, q++)
+	  if (*q != 'a')
+	    abort ();
+
+	for (i = 0; i < len; i++, q++)
+	  if (*q != 'A')
+	    abort ();
+
+	for (i = 0; i < MAX_EXTRA; i++, q++)
+	  if (*q != 'a')
+	    abort ();
+      }
+
+  return 0;
+}