From cca1f7d9578dd927b97a9210aae404b0f6135cc1 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 10 Jun 2019 09:57:15 -0700
Subject: [PATCH] Minimize number of operations between two areas of memory
For op_by_pieces operations between two areas of memory, this patch adds
-fminimize-op-by-pieces-run to minimize number of operations. When
operating on LENGTH bytes of memory, it starts with the widest usable
integer size, MAX_SIZE, for LENGTH bytes and finishes with the smallest
usable integer size, MIN_SIZE, for the remaining bytes where MAX_SIZE
>= MIN_SIZE. If MIN_SIZE > the remaining bytes, the last operation is
performed on MIN_SIZE bytes of overlapping memory from the previous
operation.
Tested on Linux/x86-64 with both -fminimize-op-by-pieces-run enabled
and disabled by default.
gcc/
PR middl-end/90773
* common.opt (-fminimize-op-by-pieces-run): New.
* expr.c (op_by_pieces_d): Add get_usable_mode.
(op_by_pieces_d::get_usable_mode): New.
(op_by_pieces_d::run): Use get_usable_mode to get the largest
usable integer mode for -fminimize-op-by-pieces-run.
* doc/invoke.texi: Document -fminimize-op-by-pieces-run.
gcc/testsuite/
PR middl-end/90773
* gcc.target/i386/pr90773-1.c: New test.
* gcc.target/i386/pr90773-2.c: Likewise.
* gcc.target/i386/pr90773-3.c: Likewise.
* gcc.target/i386/pr90773-4.c: Likewise.
* gcc.target/i386/pr90773-5.c: Likewise.
---
gcc/common.opt | 4 +
gcc/doc/invoke.texi | 9 ++-
gcc/expr.c | 97 +++++++++++++++++------
gcc/testsuite/gcc.target/i386/pr90773-1.c | 17 ++++
gcc/testsuite/gcc.target/i386/pr90773-2.c | 20 +++++
gcc/testsuite/gcc.target/i386/pr90773-3.c | 23 ++++++
gcc/testsuite/gcc.target/i386/pr90773-4.c | 13 +++
gcc/testsuite/gcc.target/i386/pr90773-5.c | 13 +++
8 files changed, 171 insertions(+), 25 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-5.c
@@ -1934,6 +1934,10 @@ fmessage-length=
Common RejectNegative Joined UInteger
-fmessage-length=<number> Limit diagnostics to <number> characters per line. 0 suppresses line-wrapping.
+fminimize-op-by-pieces-run
+Common Report Var(flag_minimize_op_by_pieces_run) Optimization
+Minimize number of operations between two areas of memory.
+
fmodulo-sched
Common Report Var(flag_modulo_sched) Optimization
Perform SMS based modulo scheduling before the first scheduling pass.
@@ -431,8 +431,8 @@ Objective-C and Objective-C++ Dialects}.
-floop-block -floop-interchange -floop-strip-mine @gol
-floop-unroll-and-jam -floop-nest-optimize @gol
-floop-parallelize-all -flra-remat -flto -flto-compression-level @gol
--flto-partition=@var{alg} -fmerge-all-constants @gol
--fmerge-constants -fmodulo-sched -fmodulo-sched-allow-regmoves @gol
+-flto-partition=@var{alg} -fmerge-all-constants -fmerge-constants @gol
+-fminimize-op-by-pieces-run -fmodulo-sched -fmodulo-sched-allow-regmoves @gol
-fmove-loop-invariants -fno-branch-count-reg @gol
-fno-defer-pop -fno-fp-int-builtin-inexact -fno-function-cse @gol
-fno-guess-branch-probability -fno-inline -fno-math-errno -fno-peephole @gol
@@ -8610,6 +8610,11 @@ instances of the same variable in recursive calls, to have distinct locations,
so using this option results in non-conforming
behavior.
+@item -fminimize-op-by-pieces-run
+@opindex fminimize-op-by-pieces-run
+Minimize number of operations between two areas of memory by using
+the largest integer operation with ovelapping memory operations.
+
@item -fmodulo-sched
@opindex fmodulo-sched
Perform swing modulo scheduling immediately before the first scheduling
@@ -1015,6 +1015,9 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)
class op_by_pieces_d
{
+ private:
+ scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);
+
protected:
pieces_addr m_to, m_from;
unsigned HOST_WIDE_INT m_len;
@@ -1082,6 +1085,25 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
m_align = align;
}
+/* This function returns the largest usable integer mode for LEN bytes
+ whose size is no bigger than size of MODE. */
+
+scalar_int_mode
+op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
+{
+ unsigned int size;
+ do
+ {
+ size = GET_MODE_SIZE (mode);
+ if (len >= size && prepare_mode (mode, m_align))
+ break;
+ /* NB: widest_int_mode_for_size checks SIZE > 1. */
+ mode = widest_int_mode_for_size (size);
+ }
+ while (1);
+ return mode;
+}
+
/* This function contains the main loop used for expanding a block
operation. First move what we can in the largest integer mode,
then go to successively smaller modes. For every access, call
@@ -1090,42 +1112,71 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
void
op_by_pieces_d::run ()
{
- while (m_max_size > 1 && m_len > 0)
+ if (m_len == 0)
+ return;
+
+ /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */
+ scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+ mode = get_usable_mode (mode, m_len);
+
+ do
{
- scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+ unsigned int size = GET_MODE_SIZE (mode);
+ rtx to1 = NULL_RTX, from1;
- if (prepare_mode (mode, m_align))
+ while (m_len >= size)
{
- unsigned int size = GET_MODE_SIZE (mode);
- rtx to1 = NULL_RTX, from1;
+ if (m_reverse)
+ m_offset -= size;
- while (m_len >= size)
- {
- if (m_reverse)
- m_offset -= size;
+ to1 = m_to.adjust (mode, m_offset);
+ from1 = m_from.adjust (mode, m_offset);
- to1 = m_to.adjust (mode, m_offset);
- from1 = m_from.adjust (mode, m_offset);
+ m_to.maybe_predec (-(HOST_WIDE_INT)size);
+ m_from.maybe_predec (-(HOST_WIDE_INT)size);
- m_to.maybe_predec (-(HOST_WIDE_INT)size);
- m_from.maybe_predec (-(HOST_WIDE_INT)size);
+ generate (to1, from1, mode);
- generate (to1, from1, mode);
+ m_to.maybe_postinc (size);
+ m_from.maybe_postinc (size);
- m_to.maybe_postinc (size);
- m_from.maybe_postinc (size);
+ if (!m_reverse)
+ m_offset += size;
- if (!m_reverse)
- m_offset += size;
+ m_len -= size;
+ }
- m_len -= size;
- }
+ finish_mode (mode);
- finish_mode (mode);
- }
+ if (m_len == 0)
+ return;
- m_max_size = GET_MODE_SIZE (mode);
+ if (flag_minimize_op_by_pieces_run)
+ {
+ /* Get the smallest integer mode for M_LEN bytes. */
+ mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
+ mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
+ int gap = GET_MODE_SIZE (mode) - m_len;
+ if (gap > 0)
+ {
+ /* If size of MODE > M_LEN, generate the last operation
+ in MODE for the remaining bytes with ovelapping memory
+ from the previois operation. */
+ if (m_reverse)
+ m_offset += gap;
+ else
+ m_offset -= gap;
+ m_len += gap;
+ }
+ }
+ else
+ {
+ /* NB: widest_int_mode_for_size checks SIZE > 1. */
+ mode = widest_int_mode_for_size (size);
+ mode = get_usable_mode (mode, m_len);
+ }
}
+ while (1);
/* The code above should have handled everything. */
gcc_assert (!m_len);
new file mode 100644
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -fminimize-op-by-pieces-run" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 15);
+}
+
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -fminimize-op-by-pieces-run" } */
+/* { dg-additional-options "-msse2" { target { ! ia32 } } } */
+/* { dg-additional-options "-mno-sse" { target ia32 } } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 19);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+15\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+12\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+15\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
new file mode 100644
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -fminimize-op-by-pieces-run" } */
+/* { dg-additional-options "-msse2" { target { ! ia32 } } } */
+/* { dg-additional-options "-mno-sse" { target ia32 } } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 31);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movdqu\[\\t \]+15\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+12\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+16\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+20\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+24\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+27\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2 -mtune=generic -fminimize-op-by-pieces-run" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 0, 31);
+}
+
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, 15\\(%\[\^,\]+\\)" 1 } } */
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2 -mtune=generic -fminimize-op-by-pieces-run" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 0, 21);
+}
+
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\\$0+, 13\\(%\[\^,\]+\\)" 1 } } */
--
2.20.1
For op_by_pieces operations between two areas of memory, this patch adds -fminimize-op-by-pieces-run to minimize number of operations. When operating on LENGTH bytes of memory, it starts with the widest usable integer size, MAX_SIZE, for LENGTH bytes and finishes with the smallest usable integer size, MIN_SIZE, for the remaining bytes where MAX_SIZE >= MIN_SIZE. If MIN_SIZE > the remaining bytes, the last operation is performed on MIN_SIZE bytes of overlapping memory from the previous operation. Tested on Linux/x86-64 with both -fminimize-op-by-pieces-run enabled and disabled by default. I will submit a separate patch to enable -fminimize-op-by-pieces-run for -Os. OK for trunk? gcc/ PR middl-end/90773 * common.opt (-fminimize-op-by-pieces-run): New. * expr.c (op_by_pieces_d): Add get_usable_mode. (op_by_pieces_d::get_usable_mode): New. (op_by_pieces_d::run): Use get_usable_mode to get the largest usable integer mode for -fminimize-op-by-pieces-run. * doc/invoke.texi: Document -fminimize-op-by-pieces-run. gcc/testsuite/ PR middl-end/90773 * gcc.target/i386/pr90773-1.c: New test. * gcc.target/i386/pr90773-2.c: Likewise. * gcc.target/i386/pr90773-3.c: Likewise. * gcc.target/i386/pr90773-4.c: Likewise. * gcc.target/i386/pr90773-5.c: Likewise. Thanks.