mbox series

[v5,00/10] Allow TImode/OImode/XImode in op_by_pieces operations

Message ID 20210730213211.1832774-1-hjl.tools@gmail.com
Headers show
Series Allow TImode/OImode/XImode in op_by_pieces operations | expand

Message

H.J. Lu July 30, 2021, 9:32 p.m. UTC
Changes in the v6 patches:

1. No need to add TARGET_GEN_MEMSET_SCRATCH_RTX nor change the memset
expanders since they have been checked into master branch.

Changes in the v5 patches:

1. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
scratch register to avoid stack realignment when expanding memset.
2. Use vec_duplicate, instead of adding TARGET_READ_MEMSET_VALUE and
TARGET_GEN_MEMSET_VALUE, to expand memset if available.

Changes in the v4 patches:

1. Define x86 MAX_MOVE_MAX to 64, which is the constant maximum number
of bytes that a single instruction can move quickly between memory and
registers or between two memory locations.
2. Define x86 MOVE_MAX to MOVE_MAX_PIECES, which is the maximum number of
bytes we can move from memory to memory in one reasonably fast instruction.
The difference between MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX
must be a constant, independent of compiler options, since it is used in
reload.h to define struct target_reload and MOVE_MAX can vary, depending
on compiler options.

Changes in the v3 patches:

1. Split the TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE changes
into the generic part and the x86 part.


1. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
target instructions to duplicate QImode value to TImode/OImode/XImode
value for memset.
2. x86: Avoid stack realignment when copying data
3. x86: Remove MAX_BITSIZE_MODE_ANY_INT.  Only the x86 backend defines it.
4. x86: Use TImode/OImode/XImode integers for piecewise move and store.
5. x86: Add tests for TImode/OImode/XImode for piecewise move and store.
6. x86: Adjust existing tests.

On x86-64, SPEC CPU 2017 performance impact is neutral.  Glibc code size
differences with -O2 build are:

             Before         After
libc.so     1906572        1906444

Some code sequence differences in libc.so are:

<svcudp_bufcreate@GLIBC_2.2.5>:
	...
	jne    <svcudp_bufcreate@GLIBC_2.2.5+0x318>	      |		jne    <svcudp_bufcreate@GLIBC_2.2.5+0x2a8>
	test   %r15,%r15						test   %r15,%r15
	je     <svcudp_bufcreate@GLIBC_2.2.5+0x318>	      |		je     <svcudp_bufcreate@GLIBC_2.2.5+0x2a8>
	mov    %r13d,(%r14)						mov    %r13d,(%r14)
	lea    0x10(%r14),%rdi						lea    0x10(%r14),%rdi
	mov    $0x1,%ecx						mov    $0x1,%ecx
	mov    %r13d,%edx						mov    %r13d,%edx
	mov    %r15,0x40(%r12)						mov    %r15,0x40(%r12)
	mov    %r15,%rsi						mov    %r15,%rsi
	call   <xdrmem_create@GLIBC_2.2.5>				call   <xdrmem_create@GLIBC_2.2.5>
	lea    0xa2f9b(%rip),%rax        # <svcudp_op>	      |		lea    0xa2fab(%rip),%rax        # <svcudp_op>
	xor    %esi,%esi						xor    %esi,%esi
	mov    %ebp,%edi						mov    %ebp,%edi
	mov    %rax,0x8(%r12)						mov    %rax,0x8(%r12)
	movzwl 0x12(%rsp),%eax						movzwl 0x12(%rsp),%eax
	mov    $0x8,%edx				      <
	lea    0xc(%rsp),%rcx						lea    0xc(%rsp),%rcx
	mov    %r14,0x48(%r12)				      <
	add    $0x40,%r14				      <
	mov    $0x4,%r8d						mov    $0x4,%r8d
							      >		movq   $0x0,0x1d0(%r14)
							      >		mov    $0x8,%edx
	rol    $0x8,%ax							rol    $0x8,%ax
	mov    %ebp,(%r12)				      |		mov    %r14,0x48(%r12)
	movq   $0x0,0x190(%r14)				      |		add    $0x40,%r14
	mov    %ax,0x4(%r12)				      <
	mov    %r14,0x30(%r12)						mov    %r14,0x30(%r12)
							      >		mov    %ax,0x4(%r12)
							      >		mov    %ebp,(%r12)
	movl   $0x1,0xc(%rsp)						movl   $0x1,0xc(%rsp)
	call   <setsockopt>						call   <setsockopt>
	mov    %r12,%rdi						mov    %r12,%rdi
	movabs $0x101010101010101,%rdx			      <
	test   %eax,%eax						test   %eax,%eax
	mov    $0xff,%eax						mov    $0xff,%eax
	cmove  %eax,%ebx						cmove  %eax,%ebx
	movzbl %bl,%eax					      |		movd   %ebx,%xmm0
	mov    %ebx,0xc(%rsp)						mov    %ebx,0xc(%rsp)
	mov    %rax,%rsi				      |		punpcklbw %xmm0,%xmm0
	imul   %rdx,%rsi				      |		punpcklwd %xmm0,%xmm0
	mul    %rdx					      |		pshufd $0x0,%xmm0,%xmm0
	add    %rsi,%rdx				      |		movups %xmm0,0x50(%r12)
	mov    %rax,0x50(%r12)				      |		movups %xmm0,0x60(%r12)
	mov    %rdx,0x58(%r12)				      |		movups %xmm0,0x70(%r12)
	mov    %rax,0x60(%r12)				      |		movups %xmm0,0x80(%r12)
	mov    %rdx,0x68(%r12)				      |		movups %xmm0,0x90(%r12)
	mov    %rax,0x70(%r12)				      |		movups %xmm0,0xa0(%r12)
	mov    %rdx,0x78(%r12)				      |		movups %xmm0,0xb0(%r12)
	mov    %rax,0x80(%r12)				      |		movups %xmm0,0xc0(%r12)
	mov    %rdx,0x88(%r12)				      |		movups %xmm0,0xd0(%r12)
	mov    %rax,0x90(%r12)				      |		movups %xmm0,0xe0(%r12)
	mov    %rdx,0x98(%r12)				      |		movups %xmm0,0xf0(%r12)
	mov    %rax,0xa0(%r12)				      |		movups %xmm0,0x100(%r12)
	mov    %rdx,0xa8(%r12)				      |		movups %xmm0,0x110(%r12)
	mov    %rax,0xb0(%r12)				      |		movups %xmm0,0x120(%r12)
	mov    %rdx,0xb8(%r12)				      |		movups %xmm0,0x130(%r12)
	mov    %rax,0xc0(%r12)				      |		movups %xmm0,0x140(%r12)
	mov    %rdx,0xc8(%r12)				      <
	mov    %rax,0xd0(%r12)				      <
	mov    %rdx,0xd8(%r12)				      <
	mov    %rax,0xe0(%r12)				      <
	mov    %rdx,0xe8(%r12)				      <
	mov    %rax,0xf0(%r12)				      <
	mov    %rdx,0xf8(%r12)				      <
	mov    %rax,0x100(%r12)				      <
	mov    %rdx,0x108(%r12)				      <
	mov    %rax,0x110(%r12)				      <
	mov    %rdx,0x118(%r12)				      <
	mov    %rax,0x120(%r12)				      <
	mov    %rdx,0x128(%r12)				      <
	mov    %rax,0x130(%r12)				      <
	mov    %rdx,0x138(%r12)				      <
	mov    %rax,0x140(%r12)				      <
	mov    %rdx,0x148(%r12)				      <
	call   <xprt_register@GLIBC_2.2.5>				call   <xprt_register@GLIBC_2.2.5>
	add    $0x28,%rsp						add    $0x28,%rsp
	mov    %r12,%rax						mov    %r12,%rax
	pop    %rbx							pop    %rbx
	pop    %rbp							pop    %rbp
	pop    %r12							pop    %r12
	pop    %r13							pop    %r13
	pop    %r14							pop    %r14
	pop    %r15							pop    %r15
	ret    								ret    

H.J. Lu (10):
  x86: Add TARGET_GEN_MEMSET_SCRATCH_RTX
  x86: Avoid stack realignment when copying data
  x86: Update piecewise move and store
  x86: Add AVX2 tests for PR middle-end/90773
  x86: Add tests for piecewise move and store
  x86: Also pass -mno-avx to pr72839.c
  x86: Also pass -mno-avx to cold-attribute-1.c
  x86: Also pass -mno-avx to sw-1.c for ia32
  x86: Update gcc.target/i386/incoming-11.c
  x86: Also pass -mno-sse to vect8-ret.c

 gcc/config/i386/i386-expand.c                 |  4 +-
 gcc/config/i386/i386.c                        | 27 +++++++++++--
 gcc/config/i386/i386.h                        | 40 +++++++++++++++----
 .../gcc.target/i386/cold-attribute-1.c        |  2 +-
 gcc/testsuite/gcc.target/i386/eh_return-1.c   | 26 ++++++++++++
 gcc/testsuite/gcc.target/i386/incoming-11.c   |  2 +-
 .../gcc.target/i386/pieces-memcpy-10.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memcpy-11.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memcpy-12.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memcpy-13.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memcpy-14.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memcpy-15.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memcpy-16.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memcpy-7.c         | 15 +++++++
 .../gcc.target/i386/pieces-memcpy-8.c         | 14 +++++++
 .../gcc.target/i386/pieces-memcpy-9.c         | 14 +++++++
 .../gcc.target/i386/pieces-memset-1.c         | 16 ++++++++
 .../gcc.target/i386/pieces-memset-10.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-11.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-12.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-13.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-14.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-15.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-16.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-17.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-18.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-19.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-2.c         | 12 ++++++
 .../gcc.target/i386/pieces-memset-20.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-21.c        | 18 +++++++++
 .../gcc.target/i386/pieces-memset-22.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-23.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-24.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-25.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-26.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-27.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-28.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-29.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-3.c         | 18 +++++++++
 .../gcc.target/i386/pieces-memset-30.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-31.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-32.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-33.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-34.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-35.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-36.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-37.c        | 15 +++++++
 .../gcc.target/i386/pieces-memset-38.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-39.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-4.c         | 16 ++++++++
 .../gcc.target/i386/pieces-memset-40.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-41.c        | 16 ++++++++
 .../gcc.target/i386/pieces-memset-42.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-43.c        | 17 ++++++++
 .../gcc.target/i386/pieces-memset-44.c        | 18 +++++++++
 .../gcc.target/i386/pieces-memset-5.c         | 12 ++++++
 .../gcc.target/i386/pieces-memset-6.c         | 16 ++++++++
 .../gcc.target/i386/pieces-memset-7.c         | 16 ++++++++
 .../gcc.target/i386/pieces-memset-8.c         | 16 ++++++++
 .../gcc.target/i386/pieces-memset-9.c         | 16 ++++++++
 gcc/testsuite/gcc.target/i386/pr100865-1.c    |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-10a.c  |  4 +-
 gcc/testsuite/gcc.target/i386/pr100865-10b.c  |  4 +-
 gcc/testsuite/gcc.target/i386/pr100865-2.c    |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-3.c    |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-4a.c   |  6 +--
 gcc/testsuite/gcc.target/i386/pr100865-4b.c   |  8 ++--
 gcc/testsuite/gcc.target/i386/pr72839.c       |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-1.c     | 10 ++---
 gcc/testsuite/gcc.target/i386/pr90773-14.c    |  4 +-
 gcc/testsuite/gcc.target/i386/pr90773-15.c    | 14 +++++++
 gcc/testsuite/gcc.target/i386/pr90773-16.c    | 14 +++++++
 gcc/testsuite/gcc.target/i386/pr90773-17.c    | 14 +++++++
 gcc/testsuite/gcc.target/i386/pr90773-18.c    | 15 +++++++
 gcc/testsuite/gcc.target/i386/pr90773-19.c    | 14 +++++++
 gcc/testsuite/gcc.target/i386/pr90773-20.c    | 13 ++++++
 gcc/testsuite/gcc.target/i386/pr90773-21.c    | 13 ++++++
 gcc/testsuite/gcc.target/i386/pr90773-22.c    | 13 ++++++
 gcc/testsuite/gcc.target/i386/pr90773-23.c    | 13 ++++++
 gcc/testsuite/gcc.target/i386/pr90773-24.c    |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-25.c    |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-26.c    | 21 ++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-4.c     |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-5.c     |  2 +-
 gcc/testsuite/gcc.target/i386/sw-1.c          |  1 +
 gcc/testsuite/gcc.target/i386/vect8-ret.c     |  2 +-
 86 files changed, 1135 insertions(+), 44 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/eh_return-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-44.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-26.c