
[RFC,i386] : Allow zero_extended addresses (+ problems with reload and offsetable address, "o" constraint)

Message ID CAFULd4Y5mie9zBP5TbvFpuDxSwbKpp=n3sh8iyr3UpSfiqiYxQ@mail.gmail.com
State New

Commit Message

Uros Bizjak Aug. 8, 2011, 5:11 p.m. UTC
On Mon, Aug 8, 2011 at 5:30 PM, Ulrich Weigand <uweigand@de.ibm.com> wrote:
> Uros Bizjak wrote:
>
>> Although, it would be nice for reload to subsequently fix CSE'd
>> non-offsetable memory by copying address to temporary reg (*as said in
>> the documentation*), we could simply require an XMM temporary for
>> TImode reloads to/from integer registers, and this fixes ICE for x32.
>
> Moves are special as far as reload is concerned.  If there is already
> a move instruction present *before* reload, it will get fixed up
> according to its constraints as any other instruction.
>
> However, reload will *introduce* new moves as part of its operation,
> and those will *not* themselves get reloaded.  Instead, reload simply
> assumes that every plain move will just succeed without requiring
> any reload; if this is not true, the target *must* provide a
> secondary reload for this move.
>
> (Note that the secondary reload could also work by reloading the
> target address into a temporary; that's up to the target to
> implement.)

Whoa, indeed.
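
To make the shape of such a secondary reload concrete, here is a
minimal sketch; the names example_secondary_reload and
CODE_FOR_reload_example_load/store are placeholders rather than real
GCC symbols, and the actual i386 implementation is in the patch below:

static reg_class_t
example_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
                          enum machine_mode mode,
                          secondary_reload_info *sri)
{
  /* A multi-word move through a non-offsettable memory reference
     cannot be split into word-sized moves, so do not let reload
     assume a plain move will succeed; hand it an expander instead.  */
  if (MEM_P (x)
      && GET_MODE_SIZE (mode) > UNITS_PER_WORD
      && rclass == GENERAL_REGS
      && !offsettable_memref_p (x))
    {
      sri->icode = (in_p
                    ? CODE_FOR_reload_example_load
                    : CODE_FOR_reload_example_store);
      sri->extra_cost = 1;   /* one extra move for the address */
      return NO_REGS;        /* no intermediate register class needed */
    }

  return NO_REGS;
}

The paired expander then performs the move itself, reloading the
address into the scratch register that reload passes as operands[2]
(shown here for the load direction; a store swaps the SET operands):

  emit_move_insn (operands[2], XEXP (operands[1], 0));
  operands[1] = replace_equiv_address_nv (operands[1], operands[2]);
  emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1]));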

Using the attached patch, which reloads the memory address instead of
going through an XMM register, the code for the testcase improves from:

test:
.LFB0:
	.cfi_startproc
	pushq	%rbx
	.cfi_def_cfa_offset 16
	.cfi_offset 3, -16
	sall	$4, %esi
	addl	%edi, %esi
	movdqa	(%esi), %xmm0
	movdqa	%xmm0, -16(%rsp)
	movq	-16(%rsp), %rcx
	movq	-8(%rsp), %rbx
	addq	$1, %rcx
	adcq	$0, %rbx
	movq	%rcx, -16(%rsp)
	sall	$4, %edx
	movq	%rbx, -8(%rsp)
	movdqa	-16(%rsp), %xmm0
	movdqa	%xmm0, (%esi)
	pxor	%xmm0, %xmm0
	movdqa	%xmm0, (%edx,%esi)
	popq	%rbx
	.cfi_def_cfa_offset 8
	ret

to:

test:
.LFB0:
	.cfi_startproc
	sall	$4, %esi
	pushq	%rbx
	.cfi_def_cfa_offset 16
	.cfi_offset 3, -16
	addl	%edi, %esi
	pxor	%xmm0, %xmm0
	mov	%esi, %eax
	movq	(%rax), %rcx
	movq	8(%rax), %rbx
	addq	$1, %rcx
	adcq	$0, %rbx
	sall	$4, %edx
	movq	%rcx, (%rax)
	movq	%rbx, 8(%rax)
	movdqa	%xmm0, (%edx,%esi)
	popq	%rbx
	.cfi_def_cfa_offset 8
	ret
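
The testcase itself is not quoted in this thread; going by the
assembly above, a hypothetical reconstruction of the kind of source
involved (built with -mx32 -O2; the names and types are guesses, and
the real PR target/49781 testcase may differ) would be:

typedef __int128 T;

void
test (T *p, int i, int j)
{
  p[i] += 1;      /* TImode load/add/store through general registers */
  p[i + j] = 0;   /* 16-byte zero store, emitted through an XMM register */
}

With 32-bit pointers under x32, the address arithmetic happens in
32-bit registers, so the 128-bit accesses go through exactly the kind
of zero-extended, non-offsettable address discussed above.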

H.J., can you please test the attached patch? This optimization won't
trigger on x86_64 anymore.

Thanks,
Uros.

Comments

H.J. Lu Aug. 8, 2011, 5:14 p.m. UTC | #1
On Mon, Aug 8, 2011 at 10:11 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Mon, Aug 8, 2011 at 5:30 PM, Ulrich Weigand <uweigand@de.ibm.com> wrote:
>> Uros Bizjak wrote:
>>
>>> Although, it would be nice for reload to subsequently fix CSE'd
>>> non-offsetable memory by copying address to temporary reg (*as said in
>>> the documentation*), we could simply require an XMM temporary for
>>> TImode reloads to/from integer registers, and this fixes ICE for x32.
>>
>> Moves are special as far as reload is concerned.  If there is already
>> a move instruction present *before* reload, it will get fixed up
>> according to its constraints as any other instruction.
>>
>> However, reload will *introduce* new moves as part of its operation,
>> and those will *not* themselves get reloaded.  Instead, reload simply
>> assumes that every plain move will just succeed without requiring
>> any reload; if this is not true, the target *must* provide a
>> secondary reload for this move.
>>
>> (Note that the secondary reload could also work by reloading the
>> target address into a temporary; that's up to the target to
>> implement.)
>
> Whoa, indeed.
>
> Using attached patch that reloads memory address instead of going
> through XMM register, the code for the testcase improves from:
>
> test:
> .LFB0:
>        .cfi_startproc
>        pushq   %rbx
>        .cfi_def_cfa_offset 16
>        .cfi_offset 3, -16
>        sall    $4, %esi
>        addl    %edi, %esi
>        movdqa  (%esi), %xmm0
>        movdqa  %xmm0, -16(%rsp)
>        movq    -16(%rsp), %rcx
>        movq    -8(%rsp), %rbx
>        addq    $1, %rcx
>        adcq    $0, %rbx
>        movq    %rcx, -16(%rsp)
>        sall    $4, %edx
>        movq    %rbx, -8(%rsp)
>        movdqa  -16(%rsp), %xmm0
>        movdqa  %xmm0, (%esi)
>        pxor    %xmm0, %xmm0
>        movdqa  %xmm0, (%edx,%esi)
>        popq    %rbx
>        .cfi_def_cfa_offset 8
>        ret
>
> to:
>
> test:
> .LFB0:
>        .cfi_startproc
>        sall    $4, %esi
>        pushq   %rbx
>        .cfi_def_cfa_offset 16
>        .cfi_offset 3, -16
>        addl    %edi, %esi
>        pxor    %xmm0, %xmm0
>        mov     %esi, %eax
>        movq    (%rax), %rcx
>        movq    8(%rax), %rbx
>        addq    $1, %rcx
>        adcq    $0, %rbx
>        sall    $4, %edx
>        movq    %rcx, (%rax)
>        movq    %rbx, 8(%rax)
>        movdqa  %xmm0, (%edx,%esi)
>        popq    %rbx
>        .cfi_def_cfa_offset 8
>        ret
>
> H.J., can you please test attached patch? This optimization won't
> trigger on x86_64 anymore.
>

I will test it.

Thanks.
Uros Bizjak Aug. 9, 2011, 7:40 a.m. UTC | #2
On Mon, Aug 8, 2011 at 7:11 PM, Uros Bizjak <ubizjak@gmail.com> wrote:

>> Moves are special as far as reload is concerned.  If there is already
>> a move instruction present *before* reload, it will get fixed up
>> according to its constraints as any other instruction.
>>
>> However, reload will *introduce* new moves as part of its operation,
>> and those will *not* themselves get reloaded.  Instead, reload simply
>> assumes that every plain move will just succeed without requiring
>> any reload; if this is not true, the target *must* provide a
>> secondary reload for this move.
>>
>> (Note that the secondary reload could also work by reloading the
>> target address into a temporary; that's up to the target to
>> implement.)
>
> Whoa, indeed.
>
> Using attached patch that reloads memory address instead of going
> through XMM register, the code for the testcase improves from:

Committed to mainline with the following ChangeLog entry:

2011-08-09  Uros Bizjak  <ubizjak@gmail.com>

	PR target/49781
	* config/i386/i386.md (reload_noff_load): New.
	(reload_noff_store): Ditto.
	* config/i386/i386.c (ix86_secondary_reload): Use
	CODE_FOR_reload_noff_load and CODE_FOR_reload_noff_store to handle
	double-word moves from/to non-offsetable addresses instead of
	generating XMM temporary.

Re-bootstrapped and re-tested on x86_64-pc-linux-gnu {,-m32}.

Uros.
H.J. Lu Aug. 9, 2011, 3:40 p.m. UTC | #3
On Tue, Aug 9, 2011 at 12:40 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Mon, Aug 8, 2011 at 7:11 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>
>>> Moves are special as far as reload is concerned.  If there is already
>>> a move instruction present *before* reload, it will get fixed up
>>> according to its constraints as any other instruction.
>>>
>>> However, reload will *introduce* new moves as part of its operation,
>>> and those will *not* themselves get reloaded.  Instead, reload simply
>>> assumes that every plain move will just succeed without requiring
>>> any reload; if this is not true, the target *must* provide a
>>> secondary reload for this move.
>>>
>>> (Note that the secondary reload could also work by reloading the
>>> target address into a temporary; that's up to the target to
>>> implement.)
>>
>> Whoa, indeed.
>>
>> Using attached patch that reloads memory address instead of going
>> through XMM register, the code for the testcase improves from:
>
> Committed to mainline with following ChangeLog entry:
>
> 2011-08-09  Uros Bizjak  <ubizjak@gmail.com>
>
>        PR target/49781
>        * config/i386/i386.md (reload_noff_load): New.
>        (reload_noff_store): Ditto.
>        * config/i386/i386.c (ix86_secondary_reload): Use
>        CODE_FOR_reload_noff_load and CODE_FOR_reload_noff_store to handle
>        double-word moves from/to non-offsetable addresses instead of
>        generating XMM temporary.
>
> Re-bootstrapped and re-tested on x86_64-pc-linux-gnu {,-m32}.
>

No regressions on x32 with GCC, glibc and SPEC CPU 2K/2006.

Thanks.

Patch

Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 177565)
+++ config/i386/i386.md	(working copy)
@@ -2073,6 +2073,40 @@ 
        (const_string "orig")))
    (set_attr "mode" "SI,DI,DI,DI,SI,DI,DI,DI,DI,DI,TI,DI,TI,DI,DI,DI,DI,DI")])
 
+;; Reload patterns to support multi-word load/store
+;; with non-offsetable address.
+(define_expand "reload_noff_store"
+  [(parallel [(match_operand 0 "memory_operand" "=m")
+              (match_operand 1 "register_operand" "r")
+              (match_operand:DI 2 "register_operand" "=&r")])]
+  "TARGET_64BIT"
+{
+  rtx mem = operands[0];
+  rtx addr = XEXP (mem, 0);
+
+  emit_move_insn (operands[2], addr);
+  mem = replace_equiv_address_nv (mem, operands[2]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, mem, operands[1]));
+  DONE;
+})
+
+(define_expand "reload_noff_load"
+  [(parallel [(match_operand 0 "register_operand" "=r")
+              (match_operand 1 "memory_operand" "m")
+              (match_operand:DI 2 "register_operand" "=r")])]
+  "TARGET_64BIT"
+{
+  rtx mem = operands[1];
+  rtx addr = XEXP (mem, 0);
+
+  emit_move_insn (operands[2], addr);
+  mem = replace_equiv_address_nv (mem, operands[2]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], mem));
+  DONE;
+})
+
 ;; Convert impossible stores of immediate to existing instructions.
 ;; First try to get scratch register and go through it.  In case this
 ;; fails, move by 32bit parts.
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 177566)
+++ config/i386/i386.c	(working copy)
@@ -28245,18 +28245,25 @@ 
 
 static reg_class_t
 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
-		       enum machine_mode mode,
-		       secondary_reload_info *sri ATTRIBUTE_UNUSED)
+		       enum machine_mode mode, secondary_reload_info *sri)
 {
   /* Double-word spills from general registers to non-offsettable memory
-     references (zero-extended addresses) go through XMM register.  */
+     references (zero-extended addresses) require special handling.  */
   if (TARGET_64BIT
       && MEM_P (x)
       && GET_MODE_SIZE (mode) > UNITS_PER_WORD
       && rclass == GENERAL_REGS
       && !offsettable_memref_p (x))
-    return SSE_REGS;
+    {
+      sri->icode = (in_p
+		    ? CODE_FOR_reload_noff_load
+		    : CODE_FOR_reload_noff_store);
+      /* Add the cost of move to a temporary.  */
+      sri->extra_cost = 1;
 
+      return NO_REGS;
+    }
+
   /* QImode spills from non-QI registers require
      intermediate register on 32bit targets.  */
   if (!TARGET_64BIT