diff mbox

New optimization for reload_combine

Message ID 4C41BD52.5040905@codesourcery.com
State New
Headers show

Commit Message

Bernd Schmidt July 17, 2010, 2:25 p.m. UTC
On 07/17/2010 04:38 AM, H.J. Lu wrote:
> This caused:
> 
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44970

Apparently, the sse_prologue_save_insn is broken.
Index: i386.md
===================================================================
--- i386.md	(revision 162146)
+++ i386.md	(working copy)
@@ -17944,7 +17944,7 @@ (define_split
 
 (define_insn "sse_prologue_save_insn"
   [(set (mem:BLK (plus:DI (match_operand:DI 0 "register_operand" "R")
-			  (match_operand:DI 4 "const_int_operand" "n")))
+			  (const_int -127)))
 	(unspec:BLK [(reg:DI XMM0_REG)
 		     (reg:DI XMM1_REG)
 		     (reg:DI XMM2_REG)
@@ -17956,14 +17956,12 @@ (define_insn "sse_prologue_save_insn"
    (use (match_operand:DI 1 "register_operand" "r"))
    (use (match_operand:DI 2 "const_int_operand" "i"))
    (use (label_ref:DI (match_operand 3 "" "X")))
-   (use (match_operand:DI 5 "const_int_operand" "i"))]
-  "TARGET_64BIT
-   && INTVAL (operands[4]) + X86_64_SSE_REGPARM_MAX * 16 - 16 < 128
-   && INTVAL (operands[4]) + INTVAL (operands[2]) * 16 >= -128"
+   (use (match_operand:DI 4 "const_int_operand" "i"))]
+  "TARGET_64BIT"
 {
   int i;
   operands[0] = gen_rtx_MEM (Pmode,
-			     gen_rtx_PLUS (Pmode, operands[0], operands[4]));
+			     gen_rtx_PLUS (Pmode, operands[0], GEN_INT (-127)));
   /* VEX instruction with a REX prefix will #UD.  */
   if (TARGET_AVX && GET_CODE (XEXP (operands[0], 0)) != PLUS)
     gcc_unreachable ();
@@ -17971,15 +17969,16 @@ (define_insn "sse_prologue_save_insn"
   output_asm_insn ("jmp\t%A1", operands);
   for (i = X86_64_SSE_REGPARM_MAX - 1; i >= INTVAL (operands[2]); i--)
     {
-      operands[4] = adjust_address (operands[0], DImode, i*16);
-      operands[5] = gen_rtx_REG (TImode, SSE_REGNO (i));
-      PUT_MODE (operands[4], TImode);
+      rtx xops[2];
+      xops[0] = adjust_address (operands[0], DImode, i*16);
+      xops[1] = gen_rtx_REG (TImode, SSE_REGNO (i));
+      PUT_MODE (operands[0], TImode);
       if (GET_CODE (XEXP (operands[0], 0)) != PLUS)
         output_asm_insn ("rex", operands);
       if (crtl->stack_alignment_needed < 128)
-        output_asm_insn ("%vmovsd\t{%5, %4|%4, %5}", operands);
+        output_asm_insn ("%vmovsd\t{%1, %0|%0, %1}", xops);
       else
-        output_asm_insn ("%vmovaps\t{%5, %4|%4, %5}", operands);
+        output_asm_insn ("%vmovaps\t{%1, %0|%0, %1}", xops);
     }
   targetm.asm_out.internal_label (asm_out_file, "L",
 				  CODE_LABEL_NUMBER (operands[3]));
@@ -17991,7 +17990,7 @@ (define_insn "sse_prologue_save_insn"
    ;; 2 bytes for jump and opernds[4] bytes for each save.
    (set (attr "length")
      (plus (const_int 2)
-	   (mult (symbol_ref ("INTVAL (operands[5])"))
+	   (mult (symbol_ref ("INTVAL (operands[4])"))
 		 (symbol_ref ("X86_64_SSE_REGPARM_MAX - INTVAL (operands[2])")))))
    (set_attr "memory" "store")
    (set_attr "modrm" "0")

Comments

H.J. Lu July 17, 2010, 3:03 p.m. UTC | #1
On Sat, Jul 17, 2010 at 7:25 AM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 07/17/2010 04:38 AM, H.J. Lu wrote:
>> This caused:
>>
>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44970
>
> Apparently, the sse_prologue_save_insn is broken.
>

It is more than that. It failed to boostrap on Linux/ia32 when
configured with

--enable-clocale=gnu --with-system-zlib --enable-shared
--with-demangler-in-ld  --with-fpmath=sse
Iain Sandoe July 17, 2010, 3:10 p.m. UTC | #2
On 17 Jul 2010, at 16:03, H.J. Lu wrote:

> On Sat, Jul 17, 2010 at 7:25 AM, Bernd Schmidt <bernds@codesourcery.com 
> > wrote:
>> On 07/17/2010 04:38 AM, H.J. Lu wrote:
>>> This caused:
>>>
>>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44970
>>
>> Apparently, the sse_prologue_save_insn is broken.
>>
>
> It is more than that. It failed to boostrap on Linux/ia32 when
> configured with
>
> --enable-clocale=gnu --with-system-zlib --enable-shared
> --with-demangler-in-ld  --with-fpmath=sse

I think it breaks powerpc bootstrap too.
Iain
Bernd Schmidt July 17, 2010, 3:59 p.m. UTC | #3
On 07/17/2010 05:03 PM, H.J. Lu wrote:
> It is more than that. It failed to boostrap on Linux/ia32 when
> configured with
> 
> --enable-clocale=gnu --with-system-zlib --enable-shared
> --with-demangler-in-ld  --with-fpmath=sse

I can't seem to reproduce this.  Is that the full command line?


Bernd
H.J. Lu July 17, 2010, 4:14 p.m. UTC | #4
On Sat, Jul 17, 2010 at 8:59 AM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 07/17/2010 05:03 PM, H.J. Lu wrote:
>> It is more than that. It failed to boostrap on Linux/ia32 when
>> configured with
>>
>> --enable-clocale=gnu --with-system-zlib --enable-shared
>> --with-demangler-in-ld  --with-fpmath=sse
>
> I can't seem to reproduce this.  Is that the full command line?
>
>

I used:

../src-trunk/configure \
		 --enable-clocale=gnu --with-system-zlib --enable-shared --with-
demangler-in-ld -with-plugin-ld=ld.gold --enable-gold --with-fpmath=sse

on Fedora 12/ia32.
Bernd Schmidt July 17, 2010, 5:17 p.m. UTC | #5
On 07/17/2010 06:14 PM, H.J. Lu wrote:
> On Sat, Jul 17, 2010 at 8:59 AM, Bernd Schmidt <bernds@codesourcery.com> wrote:
>> On 07/17/2010 05:03 PM, H.J. Lu wrote:
>>> It is more than that. It failed to boostrap on Linux/ia32 when
>>> configured with
>>>
>>> --enable-clocale=gnu --with-system-zlib --enable-shared
>>> --with-demangler-in-ld  --with-fpmath=sse
>>
>> I can't seem to reproduce this.  Is that the full command line?
>>
>>
> 
> I used:
> 
> ../src-trunk/configure \
> 		 --enable-clocale=gnu --with-system-zlib --enable-shared --with-
> demangler-in-ld -with-plugin-ld=ld.gold --enable-gold --with-fpmath=sse
> 
> on Fedora 12/ia32.

I'm on Gentoo, without gold - not sure whether that made a difference,
but I'm not seeing these failures.  I don't have access to SPEC2k6
either.  Can you isolate any testcases?


Bernd
H.J. Lu July 17, 2010, 5:27 p.m. UTC | #6
On Sat, Jul 17, 2010 at 10:17 AM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 07/17/2010 06:14 PM, H.J. Lu wrote:
>> On Sat, Jul 17, 2010 at 8:59 AM, Bernd Schmidt <bernds@codesourcery.com> wrote:
>>> On 07/17/2010 05:03 PM, H.J. Lu wrote:
>>>> It is more than that. It failed to boostrap on Linux/ia32 when
>>>> configured with
>>>>
>>>> --enable-clocale=gnu --with-system-zlib --enable-shared
>>>> --with-demangler-in-ld  --with-fpmath=sse
>>>
>>> I can't seem to reproduce this.  Is that the full command line?
>>>
>>>
>>
>> I used:
>>
>> ../src-trunk/configure \
>>                --enable-clocale=gnu --with-system-zlib --enable-shared --with-
>> demangler-in-ld -with-plugin-ld=ld.gold --enable-gold --with-fpmath=sse
>>
>> on Fedora 12/ia32.
>
> I'm on Gentoo, without gold - not sure whether that made a difference,

Is that possible for you to install Fedora 12/13? 64bit Fedora is fine.
You can bootstrap 32bit gcc on 64bit Fedora.

> but I'm not seeing these failures.  I don't have access to SPEC2k6
> either.  Can you isolate any testcases?
>

SPEC CPU 2006 failure is the run-time comparison failure.
It won't be easy to find a small testcase. Gcc bootstrap comparison
failure and testsuite regressions are much easier to debug.

Thanks.
H.J. Lu July 18, 2010, 6:15 p.m. UTC | #7
On Sat, Jul 17, 2010 at 8:59 AM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 07/17/2010 05:03 PM, H.J. Lu wrote:
>> It is more than that. It failed to boostrap on Linux/ia32 when
>> configured with
>>
>> --enable-clocale=gnu --with-system-zlib --enable-shared
>> --with-demangler-in-ld  --with-fpmath=sse
>
> I can't seem to reproduce this.  Is that the full command line?
>

Many people have reported bootstrap failure on Linux/ia32. May
I suggest you reproduce it on a different Linux/ia32 OS? I know
it can be reproduced on Fedora 12/13.

Thanks.
Richard Henderson July 19, 2010, 3:41 p.m. UTC | #8
On 07/17/2010 07:25 AM, Bernd Schmidt wrote:
>  	leaq	0(,%rax,4), %rcx
>  	movl	$.L2, %eax
>  	subq	%rcx, %rax
>  	jmp	*%rax

I've often thought this was over-engineering in the x86_64 abi.
This jump table is trading memory bandwidth for unpredictability
in the branch target.

I've often wondered if we'd get better performance if we changed
to a simple comparison against zero.  I.e.

	test	%al,%al
	jz	1f
	// 8 xmm stores
1:

H.J., do you think you'd be able to measure performance on this?



r~
H.J. Lu July 19, 2010, 3:56 p.m. UTC | #9
On Mon, Jul 19, 2010 at 8:41 AM, Richard Henderson <rth@redhat.com> wrote:
> On 07/17/2010 07:25 AM, Bernd Schmidt wrote:
>>       leaq    0(,%rax,4), %rcx
>>       movl    $.L2, %eax
>>       subq    %rcx, %rax
>>       jmp     *%rax
>
> I've often thought this was over-engineering in the x86_64 abi.
> This jump table is trading memory bandwidth for unpredictability
> in the branch target.
>
> I've often wondered if we'd get better performance if we changed
> to a simple comparison against zero.  I.e.
>
>        test    %al,%al
>        jz      1f
>        // 8 xmm stores
> 1:
>
> H.J., do you think you'd be able to measure performance on this?
>

Sure.
Andi Kleen July 19, 2010, 4:09 p.m. UTC | #10
Richard Henderson <rth@redhat.com> writes:

> On 07/17/2010 07:25 AM, Bernd Schmidt wrote:
>>  	leaq	0(,%rax,4), %rcx
>>  	movl	$.L2, %eax
>>  	subq	%rcx, %rax
>>  	jmp	*%rax
>
> I've often thought this was over-engineering in the x86_64 abi.
> This jump table is trading memory bandwidth for unpredictability
> in the branch target.

The other problem with the jump is that if you misdeclare
the prototype and nothing correct is passed in %eax
then you get a random jump somewhere which tends to be hard
to debug. This has happened in the past when porting
existing code.

Just alone getting rid of that would be worth changing it.
varargs shouldn't be that time critical anyways

-Andi
H.J. Lu July 19, 2010, 4:15 p.m. UTC | #11
On Mon, Jul 19, 2010 at 9:09 AM, Andi Kleen <andi@firstfloor.org> wrote:
> Richard Henderson <rth@redhat.com> writes:
>
>> On 07/17/2010 07:25 AM, Bernd Schmidt wrote:
>>>      leaq    0(,%rax,4), %rcx
>>>      movl    $.L2, %eax
>>>      subq    %rcx, %rax
>>>      jmp     *%rax
>>
>> I've often thought this was over-engineering in the x86_64 abi.
>> This jump table is trading memory bandwidth for unpredictability
>> in the branch target.
>
> The other problem with the jump is that if you misdeclare
> the prototype and nothing correct is passed in %eax

FWIW, we DON'T support varargs without correct prototype
on x86-64. See AVX support in x86-64 psABI for details.
Jeff Law July 19, 2010, 4:19 p.m. UTC | #12
On 07/17/10 08:25, Bernd Schmidt wrote:
> On 07/17/2010 04:38 AM, H.J. Lu wrote:
>    
>> This caused:
>>
>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44970
>>      
> Apparently, the sse_prologue_save_insn is broken.
>
> diff -dru old/nest-stdar-1.s new/nest-stdar-1.s
> --- old/nest-stdar-1.s	2010-07-17 14:10:40.308605357 +0000
> +++ new/nest-stdar-1.s	2010-07-17 14:00:30.592312121 +0000
> @@ -9,25 +9,24 @@
>   	subq	$48, %rsp
>   	.cfi_def_cfa_offset 56
>   	leaq	0(,%rax,4), %rcx
> -	leaq	39(%rsp), %rdx
>   	movl	$.L2, %eax
>   	subq	%rcx, %rax
>   	jmp	*%rax
> -	movaps	%xmm7, -15(%rdx)
> -	movaps	%xmm6, -31(%rdx)
> -	movaps	%xmm5, -47(%rdx)
> -	movaps	%xmm4, -63(%rdx)
> -	movaps	%xmm3, -79(%rdx)
> -	movaps	%xmm2, -95(%rdx)
> -	movaps	%xmm1, -111(%rdx)
> -	movaps	%xmm0, -127(%rdx)
> +	movaps	%xmm7, 24(%rsp)
> +	movaps	%xmm6, 8(%rsp)
> +	movaps	%xmm5, -8(%rsp)
> +	movaps	%xmm4, -24(%rsp)
> +	movaps	%xmm3, -40(%rsp)
> +	movaps	%xmm2, -56(%rsp)
> +	movaps	%xmm1, -72(%rsp)
> +	movaps	%xmm0, -88(%rsp)
>
> It's implementing a crazy jump table, which requires that all insns have
> the same length, which in turn requires that no one modifies the address
> in the pattern.
>    
Unreal.  Anytime I see such fragile code I want to cry -- then I find 
out it's for varargs and I want to scream.

> I can fix this testcase with the patch below, but I'll leave it for the
> x86 maintainers to choose this fix or another.
>    
Yea, let's let the x86 maintainers fix this -- presumably they'll find 
something that doesn't rely upon exact instruction lengths.

jeff
Jan Hubicka July 19, 2010, 4:23 p.m. UTC | #13
> On 07/17/2010 07:25 AM, Bernd Schmidt wrote:
> >  	leaq	0(,%rax,4), %rcx
> >  	movl	$.L2, %eax
> >  	subq	%rcx, %rax
> >  	jmp	*%rax
> 
> I've often thought this was over-engineering in the x86_64 abi.
> This jump table is trading memory bandwidth for unpredictability
> in the branch target.
> 
> I've often wondered if we'd get better performance if we changed
> to a simple comparison against zero.  I.e.
> 
> 	test	%al,%al
> 	jz	1f
> 	// 8 xmm stores
> 1:
> 
> H.J., do you think you'd be able to measure performance on this?

THe orginal problem was the fact that early K8 chips had no way of effectively
storing SSE register to memory whithout knowing its type.  So the stores in
prologue executed very slow when reformating happent.  Same reason was
for not having callee saved/restored SSE regs.

On current chips this is not big issue, so I do not care what way we output.
In fact I used to have patch for doing the jz but lost it.  I think we might
keep supporting both to get some checking that ABI is not terribly broken
(i.e. that no other copmilers just feeds rax with random value, but always
by number of args).

Honza
> 
> 
> 
> r~
Andi Kleen July 19, 2010, 4:25 p.m. UTC | #14
On Mon, Jul 19, 2010 at 09:15:47AM -0700, H.J. Lu wrote:
> On Mon, Jul 19, 2010 at 9:09 AM, Andi Kleen <andi@firstfloor.org> wrote:
> > Richard Henderson <rth@redhat.com> writes:
> >
> >> On 07/17/2010 07:25 AM, Bernd Schmidt wrote:
> >>>      leaq    0(,%rax,4), %rcx
> >>>      movl    $.L2, %eax
> >>>      subq    %rcx, %rax
> >>>      jmp     *%rax
> >>
> >> I've often thought this was over-engineering in the x86_64 abi.
> >> This jump table is trading memory bandwidth for unpredictability
> >> in the branch target.
> >
> > The other problem with the jump is that if you misdeclare
> > the prototype and nothing correct is passed in %eax
> 
> FWIW, we DON'T support varargs without correct prototype
> on x86-64. See AVX support in x86-64 psABI for details.

Yes, it's the same on non AVX. But still if it goes wrong
(and in practice it sometimes does) it's better if it's a smooth landing 
than if it jumps randomly over the address space. 

-Andi
H.J. Lu July 19, 2010, 8:57 p.m. UTC | #15
On Mon, Jul 19, 2010 at 8:56 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Jul 19, 2010 at 8:41 AM, Richard Henderson <rth@redhat.com> wrote:
>> On 07/17/2010 07:25 AM, Bernd Schmidt wrote:
>>>       leaq    0(,%rax,4), %rcx
>>>       movl    $.L2, %eax
>>>       subq    %rcx, %rax
>>>       jmp     *%rax
>>
>> I've often thought this was over-engineering in the x86_64 abi.
>> This jump table is trading memory bandwidth for unpredictability
>> in the branch target.
>>
>> I've often wondered if we'd get better performance if we changed
>> to a simple comparison against zero.  I.e.
>>
>>        test    %al,%al
>>        jz      1f
>>        // 8 xmm stores
>> 1:
>>
>> H.J., do you think you'd be able to measure performance on this?
>>
>
> Sure.

Just to be clear.  I can test performance impact if there is a patch.
But I don't have time to create a patch in the near future.
diff mbox

Patch

diff -dru old/nest-stdar-1.s new/nest-stdar-1.s
--- old/nest-stdar-1.s	2010-07-17 14:10:40.308605357 +0000
+++ new/nest-stdar-1.s	2010-07-17 14:00:30.592312121 +0000
@@ -9,25 +9,24 @@ 
 	subq	$48, %rsp
 	.cfi_def_cfa_offset 56
 	leaq	0(,%rax,4), %rcx
-	leaq	39(%rsp), %rdx
 	movl	$.L2, %eax
 	subq	%rcx, %rax
 	jmp	*%rax
-	movaps	%xmm7, -15(%rdx)
-	movaps	%xmm6, -31(%rdx)
-	movaps	%xmm5, -47(%rdx)
-	movaps	%xmm4, -63(%rdx)
-	movaps	%xmm3, -79(%rdx)
-	movaps	%xmm2, -95(%rdx)
-	movaps	%xmm1, -111(%rdx)
-	movaps	%xmm0, -127(%rdx)
+	movaps	%xmm7, 24(%rsp)
+	movaps	%xmm6, 8(%rsp)
+	movaps	%xmm5, -8(%rsp)
+	movaps	%xmm4, -24(%rsp)
+	movaps	%xmm3, -40(%rsp)
+	movaps	%xmm2, -56(%rsp)
+	movaps	%xmm1, -72(%rsp)
+	movaps	%xmm0, -88(%rsp)

It's implementing a crazy jump table, which requires that all insns have
the same length, which in turn requires that no one modifies the address
in the pattern.

I can fix this testcase with the patch below, but I'll leave it for the
x86 maintainers to choose this fix or another.


Bernd