Patchwork [PR58066] preferred_stack_boundary update for tls expanded call

login
register
mail settings
Submitter H.J. Lu
Date March 13, 2014, 12:33 a.m.
Message ID <CAMe9rOqto_vXZUN4Mx8ogGw5KH9=XmvkQfmc+x8RFZe=dL+5Cw@mail.gmail.com>
Download mbox | patch
Permalink /patch/329755/
State New
Headers show

Comments

H.J. Lu - March 13, 2014, 12:33 a.m.
On Wed, Mar 12, 2014 at 5:28 PM, Wei Mi <wmi@google.com> wrote:
>>> Does my patch fix the original problem?
>>
>> Yes, it works. I am doing bootstrap and regression test for your patch. Thanks!
>>
>
> The patch passes bootstrap and regression test on x86_64-linux-gnu.
>

My patch fails to handle ia32.  Here is the updated one.
Wei Mi - March 13, 2014, 5:52 a.m.
I saw the problem last patch had on ia32. Without explicit call in rtl
template, scheduler may schedule the sp adjusting insn across tls
descriptor and break the alignment assumption.
I am testing the updated patch on x86_64.

Can we combine the last two patches, both adding call explicitly in
rtl template for tls_local_dynamic_base_32/tls_global_dynamic_32, and
set ix86_tls_descriptor_calls_expanded_in_cfun to true only after
reload complete?

Regards,
Wei.

On Wed, Mar 12, 2014 at 5:33 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Wed, Mar 12, 2014 at 5:28 PM, Wei Mi <wmi@google.com> wrote:
>>>> Does my patch fix the original problem?
>>>
>>> Yes, it works. I am doing bootstrap and regression test for your patch. Thanks!
>>>
>>
>> The patch passes bootstrap and regression test on x86_64-linux-gnu.
>>
>
> My patch fails to handle ia32.  Here is the updated one.
>
> --
> H.J.
H.J. Lu - March 13, 2014, 3:07 p.m.
On Wed, Mar 12, 2014 at 10:52 PM, Wei Mi <wmi@google.com> wrote:
> I saw the problem last patch had on ia32. Without explicit call in rtl
> template, scheduler may schedule the sp adjusting insn across tls
> descriptor and break the alignment assumption.
> I am testing the updated patch on x86_64.
>
> Can we combine the last two patches, both adding call explicitly in
> rtl template for tls_local_dynamic_base_32/tls_global_dynamic_32, and
> set ix86_tls_descriptor_calls_expanded_in_cfun to true only after
> reload complete?
>

My ia32 change generates much worse code:

[hjl@gnu-6 gcc]$ cat /tmp/c.i
static __thread char ccc, bbb;

int __cxa_get_globals()
{
 return &ccc - &bbb;
}
[hjl@gnu-6 gcc]$ ./xgcc -B./ -S -O2 -fPIC /tmp/c.i
[hjl@gnu-6 gcc]$ cat c.s
.file "c.i"
.section .text.unlikely,"ax",@progbits
.LCOLDB0:
.text
.LHOTB0:
.p2align 4,,15
.globl __cxa_get_globals
.type __cxa_get_globals, @function
__cxa_get_globals:
.LFB0:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
leaq ccc@tlsld(%rip), %rdi
call __tls_get_addr@PLT
addq $8, %rsp
.cfi_def_cfa_offset 8
leaq ccc@dtpoff(%rax), %rcx
leaq bbb@dtpoff(%rax), %rdx
movq %rcx, %rax
subq %rdx, %rax
ret
.cfi_endproc
.LFE0:
.size __cxa_get_globals, .-__cxa_get_globals
.section .text.unlikely
.LCOLDE0:
.text
.LHOTE0:
.section .tbss,"awT",@nobits
.type bbb, @object
.size bbb, 1
bbb:
.zero 1
.type ccc, @object
.size ccc, 1
ccc:
.zero 1
.ident "GCC: (GNU) 4.9.0 20140312 (experimental)"
.section .note.GNU-stack,"",@progbits
[hjl@gnu-6 gcc]$ cat /tmp/c.i
static __thread char ccc, bbb;

int __cxa_get_globals()
{
 return &ccc - &bbb;
}
[hjl@gnu-6 gcc]$ ./xgcc -B./ -S -O2 -fPIC /tmp/c.i -m32
[hjl@gnu-6 gcc]$ cat c.s
.file "c.i"
.section .text.unlikely,"ax",@progbits
.LCOLDB0:
.text
.LHOTB0:
.p2align 4,,15
.globl __cxa_get_globals
.type __cxa_get_globals, @function
__cxa_get_globals:
.LFB0:
.cfi_startproc
pushl %esi
.cfi_def_cfa_offset 8
.cfi_offset 6, -8
pushl %ebx
.cfi_def_cfa_offset 12
.cfi_offset 3, -12
call __x86.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
subl $4, %esp
.cfi_def_cfa_offset 16
leal ccc@tlsldm(%ebx), %eax
call ___tls_get_addr@PLT
leal ccc@dtpoff(%eax), %esi
leal ccc@tlsldm(%ebx), %eax
call ___tls_get_addr@PLT
addl $4, %esp
.cfi_def_cfa_offset 12
leal bbb@dtpoff(%eax), %eax
popl %ebx
.cfi_restore 3
.cfi_def_cfa_offset 8
subl %eax, %esi
movl %esi, %eax
popl %esi
.cfi_restore 6
.cfi_def_cfa_offset 4
ret
.cfi_endproc

Maybe we should keep the original patterns and
split them to add CALL.
Wei Mi - March 13, 2014, 3:35 p.m.
>
> My ia32 change generates much worse code:
>
> [hjl@gnu-6 gcc]$ cat /tmp/c.i
> static __thread char ccc, bbb;
>
> int __cxa_get_globals()
> {
>  return &ccc - &bbb;
> }
> [hjl@gnu-6 gcc]$ ./xgcc -B./ -S -O2 -fPIC /tmp/c.i
> [hjl@gnu-6 gcc]$ cat c.s
> .file "c.i"
> .section .text.unlikely,"ax",@progbits
> .LCOLDB0:
> .text
> .LHOTB0:
> .p2align 4,,15
> .globl __cxa_get_globals
> .type __cxa_get_globals, @function
> __cxa_get_globals:
> .LFB0:
> .cfi_startproc
> subq $8, %rsp
> .cfi_def_cfa_offset 16
> leaq ccc@tlsld(%rip), %rdi
> call __tls_get_addr@PLT
> addq $8, %rsp
> .cfi_def_cfa_offset 8
> leaq ccc@dtpoff(%rax), %rcx
> leaq bbb@dtpoff(%rax), %rdx
> movq %rcx, %rax
> subq %rdx, %rax
> ret
> .cfi_endproc
> .LFE0:
> .size __cxa_get_globals, .-__cxa_get_globals
> .section .text.unlikely
> .LCOLDE0:
> .text
> .LHOTE0:
> .section .tbss,"awT",@nobits
> .type bbb, @object
> .size bbb, 1
> bbb:
> .zero 1
> .type ccc, @object
> .size ccc, 1
> ccc:
> .zero 1
> .ident "GCC: (GNU) 4.9.0 20140312 (experimental)"
> .section .note.GNU-stack,"",@progbits
> [hjl@gnu-6 gcc]$ cat /tmp/c.i
> static __thread char ccc, bbb;
>
> int __cxa_get_globals()
> {
>  return &ccc - &bbb;
> }
> [hjl@gnu-6 gcc]$ ./xgcc -B./ -S -O2 -fPIC /tmp/c.i -m32
> [hjl@gnu-6 gcc]$ cat c.s
> .file "c.i"
> .section .text.unlikely,"ax",@progbits
> .LCOLDB0:
> .text
> .LHOTB0:
> .p2align 4,,15
> .globl __cxa_get_globals
> .type __cxa_get_globals, @function
> __cxa_get_globals:
> .LFB0:
> .cfi_startproc
> pushl %esi
> .cfi_def_cfa_offset 8
> .cfi_offset 6, -8
> pushl %ebx
> .cfi_def_cfa_offset 12
> .cfi_offset 3, -12
> call __x86.get_pc_thunk.bx
> addl $_GLOBAL_OFFSET_TABLE_, %ebx
> subl $4, %esp
> .cfi_def_cfa_offset 16
> leal ccc@tlsldm(%ebx), %eax
> call ___tls_get_addr@PLT
> leal ccc@dtpoff(%eax), %esi
> leal ccc@tlsldm(%ebx), %eax
> call ___tls_get_addr@PLT
> addl $4, %esp
> .cfi_def_cfa_offset 12
> leal bbb@dtpoff(%eax), %eax
> popl %ebx
> .cfi_restore 3
> .cfi_def_cfa_offset 8
> subl %eax, %esi
> movl %esi, %eax
> popl %esi
> .cfi_restore 6
> .cfi_def_cfa_offset 4
> ret
> .cfi_endproc
>
> Maybe we should keep the original patterns and
> split them to add CALL.
>
> --
> H.J.

I tried pr58066-3.patch on the above testcase, the code it generated
seems ok. I think after we change the 32bits pattern in i386.md to be
similar as 64bits pattern, we should change 32bit expand to be similar
as 64bit expand in legitimize_tls_address too?

Thanks,
Wei.

~/workarea/gcc-r208410-2/build/install/bin/gcc -m32 -S -fPIC 1.c

        .file   "1.c"
        .section        .tbss,"awT",@nobits
        .type   ccc, @object
        .size   ccc, 1
ccc:
        .zero   1
        .type   bbb, @object
        .size   bbb, 1
bbb:
        .zero   1
        .text
        .globl  __cxa_get_globals
        .type   __cxa_get_globals, @function
__cxa_get_globals:
.LFB0:
        .cfi_startproc
        pushl   %ebp
        .cfi_def_cfa_offset 8
        .cfi_offset 5, -8
        movl    %esp, %ebp
        .cfi_def_cfa_register 5
        pushl   %esi
        pushl   %ebx
        .cfi_offset 6, -12
        .cfi_offset 3, -16
        call    __x86.get_pc_thunk.bx
        addl    $_GLOBAL_OFFSET_TABLE_, %ebx
        leal    ccc@tlsgd(,%ebx,1), %eax
        call    ___tls_get_addr@PLT
        movl    %eax, %esi
        leal    bbb@tlsgd(,%ebx,1), %eax
        call    ___tls_get_addr@PLT
        subl    %eax, %esi
        movl    %esi, %eax
        popl    %ebx
        .cfi_restore 3
        popl    %esi
        .cfi_restore 6
        popl    %ebp
        .cfi_restore 5
        .cfi_def_cfa 4, 4
        ret
        .cfi_endproc
.LFE0:
        .size   __cxa_get_globals, .-__cxa_get_globals
        .section        .text.__x86.get_pc_thunk.bx,"axG",@progbits,__x86.get_pc_thunk.bx,comdat
        .globl  __x86.get_pc_thunk.bx
        .hidden __x86.get_pc_thunk.bx
        .type   __x86.get_pc_thunk.bx, @function
__x86.get_pc_thunk.bx:
.LFB1:
        .cfi_startproc
        movl    (%esp), %ebx
        ret
        .cfi_endproc
.LFE1:
        .ident  "GCC: (GNU) 4.9.0 20140307 (experimental)"
        .section        .note.GNU-stack,"",@progbits
Wei Mi - March 13, 2014, 5:16 p.m.
> I tried pr58066-3.patch on the above testcase, the code it generated
> seems ok. I think after we change the 32bits pattern in i386.md to be
> similar as 64bits pattern, we should change 32bit expand to be similar
> as 64bit expand in legitimize_tls_address too?
>
> Thanks,
> Wei.
>

Sorry, I pasted the wrong code. This is the code generated by pr58066-3.patch.
wmi@miwei:/tmp$ cat 1.c
static __thread char ccc, bbb;

int __cxa_get_globals()
{
 return &ccc - &bbb;
}

wmi@miwei:/tmp$ ~/workarea/gcc-r208410-2/build/install/bin/gcc -O2
-fPIC -m32 -S 1.c
wmi@miwei:/tmp$ cat 1.s
        .file   "1.c"
        .section        .text.unlikely,"ax",@progbits
.LCOLDB0:
        .text
.LHOTB0:
        .p2align 4,,15
        .globl  __cxa_get_globals
        .type   __cxa_get_globals, @function
__cxa_get_globals:
.LFB0:
        .cfi_startproc
        pushl   %ebx
        .cfi_def_cfa_offset 8
        .cfi_offset 3, -8
        call    __x86.get_pc_thunk.bx
        addl    $_GLOBAL_OFFSET_TABLE_, %ebx
        subl    $8, %esp
        .cfi_def_cfa_offset 16
        leal    ccc@tlsldm(%ebx), %eax
        call    ___tls_get_addr@PLT
        addl    $8, %esp
        .cfi_def_cfa_offset 8
        leal    ccc@dtpoff(%eax), %edx
        leal    bbb@dtpoff(%eax), %eax
        popl    %ebx
        .cfi_restore 3
        .cfi_def_cfa_offset 4
        subl    %eax, %edx
        movl    %edx, %eax
        ret
        .cfi_endproc
.LFE0:
        .size   __cxa_get_globals, .-__cxa_get_globals
        .section        .text.unlikely
.LCOLDE0:
        .text
.LHOTE0:
        .section        .tbss,"awT",@nobits
        .type   bbb, @object
        .size   bbb, 1
bbb:
        .zero   1
        .type   ccc, @object
        .size   ccc, 1
ccc:
        .zero   1
        .section        .text.__x86.get_pc_thunk.bx,"axG",@progbits,__x86.get_pc_thunk.bx,comdat
        .globl  __x86.get_pc_thunk.bx
        .hidden __x86.get_pc_thunk.bx
        .type   __x86.get_pc_thunk.bx, @function
__x86.get_pc_thunk.bx:
.LFB1:
        .cfi_startproc
        movl    (%esp), %ebx
        ret
        .cfi_endproc
.LFE1:
        .ident  "GCC: (GNU) 4.9.0 20140307 (experimental)"
        .section        .note.GNU-stack,"",@progbits

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9e33d53..da603b6 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -9490,20 +9490,30 @@  ix86_compute_frame_layout (struct ix86_frame *frame)
   frame->nregs = ix86_nsaved_regs ();
   frame->nsseregs = ix86_nsaved_sseregs ();
 
-  stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
-  preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
-
   /* 64-bit MS ABI seem to require stack alignment to be always 16 except for
      function prologues and leaf.  */
-  if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
+  if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
       && (!crtl->is_leaf || cfun->calls_alloca != 0
           || ix86_current_function_calls_tls_descriptor))
     {
-      preferred_alignment = 16;
-      stack_alignment_needed = 16;
       crtl->preferred_stack_boundary = 128;
       crtl->stack_alignment_needed = 128;
     }
+  /* preferred_stack_boundary is never updated for call
+     expanded from tls descriptor. Update it here. We don't update it in
+     expand stage because according to the comments before
+     ix86_current_function_calls_tls_descriptor, tls calls may be optimized
+     away.  */
+  else if (ix86_current_function_calls_tls_descriptor
+	   && crtl->preferred_stack_boundary < PREFERRED_STACK_BOUNDARY)
+    {
+      crtl->preferred_stack_boundary = PREFERRED_STACK_BOUNDARY;
+      if (crtl->stack_alignment_needed < PREFERRED_STACK_BOUNDARY)
+	crtl->stack_alignment_needed = PREFERRED_STACK_BOUNDARY;
+    }
+
+  stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
+  preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
 
   gcc_assert (!size || stack_alignment_needed);
   gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index ea1d85f..bc8fb1f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12859,13 +12859,14 @@ 
 
 (define_insn "*tls_global_dynamic_32_gnu"
   [(set (match_operand:SI 0 "register_operand" "=a")
-	(unspec:SI
-	 [(match_operand:SI 1 "register_operand" "b")
-	  (match_operand 2 "tls_symbolic_operand")
-	  (match_operand 3 "constant_call_address_operand" "z")]
-	 UNSPEC_TLS_GD))
-   (clobber (match_scratch:SI 4 "=d"))
-   (clobber (match_scratch:SI 5 "=c"))
+	(call:SI
+	 (mem:QI (match_operand 3 "constant_call_address_operand" "z"))
+	 (match_operand 4)))
+   (unspec:SI [(match_operand:SI 1 "register_operand" "b")
+	       (match_operand 2 "tls_symbolic_operand")]
+	      UNSPEC_TLS_GD)
+   (clobber (match_scratch:SI 5 "=d"))
+   (clobber (match_scratch:SI 6 "=c"))
    (clobber (reg:CC FLAGS_REG))]
   "!TARGET_64BIT && TARGET_GNU_TLS"
 {
@@ -12885,13 +12886,19 @@ 
 (define_expand "tls_global_dynamic_32"
   [(parallel
     [(set (match_operand:SI 0 "register_operand")
-	  (unspec:SI [(match_operand:SI 2 "register_operand")
-		      (match_operand 1 "tls_symbolic_operand")
-		      (match_operand 3 "constant_call_address_operand")]
-		     UNSPEC_TLS_GD))
+	  (call:SI
+	   (mem:QI (match_operand 3 "constant_call_address_operand"))
+	   (const_int 0)))
+     (unspec:SI [(match_operand:SI 2 "register_operand")
+		 (match_operand 1 "tls_symbolic_operand")]
+		UNSPEC_TLS_GD)
      (clobber (match_scratch:SI 4))
      (clobber (match_scratch:SI 5))
-     (clobber (reg:CC FLAGS_REG))])])
+     (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
 
 (define_insn "*tls_global_dynamic_64_<mode>"
   [(set (match_operand:P 0 "register_operand" "=a")
@@ -12946,16 +12953,20 @@ 
 	   (const_int 0)))
      (unspec:P [(match_operand 1 "tls_symbolic_operand")]
 	       UNSPEC_TLS_GD)])]
-  "TARGET_64BIT")
+  "TARGET_64BIT"
+{
+  ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
 
 (define_insn "*tls_local_dynamic_base_32_gnu"
   [(set (match_operand:SI 0 "register_operand" "=a")
-	(unspec:SI
-	 [(match_operand:SI 1 "register_operand" "b")
-	  (match_operand 2 "constant_call_address_operand" "z")]
-	 UNSPEC_TLS_LD_BASE))
-   (clobber (match_scratch:SI 3 "=d"))
-   (clobber (match_scratch:SI 4 "=c"))
+	(call:SI
+	 (mem:QI (match_operand 2 "constant_call_address_operand" "z"))
+	 (match_operand 3)))
+   (unspec:SI [(match_operand:SI 1 "register_operand" "b")]
+	      UNSPEC_TLS_LD_BASE)
+   (clobber (match_scratch:SI 4 "=d"))
+   (clobber (match_scratch:SI 5 "=c"))
    (clobber (reg:CC FLAGS_REG))]
   "!TARGET_64BIT && TARGET_GNU_TLS"
 {
@@ -12976,13 +12987,18 @@ 
 (define_expand "tls_local_dynamic_base_32"
   [(parallel
      [(set (match_operand:SI 0 "register_operand")
-	   (unspec:SI
-	    [(match_operand:SI 1 "register_operand")
-	     (match_operand 2 "constant_call_address_operand")]
-	    UNSPEC_TLS_LD_BASE))
+	   (call:SI
+	    (mem:QI (match_operand 2 "constant_call_address_operand"))
+	    (const_int 0)))
+      (unspec:SI [(match_operand:SI 1 "register_operand")]
+		 UNSPEC_TLS_LD_BASE)
       (clobber (match_scratch:SI 3))
       (clobber (match_scratch:SI 4))
-      (clobber (reg:CC FLAGS_REG))])])
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
 
 (define_insn "*tls_local_dynamic_base_64_<mode>"
   [(set (match_operand:P 0 "register_operand" "=a")
@@ -13029,33 +13045,10 @@ 
 	    (mem:QI (match_operand 1))
 	    (const_int 0)))
       (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)])]
-  "TARGET_64BIT")
-
-;; Local dynamic of a single variable is a lose.  Show combine how
-;; to convert that back to global dynamic.
-
-(define_insn_and_split "*tls_local_dynamic_32_once"
-  [(set (match_operand:SI 0 "register_operand" "=a")
-	(plus:SI
-	 (unspec:SI [(match_operand:SI 1 "register_operand" "b")
-		     (match_operand 2 "constant_call_address_operand" "z")]
-		    UNSPEC_TLS_LD_BASE)
-	 (const:SI (unspec:SI
-		    [(match_operand 3 "tls_symbolic_operand")]
-		    UNSPEC_DTPOFF))))
-   (clobber (match_scratch:SI 4 "=d"))
-   (clobber (match_scratch:SI 5 "=c"))
-   (clobber (reg:CC FLAGS_REG))]
-  ""
-  "#"
-  ""
-  [(parallel
-     [(set (match_dup 0)
-	   (unspec:SI [(match_dup 1) (match_dup 3) (match_dup 2)]
-		      UNSPEC_TLS_GD))
-      (clobber (match_dup 4))
-      (clobber (match_dup 5))
-      (clobber (reg:CC FLAGS_REG))])])
+  "TARGET_64BIT"
+{
+  ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
 
 ;; Segment register for the thread base ptr load
 (define_mode_attr tp_seg [(SI "gs") (DI "fs")])