| Message ID | CAMe9rOqto_vXZUN4Mx8ogGw5KH9=XmvkQfmc+x8RFZe=dL+5Cw@mail.gmail.com |
| --- | --- |
| State | New |
| Headers | show |
I saw the problem last patch had on ia32. Without explicit call in rtl template, scheduler may schedule the sp adjusting insn across tls descriptor and break the alignment assumption. I am testing the updated patch on x86_64. Can we combine the last two patches, both adding call explicitly in rtl template for tls_local_dynamic_base_32/tls_global_dynamic_32, and set ix86_tls_descriptor_calls_expanded_in_cfun to true only after reload complete? Regards, Wei. On Wed, Mar 12, 2014 at 5:33 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Wed, Mar 12, 2014 at 5:28 PM, Wei Mi <wmi@google.com> wrote: >>>> Does my patch fix the original problem? >>> >>> Yes, it works. I am doing bootstrap and regression test for your patch. Thanks! >>> >> >> The patch passes bootstrap and regression test on x86_64-linux-gnu. >> > > My patch fails to handle ia32. Here is the updated one. > > -- > H.J.
On Wed, Mar 12, 2014 at 10:52 PM, Wei Mi <wmi@google.com> wrote: > I saw the problem last patch had on ia32. Without explicit call in rtl > template, scheduler may schedule the sp adjusting insn across tls > descriptor and break the alignment assumption. > I am testing the updated patch on x86_64. > > Can we combine the last two patches, both adding call explicitly in > rtl template for tls_local_dynamic_base_32/tls_global_dynamic_32, and > set ix86_tls_descriptor_calls_expanded_in_cfun to true only after > reload complete? > My ia32 change generates much worse code: [hjl@gnu-6 gcc]$ cat /tmp/c.i static __thread char ccc, bbb; int __cxa_get_globals() { return &ccc - &bbb; } [hjl@gnu-6 gcc]$ ./xgcc -B./ -S -O2 -fPIC /tmp/c.i [hjl@gnu-6 gcc]$ cat c.s .file "c.i" .section .text.unlikely,"ax",@progbits .LCOLDB0: .text .LHOTB0: .p2align 4,,15 .globl __cxa_get_globals .type __cxa_get_globals, @function __cxa_get_globals: .LFB0: .cfi_startproc subq $8, %rsp .cfi_def_cfa_offset 16 leaq ccc@tlsld(%rip), %rdi call __tls_get_addr@PLT addq $8, %rsp .cfi_def_cfa_offset 8 leaq ccc@dtpoff(%rax), %rcx leaq bbb@dtpoff(%rax), %rdx movq %rcx, %rax subq %rdx, %rax ret .cfi_endproc .LFE0: .size __cxa_get_globals, .-__cxa_get_globals .section .text.unlikely .LCOLDE0: .text .LHOTE0: .section .tbss,"awT",@nobits .type bbb, @object .size bbb, 1 bbb: .zero 1 .type ccc, @object .size ccc, 1 ccc: .zero 1 .ident "GCC: (GNU) 4.9.0 20140312 (experimental)" .section .note.GNU-stack,"",@progbits [hjl@gnu-6 gcc]$ cat /tmp/c.i static __thread char ccc, bbb; int __cxa_get_globals() { return &ccc - &bbb; } [hjl@gnu-6 gcc]$ ./xgcc -B./ -S -O2 -fPIC /tmp/c.i -m32 [hjl@gnu-6 gcc]$ cat c.s .file "c.i" .section .text.unlikely,"ax",@progbits .LCOLDB0: .text .LHOTB0: .p2align 4,,15 .globl __cxa_get_globals .type __cxa_get_globals, @function __cxa_get_globals: .LFB0: .cfi_startproc pushl %esi .cfi_def_cfa_offset 8 .cfi_offset 6, -8 pushl %ebx .cfi_def_cfa_offset 12 .cfi_offset 3, -12 call 
__x86.get_pc_thunk.bx addl $_GLOBAL_OFFSET_TABLE_, %ebx subl $4, %esp .cfi_def_cfa_offset 16 leal ccc@tlsldm(%ebx), %eax call ___tls_get_addr@PLT leal ccc@dtpoff(%eax), %esi leal ccc@tlsldm(%ebx), %eax call ___tls_get_addr@PLT addl $4, %esp .cfi_def_cfa_offset 12 leal bbb@dtpoff(%eax), %eax popl %ebx .cfi_restore 3 .cfi_def_cfa_offset 8 subl %eax, %esi movl %esi, %eax popl %esi .cfi_restore 6 .cfi_def_cfa_offset 4 ret .cfi_endproc Maybe we should keep the original patterns and split them to add CALL.
> > My ia32 change generates much worse code: > > [hjl@gnu-6 gcc]$ cat /tmp/c.i > static __thread char ccc, bbb; > > int __cxa_get_globals() > { > return &ccc - &bbb; > } > [hjl@gnu-6 gcc]$ ./xgcc -B./ -S -O2 -fPIC /tmp/c.i > [hjl@gnu-6 gcc]$ cat c.s > .file "c.i" > .section .text.unlikely,"ax",@progbits > .LCOLDB0: > .text > .LHOTB0: > .p2align 4,,15 > .globl __cxa_get_globals > .type __cxa_get_globals, @function > __cxa_get_globals: > .LFB0: > .cfi_startproc > subq $8, %rsp > .cfi_def_cfa_offset 16 > leaq ccc@tlsld(%rip), %rdi > call __tls_get_addr@PLT > addq $8, %rsp > .cfi_def_cfa_offset 8 > leaq ccc@dtpoff(%rax), %rcx > leaq bbb@dtpoff(%rax), %rdx > movq %rcx, %rax > subq %rdx, %rax > ret > .cfi_endproc > .LFE0: > .size __cxa_get_globals, .-__cxa_get_globals > .section .text.unlikely > .LCOLDE0: > .text > .LHOTE0: > .section .tbss,"awT",@nobits > .type bbb, @object > .size bbb, 1 > bbb: > .zero 1 > .type ccc, @object > .size ccc, 1 > ccc: > .zero 1 > .ident "GCC: (GNU) 4.9.0 20140312 (experimental)" > .section .note.GNU-stack,"",@progbits > [hjl@gnu-6 gcc]$ cat /tmp/c.i > static __thread char ccc, bbb; > > int __cxa_get_globals() > { > return &ccc - &bbb; > } > [hjl@gnu-6 gcc]$ ./xgcc -B./ -S -O2 -fPIC /tmp/c.i -m32 > [hjl@gnu-6 gcc]$ cat c.s > .file "c.i" > .section .text.unlikely,"ax",@progbits > .LCOLDB0: > .text > .LHOTB0: > .p2align 4,,15 > .globl __cxa_get_globals > .type __cxa_get_globals, @function > __cxa_get_globals: > .LFB0: > .cfi_startproc > pushl %esi > .cfi_def_cfa_offset 8 > .cfi_offset 6, -8 > pushl %ebx > .cfi_def_cfa_offset 12 > .cfi_offset 3, -12 > call __x86.get_pc_thunk.bx > addl $_GLOBAL_OFFSET_TABLE_, %ebx > subl $4, %esp > .cfi_def_cfa_offset 16 > leal ccc@tlsldm(%ebx), %eax > call ___tls_get_addr@PLT > leal ccc@dtpoff(%eax), %esi > leal ccc@tlsldm(%ebx), %eax > call ___tls_get_addr@PLT > addl $4, %esp > .cfi_def_cfa_offset 12 > leal bbb@dtpoff(%eax), %eax > popl %ebx > .cfi_restore 3 > .cfi_def_cfa_offset 8 > subl %eax, %esi > movl 
%esi, %eax > popl %esi > .cfi_restore 6 > .cfi_def_cfa_offset 4 > ret > .cfi_endproc > > Maybe we should keep the original patterns and > split them to add CALL. > > -- > H.J. I tried pr58066-3.patch on the above testcase, the code it generated seems ok. I think after we change the 32bits pattern in i386.md to be similar as 64bits pattern, we should change 32bit expand to be similar as 64bit expand in legitimize_tls_address too? Thanks, Wei. ~/workarea/gcc-r208410-2/build/install/bin/gcc -m32 -S -fPIC 1.c .file "1.c" .section .tbss,"awT",@nobits .type ccc, @object .size ccc, 1 ccc: .zero 1 .type bbb, @object .size bbb, 1 bbb: .zero 1 .text .globl __cxa_get_globals .type __cxa_get_globals, @function __cxa_get_globals: .LFB0: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 pushl %esi pushl %ebx .cfi_offset 6, -12 .cfi_offset 3, -16 call __x86.get_pc_thunk.bx addl $_GLOBAL_OFFSET_TABLE_, %ebx leal ccc@tlsgd(,%ebx,1), %eax call ___tls_get_addr@PLT movl %eax, %esi leal bbb@tlsgd(,%ebx,1), %eax call ___tls_get_addr@PLT subl %eax, %esi movl %esi, %eax popl %ebx .cfi_restore 3 popl %esi .cfi_restore 6 popl %ebp .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc .LFE0: .size __cxa_get_globals, .-__cxa_get_globals .section .text.__x86.get_pc_thunk.bx,"axG",@progbits,__x86.get_pc_thunk.bx,comdat .globl __x86.get_pc_thunk.bx .hidden __x86.get_pc_thunk.bx .type __x86.get_pc_thunk.bx, @function __x86.get_pc_thunk.bx: .LFB1: .cfi_startproc movl (%esp), %ebx ret .cfi_endproc .LFE1: .ident "GCC: (GNU) 4.9.0 20140307 (experimental)" .section .note.GNU-stack,"",@progbits
> I tried pr58066-3.patch on the above testcase, the code it generated > seems ok. I think after we change the 32bits pattern in i386.md to be > similar as 64bits pattern, we should change 32bit expand to be similar > as 64bit expand in legitimize_tls_address too? > > Thanks, > Wei. > Sorry, I pasted the wrong code. This is the code generated by pr58066-3.patch. wmi@miwei:/tmp$ cat 1.c static __thread char ccc, bbb; int __cxa_get_globals() { return &ccc - &bbb; } wmi@miwei:/tmp$ ~/workarea/gcc-r208410-2/build/install/bin/gcc -O2 -fPIC -m32 -S 1.c wmi@miwei:/tmp$ cat 1.s .file "1.c" .section .text.unlikely,"ax",@progbits .LCOLDB0: .text .LHOTB0: .p2align 4,,15 .globl __cxa_get_globals .type __cxa_get_globals, @function __cxa_get_globals: .LFB0: .cfi_startproc pushl %ebx .cfi_def_cfa_offset 8 .cfi_offset 3, -8 call __x86.get_pc_thunk.bx addl $_GLOBAL_OFFSET_TABLE_, %ebx subl $8, %esp .cfi_def_cfa_offset 16 leal ccc@tlsldm(%ebx), %eax call ___tls_get_addr@PLT addl $8, %esp .cfi_def_cfa_offset 8 leal ccc@dtpoff(%eax), %edx leal bbb@dtpoff(%eax), %eax popl %ebx .cfi_restore 3 .cfi_def_cfa_offset 4 subl %eax, %edx movl %edx, %eax ret .cfi_endproc .LFE0: .size __cxa_get_globals, .-__cxa_get_globals .section .text.unlikely .LCOLDE0: .text .LHOTE0: .section .tbss,"awT",@nobits .type bbb, @object .size bbb, 1 bbb: .zero 1 .type ccc, @object .size ccc, 1 ccc: .zero 1 .section .text.__x86.get_pc_thunk.bx,"axG",@progbits,__x86.get_pc_thunk.bx,comdat .globl __x86.get_pc_thunk.bx .hidden __x86.get_pc_thunk.bx .type __x86.get_pc_thunk.bx, @function __x86.get_pc_thunk.bx: .LFB1: .cfi_startproc movl (%esp), %ebx ret .cfi_endproc .LFE1: .ident "GCC: (GNU) 4.9.0 20140307 (experimental)" .section .note.GNU-stack,"",@progbits
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 9e33d53..da603b6 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -9490,20 +9490,30 @@ ix86_compute_frame_layout (struct ix86_frame *frame) frame->nregs = ix86_nsaved_regs (); frame->nsseregs = ix86_nsaved_sseregs (); - stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; - preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; - /* 64-bit MS ABI seem to require stack alignment to be always 16 except for function prologues and leaf. */ - if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16) + if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128) && (!crtl->is_leaf || cfun->calls_alloca != 0 || ix86_current_function_calls_tls_descriptor)) { - preferred_alignment = 16; - stack_alignment_needed = 16; crtl->preferred_stack_boundary = 128; crtl->stack_alignment_needed = 128; } + /* preferred_stack_boundary is never updated for call + expanded from tls descriptor. Update it here. We don't update it in + expand stage because according to the comments before + ix86_current_function_calls_tls_descriptor, tls calls may be optimized + away. 
*/ + else if (ix86_current_function_calls_tls_descriptor + && crtl->preferred_stack_boundary < PREFERRED_STACK_BOUNDARY) + { + crtl->preferred_stack_boundary = PREFERRED_STACK_BOUNDARY; + if (crtl->stack_alignment_needed < PREFERRED_STACK_BOUNDARY) + crtl->stack_alignment_needed = PREFERRED_STACK_BOUNDARY; + } + + stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; + preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; gcc_assert (!size || stack_alignment_needed); gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index ea1d85f..bc8fb1f 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -12859,13 +12859,14 @@ (define_insn "*tls_global_dynamic_32_gnu" [(set (match_operand:SI 0 "register_operand" "=a") - (unspec:SI - [(match_operand:SI 1 "register_operand" "b") - (match_operand 2 "tls_symbolic_operand") - (match_operand 3 "constant_call_address_operand" "z")] - UNSPEC_TLS_GD)) - (clobber (match_scratch:SI 4 "=d")) - (clobber (match_scratch:SI 5 "=c")) + (call:SI + (mem:QI (match_operand 3 "constant_call_address_operand" "z")) + (match_operand 4))) + (unspec:SI [(match_operand:SI 1 "register_operand" "b") + (match_operand 2 "tls_symbolic_operand")] + UNSPEC_TLS_GD) + (clobber (match_scratch:SI 5 "=d")) + (clobber (match_scratch:SI 6 "=c")) (clobber (reg:CC FLAGS_REG))] "!TARGET_64BIT && TARGET_GNU_TLS" { @@ -12885,13 +12886,19 @@ (define_expand "tls_global_dynamic_32" [(parallel [(set (match_operand:SI 0 "register_operand") - (unspec:SI [(match_operand:SI 2 "register_operand") - (match_operand 1 "tls_symbolic_operand") - (match_operand 3 "constant_call_address_operand")] - UNSPEC_TLS_GD)) + (call:SI + (mem:QI (match_operand 3 "constant_call_address_operand")) + (const_int 0))) + (unspec:SI [(match_operand:SI 2 "register_operand") + (match_operand 1 "tls_symbolic_operand")] + UNSPEC_TLS_GD) (clobber (match_scratch:SI 4)) (clobber 
(match_scratch:SI 5)) - (clobber (reg:CC FLAGS_REG))])]) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) (define_insn "*tls_global_dynamic_64_<mode>" [(set (match_operand:P 0 "register_operand" "=a") @@ -12946,16 +12953,20 @@ (const_int 0))) (unspec:P [(match_operand 1 "tls_symbolic_operand")] UNSPEC_TLS_GD)])] - "TARGET_64BIT") + "TARGET_64BIT" +{ + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) (define_insn "*tls_local_dynamic_base_32_gnu" [(set (match_operand:SI 0 "register_operand" "=a") - (unspec:SI - [(match_operand:SI 1 "register_operand" "b") - (match_operand 2 "constant_call_address_operand" "z")] - UNSPEC_TLS_LD_BASE)) - (clobber (match_scratch:SI 3 "=d")) - (clobber (match_scratch:SI 4 "=c")) + (call:SI + (mem:QI (match_operand 2 "constant_call_address_operand" "z")) + (match_operand 3))) + (unspec:SI [(match_operand:SI 1 "register_operand" "b")] + UNSPEC_TLS_LD_BASE) + (clobber (match_scratch:SI 4 "=d")) + (clobber (match_scratch:SI 5 "=c")) (clobber (reg:CC FLAGS_REG))] "!TARGET_64BIT && TARGET_GNU_TLS" { @@ -12976,13 +12987,18 @@ (define_expand "tls_local_dynamic_base_32" [(parallel [(set (match_operand:SI 0 "register_operand") - (unspec:SI - [(match_operand:SI 1 "register_operand") - (match_operand 2 "constant_call_address_operand")] - UNSPEC_TLS_LD_BASE)) + (call:SI + (mem:QI (match_operand 2 "constant_call_address_operand")) + (const_int 0))) + (unspec:SI [(match_operand:SI 1 "register_operand")] + UNSPEC_TLS_LD_BASE) (clobber (match_scratch:SI 3)) (clobber (match_scratch:SI 4)) - (clobber (reg:CC FLAGS_REG))])]) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) (define_insn "*tls_local_dynamic_base_64_<mode>" [(set (match_operand:P 0 "register_operand" "=a") @@ -13029,33 +13045,10 @@ (mem:QI (match_operand 1)) (const_int 0))) (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)])] - "TARGET_64BIT") - -;; Local dynamic of a single variable is a 
lose. Show combine how -;; to convert that back to global dynamic. - -(define_insn_and_split "*tls_local_dynamic_32_once" - [(set (match_operand:SI 0 "register_operand" "=a") - (plus:SI - (unspec:SI [(match_operand:SI 1 "register_operand" "b") - (match_operand 2 "constant_call_address_operand" "z")] - UNSPEC_TLS_LD_BASE) - (const:SI (unspec:SI - [(match_operand 3 "tls_symbolic_operand")] - UNSPEC_DTPOFF)))) - (clobber (match_scratch:SI 4 "=d")) - (clobber (match_scratch:SI 5 "=c")) - (clobber (reg:CC FLAGS_REG))] - "" - "#" - "" - [(parallel - [(set (match_dup 0) - (unspec:SI [(match_dup 1) (match_dup 3) (match_dup 2)] - UNSPEC_TLS_GD)) - (clobber (match_dup 4)) - (clobber (match_dup 5)) - (clobber (reg:CC FLAGS_REG))])]) + "TARGET_64BIT" +{ + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) ;; Segment register for the thread base ptr load (define_mode_attr tp_seg [(SI "gs") (DI "fs")])