diff mbox

PATCH: Properly generate X32 IE sequence

Message ID CAMe9rOoCPj=ScD8JiZc7gHbMQ1oBNjiAuVY5Mr=ARy+b7PJ+2w@mail.gmail.com
State New
Headers show

Commit Message

H.J. Lu March 11, 2012, 5:11 p.m. UTC
On Sat, Mar 10, 2012 at 10:49 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Mar 10, 2012 at 5:09 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Fri, Mar 9, 2012 at 11:26 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Mon, Mar 5, 2012 at 9:25 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>>> On Mon, Mar 5, 2012 at 6:03 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>
>>>>>>> X86-64 linker optimizes TLS_MODEL_INITIAL_EXEC to TLS_MODEL_LOCAL_EXEC
>>>>>>> by checking
>>>>>>>
>>>>>>>        movq foo@gottpoff(%rip), %reg
>>>>>>>
>>>>>>> and
>>>>>>>
>>>>>>>        addq foo@gottpoff(%rip), %reg
>>>>>>>
>>>>>>> It uses the REX prefix to avoid the last byte of the previous
>>>>>>> instruction.  With 32bit Pmode, we may not have the REX prefix and
>>>>>>> the last byte of the previous instruction may be an offset, which
>>>>>>> may look like a REX prefix.  IE->LE optimization will generate corrupted
>>>>>>> binary.  This patch makes sure we always output a REX prefix for
>>>>>>> UNSPEC_GOTNTPOFF.  OK for trunk?
>>>>>>
>>>>>> Actually, linker has:
>>>>>>
>>>>>>    case R_X86_64_GOTTPOFF:
>>>>>>      /* Check transition from IE access model:
>>>>>>                mov foo@gottpoff(%rip), %reg
>>>>>>                add foo@gottpoff(%rip), %reg
>>>>>>       */
>>>>>>
>>>>>>      /* Check REX prefix first.  */
>>>>>>      if (offset >= 3 && (offset + 4) <= sec->size)
>>>>>>        {
>>>>>>          val = bfd_get_8 (abfd, contents + offset - 3);
>>>>>>          if (val != 0x48 && val != 0x4c)
>>>>>>            {
>>>>>>              /* X32 may have 0x44 REX prefix or no REX prefix.  */
>>>>>>              if (ABI_64_P (abfd))
>>>>>>                return FALSE;
>>>>>>            }
>>>>>>        }
>>>>>>      else
>>>>>>        {
>>>>>>          /* X32 may not have any REX prefix.  */
>>>>>>          if (ABI_64_P (abfd))
>>>>>>            return FALSE;
>>>>>>          if (offset < 2 || (offset + 3) > sec->size)
>>>>>>            return FALSE;
>>>>>>        }
>>>>>>
>>>>>> So, it should handle the case without REX just OK. If it doesn't, then
>>>>>> this is a bug in binutils.
>>>>>>
>>>>>
>>>>> The last byte of the displacement in the previous instruction
>>>>> may happen to look like a REX byte. In that case, linker
>>>>> will overwrite the last byte of the previous instruction and
>>>>> generate the wrong instruction sequence.
>>>>>
>>>>> I need to update linker to enforce the REX byte check.
>>>>
>>>> One important observation: if we want to follow the x86_64 TLS spec
>>>> strictly, we have to use existing DImode patterns only. This also
>>>> means that we should NOT convert other TLS patterns to Pmode, since
>>>> they explicitly state movq and addq. If this is not the case, then we
>>>> need new TLS specification for X32.
>>>
>>> Here is a patch to properly generate X32 IE sequence.
>>>
>>> This is the summary of differences between x86-64 TLS and x32 TLS:
>>>
>>>                     x86-64                               x32
>>> GD
>>>    .byte 0x66; leaq foo@tlsgd(%rip),%rdi;        leaq foo@tlsgd(%rip),%rdi;
>>>    .word 0x6666; rex64; call __tls_get_addr@plt  .word 0x6666; rex64;
>>> call __tls_get_addr@plt
>>>
>>> GD->IE optimization
>>>   movq %fs:0,%rax; addq x@gottpoff(%rip),%rax    movl %fs:0,%eax;
>>> addq x@gottpoff(%rip),%rax
>>>
>>> GD->LE optimization
>>>   movq %fs:0,%rax; leaq x@tpoff(%rax),%rax       movl %fs:0,%eax;
>>> leaq x@tpoff(%rax),%rax
>>>
>>> LD
>>>  leaq foo@tlsld(%rip),%rdi;                      leaq foo@tlsld(%rip),%rdi;
>>>  call __tls_get_addr@plt                         call __tls_get_addr@plt
>>>
>>> LD->LE optimization
>>>  .word 0x6666; .byte 0x66; movq %fs:0, %rax      nopl 0x0(%rax); movl
>>> %fs:0, %eax
>>>
>>> IE
>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>   addq x@gottpoff(%rip),%reg64                   addl x@gottpoff(%rip),%reg32
>>>
>>>   or
>>>                                                  Not supported if
>>> Pmode == SImode
>>>   movq x@gottpoff(%rip),%reg64;                  movq x@gottpoff(%rip),%reg64;
>>>   movq %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>
>>> IE->LE optimization
>>>
>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>   addq x@gottpoff(%rip),%reg64                   addl x@gottpoff(%rip),%reg32
>>>
>>>   to
>>>
>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>   addq foo@tpoff, %reg64                         addl foo@tpoff, %reg32
>>>
>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>   leaq foo@tpoff(%reg64), %reg64                 leal foo@tpoff(%reg32), %reg32
>>>
>>>   or
>>>
>>>   movq x@gottpoff(%rip),%reg64                   movq x@gottpoff(%rip),%reg64;
>>>   movl %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>
>>>   to
>>>
>>>   movq foo@tpoff, %reg64                         movq foo@tpoff, %reg64
>>>   movl %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>
>>> LE
>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>   leaq x@tpoff(%reg64),%reg32                    leal x@tpoff(%reg32),%reg32
>>>
>>>   or
>>>
>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>   addq $x@tpoff,%reg64                           addl $x@tpoff,%reg32
>>>
>>>   or
>>>
>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>   movl x@tpoff(%reg64),%reg32                    movl x@tpoff(%reg32),%reg32
>>>
>>>   or
>>>
>>>   movl %fs:x@tpoff,%reg32                        movl %fs:x@tpoff,%reg32
>>>
>>>
>>> X32 TLS implementation is straight forward, except for IE:
>>>
>>> 1. Since address override works only on the (reg32) part in fs:(reg32),
>>> we can't use it as memory operand.  This patch changes ix86_decompose_address
>>> to disallow  fs:(reg) if Pmode != word_mode.
>>> 2. When Pmode == SImode, there may be no REX prefix for ADD.  Avoid
>>> any instructions between MOV and ADD, which may interfere linker
>>> IE->LE optimization, since the last byte of the previous instruction
>>> before ADD may look like a REX prefix.  This patch adds tls_initial_exec_x32
>>> to make sure that we always have
>>>
>>> movl %fs:0, %reg32
>>> addl x@gottpoff(%rip), %reg32
>>>
>>> so that the last byte of the previous instruction before ADD will
>>> never be a REX byte.  Tested on Linux/x32.
>>>
>>> 2012-03-09  H.J. Lu  <hongjiu.lu@intel.com>
>>>
>>>        * config/i386/i386.c (ix86_decompose_address): Disallow fs:(reg)
>>>        if Pmode != word_mode.
>>>        (legitimize_tls_address): Call gen_tls_initial_exec_x32 if
>>>        Pmode == SImode for x32.
>>>
>>>        * config/i386/i386.md (UNSPEC_TLS_IE_X32): New.
>>>        (tls_initial_exec_x32): Likewise.
>>
>> Nice solution!
>>
>> OK for mainline.
>
> Done.
>
>> BTW: Did you investigate the issue with memory aliasing?
>>
>
> It isn't a problem since it is wrapped in UNSPEC_TLS_IE_X32
> which loads address of the TLS symbol.
>
> Thanks.
>

Since we must use reg64 in %fs:(%reg) memory operand like

movq x@gottpoff(%rip),%reg64;
mov %fs:(%reg64),%reg

this patch optimizes x32 TLS IE load and store by wrapping
%reg64 inside of UNSPEC when Pmode == SImode.  OK for
trunk?

Thanks.

Comments

Uros Bizjak March 11, 2012, 5:55 p.m. UTC | #1
On Sun, Mar 11, 2012 at 6:11 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

>>>>>>>> X86-64 linker optimizes TLS_MODEL_INITIAL_EXEC to TLS_MODEL_LOCAL_EXEC
>>>>>>>> by checking
>>>>>>>>
>>>>>>>>        movq foo@gottpoff(%rip), %reg
>>>>>>>>
>>>>>>>> and
>>>>>>>>
>>>>>>>>        addq foo@gottpoff(%rip), %reg
>>>>>>>>
>>>>>>>> It uses the REX prefix to avoid the last byte of the previous
>>>>>>>> instruction.  With 32bit Pmode, we may not have the REX prefix and
>>>>>>>> the last byte of the previous instruction may be an offset, which
>>>>>>>> may look like a REX prefix.  IE->LE optimization will generate corrupted
>>>>>>>> binary.  This patch makes sure we always output a REX prefix for
>>>>>>>> UNSPEC_GOTNTPOFF.  OK for trunk?
>>>>>>>
>>>>>>> Actually, linker has:
>>>>>>>
>>>>>>>    case R_X86_64_GOTTPOFF:
>>>>>>>      /* Check transition from IE access model:
>>>>>>>                mov foo@gottpoff(%rip), %reg
>>>>>>>                add foo@gottpoff(%rip), %reg
>>>>>>>       */
>>>>>>>
>>>>>>>      /* Check REX prefix first.  */
>>>>>>>      if (offset >= 3 && (offset + 4) <= sec->size)
>>>>>>>        {
>>>>>>>          val = bfd_get_8 (abfd, contents + offset - 3);
>>>>>>>          if (val != 0x48 && val != 0x4c)
>>>>>>>            {
>>>>>>>              /* X32 may have 0x44 REX prefix or no REX prefix.  */
>>>>>>>              if (ABI_64_P (abfd))
>>>>>>>                return FALSE;
>>>>>>>            }
>>>>>>>        }
>>>>>>>      else
>>>>>>>        {
>>>>>>>          /* X32 may not have any REX prefix.  */
>>>>>>>          if (ABI_64_P (abfd))
>>>>>>>            return FALSE;
>>>>>>>          if (offset < 2 || (offset + 3) > sec->size)
>>>>>>>            return FALSE;
>>>>>>>        }
>>>>>>>
>>>>>>> So, it should handle the case without REX just OK. If it doesn't, then
>>>>>>> this is a bug in binutils.
>>>>>>>
>>>>>>
>>>>>> The last byte of the displacement in the previous instruction
>>>>>> may happen to look like a REX byte. In that case, linker
>>>>>> will overwrite the last byte of the previous instruction and
>>>>>> generate the wrong instruction sequence.
>>>>>>
>>>>>> I need to update linker to enforce the REX byte check.
>>>>>
>>>>> One important observation: if we want to follow the x86_64 TLS spec
>>>>> strictly, we have to use existing DImode patterns only. This also
>>>>> means that we should NOT convert other TLS patterns to Pmode, since
>>>>> they explicitly state movq and addq. If this is not the case, then we
>>>>> need new TLS specification for X32.
>>>>
>>>> Here is a patch to properly generate X32 IE sequence.
>>>>
>>>> This is the summary of differences between x86-64 TLS and x32 TLS:
>>>>
>>>>                     x86-64                               x32
>>>> GD
>>>>    .byte 0x66; leaq foo@tlsgd(%rip),%rdi;        leaq foo@tlsgd(%rip),%rdi;
>>>>    .word 0x6666; rex64; call __tls_get_addr@plt  .word 0x6666; rex64;
>>>> call __tls_get_addr@plt
>>>>
>>>> GD->IE optimization
>>>>   movq %fs:0,%rax; addq x@gottpoff(%rip),%rax    movl %fs:0,%eax;
>>>> addq x@gottpoff(%rip),%rax
>>>>
>>>> GD->LE optimization
>>>>   movq %fs:0,%rax; leaq x@tpoff(%rax),%rax       movl %fs:0,%eax;
>>>> leaq x@tpoff(%rax),%rax
>>>>
>>>> LD
>>>>  leaq foo@tlsld(%rip),%rdi;                      leaq foo@tlsld(%rip),%rdi;
>>>>  call __tls_get_addr@plt                         call __tls_get_addr@plt
>>>>
>>>> LD->LE optimization
>>>>  .word 0x6666; .byte 0x66; movq %fs:0, %rax      nopl 0x0(%rax); movl
>>>> %fs:0, %eax
>>>>
>>>> IE
>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>   addq x@gottpoff(%rip),%reg64                   addl x@gottpoff(%rip),%reg32
>>>>
>>>>   or
>>>>                                                  Not supported if
>>>> Pmode == SImode
>>>>   movq x@gottpoff(%rip),%reg64;                  movq x@gottpoff(%rip),%reg64;
>>>>   movq %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>>
>>>> IE->LE optimization
>>>>
>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>   addq x@gottpoff(%rip),%reg64                   addl x@gottpoff(%rip),%reg32
>>>>
>>>>   to
>>>>
>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>   addq foo@tpoff, %reg64                         addl foo@tpoff, %reg32
>>>>
>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>   leaq foo@tpoff(%reg64), %reg64                 leal foo@tpoff(%reg32), %reg32
>>>>
>>>>   or
>>>>
>>>>   movq x@gottpoff(%rip),%reg64                   movq x@gottpoff(%rip),%reg64;
>>>>   movl %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>>
>>>>   to
>>>>
>>>>   movq foo@tpoff, %reg64                         movq foo@tpoff, %reg64
>>>>   movl %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>>
>>>> LE
>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>   leaq x@tpoff(%reg64),%reg32                    leal x@tpoff(%reg32),%reg32
>>>>
>>>>   or
>>>>
>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>   addq $x@tpoff,%reg64                           addl $x@tpoff,%reg32
>>>>
>>>>   or
>>>>
>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>   movl x@tpoff(%reg64),%reg32                    movl x@tpoff(%reg32),%reg32
>>>>
>>>>   or
>>>>
>>>>   movl %fs:x@tpoff,%reg32                        movl %fs:x@tpoff,%reg32
>>>>
>>>>
>>>> X32 TLS implementation is straight forward, except for IE:
>>>>
>>>> 1. Since address override works only on the (reg32) part in fs:(reg32),
>>>> we can't use it as memory operand.  This patch changes ix86_decompose_address
>>>> to disallow  fs:(reg) if Pmode != word_mode.
>>>> 2. When Pmode == SImode, there may be no REX prefix for ADD.  Avoid
>>>> any instructions between MOV and ADD, which may interfere linker
>>>> IE->LE optimization, since the last byte of the previous instruction
>>>> before ADD may look like a REX prefix.  This patch adds tls_initial_exec_x32
>>>> to make sure that we always have
>>>>
>>>> movl %fs:0, %reg32
>>>> addl x@gottpoff(%rip), %reg32
>>>>
>>>> so that the last byte of the previous instruction before ADD will
>>>> never be a REX byte.  Tested on Linux/x32.
>>>>
>>>> 2012-03-09  H.J. Lu  <hongjiu.lu@intel.com>
>>>>
>>>>        * config/i386/i386.c (ix86_decompose_address): Disallow fs:(reg)
>>>>        if Pmode != word_mode.
>>>>        (legitimize_tls_address): Call gen_tls_initial_exec_x32 if
>>>>        Pmode == SImode for x32.
>>>>
>>>>        * config/i386/i386.md (UNSPEC_TLS_IE_X32): New.
>>>>        (tls_initial_exec_x32): Likewise.
>>>
>>> Nice solution!
>>>
>>> OK for mainline.
>>
>> Done.
>>
>>> BTW: Did you investigate the issue with memory aliasing?
>>>
>>
>> It isn't a problem since it is wrapped in UNSPEC_TLS_IE_X32
>> which loads address of the TLS symbol.
>>
>> Thanks.
>>
>
> Since we must use reg64 in %fs:(%reg) memory operand like
>
> movq x@gottpoff(%rip),%reg64;
> mov %fs:(%reg64),%reg
>
> this patch optimizes x32 TLS IE load and store by wrapping
> %reg64 inside of UNSPEC when Pmode == SImode.  OK for
> trunk?

I think we should just scrap all these complications and go with the
idea of clearing MASK_TLS_DIRECT_SEG_REFS.

Uros.
H.J. Lu March 11, 2012, 6:16 p.m. UTC | #2
On Sun, Mar 11, 2012 at 10:55 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Sun, Mar 11, 2012 at 6:11 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>
>>>>>>>>> X86-64 linker optimizes TLS_MODEL_INITIAL_EXEC to TLS_MODEL_LOCAL_EXEC
>>>>>>>>> by checking
>>>>>>>>>
>>>>>>>>>        movq foo@gottpoff(%rip), %reg
>>>>>>>>>
>>>>>>>>> and
>>>>>>>>>
>>>>>>>>>        addq foo@gottpoff(%rip), %reg
>>>>>>>>>
>>>>>>>>> It uses the REX prefix to avoid the last byte of the previous
>>>>>>>>> instruction.  With 32bit Pmode, we may not have the REX prefix and
>>>>>>>>> the last byte of the previous instruction may be an offset, which
>>>>>>>>> may look like a REX prefix.  IE->LE optimization will generate corrupted
>>>>>>>>> binary.  This patch makes sure we always output a REX prefix for
>>>>>>>>> UNSPEC_GOTNTPOFF.  OK for trunk?
>>>>>>>>
>>>>>>>> Actually, linker has:
>>>>>>>>
>>>>>>>>    case R_X86_64_GOTTPOFF:
>>>>>>>>      /* Check transition from IE access model:
>>>>>>>>                mov foo@gottpoff(%rip), %reg
>>>>>>>>                add foo@gottpoff(%rip), %reg
>>>>>>>>       */
>>>>>>>>
>>>>>>>>      /* Check REX prefix first.  */
>>>>>>>>      if (offset >= 3 && (offset + 4) <= sec->size)
>>>>>>>>        {
>>>>>>>>          val = bfd_get_8 (abfd, contents + offset - 3);
>>>>>>>>          if (val != 0x48 && val != 0x4c)
>>>>>>>>            {
>>>>>>>>              /* X32 may have 0x44 REX prefix or no REX prefix.  */
>>>>>>>>              if (ABI_64_P (abfd))
>>>>>>>>                return FALSE;
>>>>>>>>            }
>>>>>>>>        }
>>>>>>>>      else
>>>>>>>>        {
>>>>>>>>          /* X32 may not have any REX prefix.  */
>>>>>>>>          if (ABI_64_P (abfd))
>>>>>>>>            return FALSE;
>>>>>>>>          if (offset < 2 || (offset + 3) > sec->size)
>>>>>>>>            return FALSE;
>>>>>>>>        }
>>>>>>>>
>>>>>>>> So, it should handle the case without REX just OK. If it doesn't, then
>>>>>>>> this is a bug in binutils.
>>>>>>>>
>>>>>>>
>>>>>>> The last byte of the displacement in the previous instruction
>>>>>>> may happen to look like a REX byte. In that case, linker
>>>>>>> will overwrite the last byte of the previous instruction and
>>>>>>> generate the wrong instruction sequence.
>>>>>>>
>>>>>>> I need to update linker to enforce the REX byte check.
>>>>>>
>>>>>> One important observation: if we want to follow the x86_64 TLS spec
>>>>>> strictly, we have to use existing DImode patterns only. This also
>>>>>> means that we should NOT convert other TLS patterns to Pmode, since
>>>>>> they explicitly state movq and addq. If this is not the case, then we
>>>>>> need new TLS specification for X32.
>>>>>
>>>>> Here is a patch to properly generate X32 IE sequence.
>>>>>
>>>>> This is the summary of differences between x86-64 TLS and x32 TLS:
>>>>>
>>>>>                     x86-64                               x32
>>>>> GD
>>>>>    .byte 0x66; leaq foo@tlsgd(%rip),%rdi;        leaq foo@tlsgd(%rip),%rdi;
>>>>>    .word 0x6666; rex64; call __tls_get_addr@plt  .word 0x6666; rex64;
>>>>> call __tls_get_addr@plt
>>>>>
>>>>> GD->IE optimization
>>>>>   movq %fs:0,%rax; addq x@gottpoff(%rip),%rax    movl %fs:0,%eax;
>>>>> addq x@gottpoff(%rip),%rax
>>>>>
>>>>> GD->LE optimization
>>>>>   movq %fs:0,%rax; leaq x@tpoff(%rax),%rax       movl %fs:0,%eax;
>>>>> leaq x@tpoff(%rax),%rax
>>>>>
>>>>> LD
>>>>>  leaq foo@tlsld(%rip),%rdi;                      leaq foo@tlsld(%rip),%rdi;
>>>>>  call __tls_get_addr@plt                         call __tls_get_addr@plt
>>>>>
>>>>> LD->LE optimization
>>>>>  .word 0x6666; .byte 0x66; movq %fs:0, %rax      nopl 0x0(%rax); movl
>>>>> %fs:0, %eax
>>>>>
>>>>> IE
>>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>>   addq x@gottpoff(%rip),%reg64                   addl x@gottpoff(%rip),%reg32
>>>>>
>>>>>   or
>>>>>                                                  Not supported if
>>>>> Pmode == SImode
>>>>>   movq x@gottpoff(%rip),%reg64;                  movq x@gottpoff(%rip),%reg64;
>>>>>   movq %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>>>
>>>>> IE->LE optimization
>>>>>
>>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>>   addq x@gottpoff(%rip),%reg64                   addl x@gottpoff(%rip),%reg32
>>>>>
>>>>>   to
>>>>>
>>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>>   addq foo@tpoff, %reg64                         addl foo@tpoff, %reg32
>>>>>
>>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>>   leaq foo@tpoff(%reg64), %reg64                 leal foo@tpoff(%reg32), %reg32
>>>>>
>>>>>   or
>>>>>
>>>>>   movq x@gottpoff(%rip),%reg64                   movq x@gottpoff(%rip),%reg64;
>>>>>   movl %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>>>
>>>>>   to
>>>>>
>>>>>   movq foo@tpoff, %reg64                         movq foo@tpoff, %reg64
>>>>>   movl %fs:(%reg64),%reg32                       movl %fs:(%reg64), %reg32
>>>>>
>>>>> LE
>>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>>   leaq x@tpoff(%reg64),%reg32                    leal x@tpoff(%reg32),%reg32
>>>>>
>>>>>   or
>>>>>
>>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>>   addq $x@tpoff,%reg64                           addl $x@tpoff,%reg32
>>>>>
>>>>>   or
>>>>>
>>>>>   movq %fs:0,%reg64;                             movl %fs:0,%reg32;
>>>>>   movl x@tpoff(%reg64),%reg32                    movl x@tpoff(%reg32),%reg32
>>>>>
>>>>>   or
>>>>>
>>>>>   movl %fs:x@tpoff,%reg32                        movl %fs:x@tpoff,%reg32
>>>>>
>>>>>
>>>>> X32 TLS implementation is straight forward, except for IE:
>>>>>
>>>>> 1. Since address override works only on the (reg32) part in fs:(reg32),
>>>>> we can't use it as memory operand.  This patch changes ix86_decompose_address
>>>>> to disallow  fs:(reg) if Pmode != word_mode.
>>>>> 2. When Pmode == SImode, there may be no REX prefix for ADD.  Avoid
>>>>> any instructions between MOV and ADD, which may interfere linker
>>>>> IE->LE optimization, since the last byte of the previous instruction
>>>>> before ADD may look like a REX prefix.  This patch adds tls_initial_exec_x32
>>>>> to make sure that we always have
>>>>>
>>>>> movl %fs:0, %reg32
>>>>> addl x@gottpoff(%rip), %reg32
>>>>>
>>>>> so that the last byte of the previous instruction before ADD will
>>>>> never be a REX byte.  Tested on Linux/x32.
>>>>>
>>>>> 2012-03-09  H.J. Lu  <hongjiu.lu@intel.com>
>>>>>
>>>>>        * config/i386/i386.c (ix86_decompose_address): Disallow fs:(reg)
>>>>>        if Pmode != word_mode.
>>>>>        (legitimize_tls_address): Call gen_tls_initial_exec_x32 if
>>>>>        Pmode == SImode for x32.
>>>>>
>>>>>        * config/i386/i386.md (UNSPEC_TLS_IE_X32): New.
>>>>>        (tls_initial_exec_x32): Likewise.
>>>>
>>>> Nice solution!
>>>>
>>>> OK for mainline.
>>>
>>> Done.
>>>
>>>> BTW: Did you investigate the issue with memory aliasing?
>>>>
>>>
>>> It isn't a problem since it is wrapped in UNSPEC_TLS_IE_X32
>>> which loads address of the TLS symbol.
>>>
>>> Thanks.
>>>
>>
>> Since we must use reg64 in %fs:(%reg) memory operand like
>>
>> movq x@gottpoff(%rip),%reg64;
>> mov %fs:(%reg64),%reg
>>
>> this patch optimizes x32 TLS IE load and store by wrapping
>> %reg64 inside of UNSPEC when Pmode == SImode.  OK for
>> trunk?
>
> I think we should just scrap all these complications and go with the
> idea of clearing MASK_TLS_DIRECT_SEG_REFS.
>

I will give it a try.
Uros Bizjak March 11, 2012, 6:21 p.m. UTC | #3
On Sun, Mar 11, 2012 at 7:16 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

>>>>>>        * config/i386/i386.c (ix86_decompose_address): Disallow fs:(reg)
>>>>>>        if Pmode != word_mode.
>>>>>>        (legitimize_tls_address): Call gen_tls_initial_exec_x32 if
>>>>>>        Pmode == SImode for x32.
>>>>>>
>>>>>>        * config/i386/i386.md (UNSPEC_TLS_IE_X32): New.
>>>>>>        (tls_initial_exec_x32): Likewise.
>>>>>
>>>>> Nice solution!
>>>>>
>>>>> OK for mainline.
>>>>
>>>> Done.
>>>>
>>>>> BTW: Did you investigate the issue with memory aliasing?
>>>>>
>>>>
>>>> It isn't a problem since it is wrapped in UNSPEC_TLS_IE_X32
>>>> which loads address of the TLS symbol.
>>>>
>>>> Thanks.
>>>>
>>>
>>> Since we must use reg64 in %fs:(%reg) memory operand like
>>>
>>> movq x@gottpoff(%rip),%reg64;
>>> mov %fs:(%reg64),%reg
>>>
>>> this patch optimizes x32 TLS IE load and store by wrapping
>>> %reg64 inside of UNSPEC when Pmode == SImode.  OK for
>>> trunk?
>>
>> I think we should just scrap all these complications and go with the
>> idea of clearing MASK_TLS_DIRECT_SEG_REFS.
>>
>
> I will give it a try.

You can also revert:

>>>>>>        * config/i386/i386.c (ix86_decompose_address): Disallow fs:(reg)
>>>>>>        if Pmode != word_mode.

then, since this part is handled later in the function.

Uros.
Uros Bizjak March 17, 2012, 6:10 p.m. UTC | #4
On Sun, Mar 11, 2012 at 6:11 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

> Since we must use reg64 in %fs:(%reg) memory operand like
>
> movq x@gottpoff(%rip),%reg64;
> mov %fs:(%reg64),%reg
>
> this patch optimizes x32 TLS IE load and store by wrapping
> %reg64 inside of UNSPEC when Pmode == SImode.  OK for
> trunk?
>
> Thanks.
>
> --
> H.J.
> ---
> 2012-03-11  H.J. Lu  <hongjiu.lu@intel.com>
>
>        * config/i386/i386.md (*tls_initial_exec_x32_load): New.
>        (*tls_initial_exec_x32_store): Likewise.

Can you implement this with define_insn_and_split, like i.e.
*tls_dynamic_gnu2_combine_32 ?

Uros.
H.J. Lu March 17, 2012, 6:18 p.m. UTC | #5
On Sat, Mar 17, 2012 at 11:10 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Sun, Mar 11, 2012 at 6:11 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>
>> Since we must use reg64 in %fs:(%reg) memory operand like
>>
>> movq x@gottpoff(%rip),%reg64;
>> mov %fs:(%reg64),%reg
>>
>> this patch optimizes x32 TLS IE load and store by wrapping
>> %reg64 inside of UNSPEC when Pmode == SImode.  OK for
>> trunk?
>>
>> Thanks.
>>
>> --
>> H.J.
>> ---
>> 2012-03-11  H.J. Lu  <hongjiu.lu@intel.com>
>>
>>        * config/i386/i386.md (*tls_initial_exec_x32_load): New.
>>        (*tls_initial_exec_x32_store): Likewise.
>
> Can you implement this with define_insn_and_split, like i.e.
> *tls_dynamic_gnu2_combine_32 ?
>

I will give it a try again.  Last time when I tried it, GCC didn't
like memory operand in DImode when Pmode == SImode.
Uros Bizjak March 17, 2012, 6:20 p.m. UTC | #6
On Sat, Mar 17, 2012 at 7:18 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

>>> Since we must use reg64 in %fs:(%reg) memory operand like
>>>
>>> movq x@gottpoff(%rip),%reg64;
>>> mov %fs:(%reg64),%reg
>>>
>>> this patch optimizes x32 TLS IE load and store by wrapping
>>> %reg64 inside of UNSPEC when Pmode == SImode.  OK for
>>> trunk?
>>>
>>> Thanks.
>>>
>>> --
>>> H.J.
>>> ---
>>> 2012-03-11  H.J. Lu  <hongjiu.lu@intel.com>
>>>
>>>        * config/i386/i386.md (*tls_initial_exec_x32_load): New.
>>>        (*tls_initial_exec_x32_store): Likewise.
>>
>> Can you implement this with define_insn_and_split, like i.e.
>> *tls_dynamic_gnu2_combine_32 ?
>>
>
> I will give it a try again.  Last time when I tried it, GCC didn't
> like memory operand in DImode when Pmode == SImode.

You should remove mode for tls_symbolic_operand predicate.

Uros.
diff mbox

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index ae1dd1c..67441cd 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12806,6 +12806,41 @@ 
 }
   [(set_attr "type" "multi")])
 
+(define_insn "*tls_initial_exec_x32_load"
+  [(set (match_operand:SWI1248x 0 "register_operand" "=r")
+        (mem:SWI1248x
+	  (unspec:SI
+	   [(match_operand:SI 1 "tls_symbolic_operand" "")]
+	   UNSPEC_TLS_IE_X32)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_X32"
+{
+  output_asm_insn
+    ("mov{q}\t{%a1@gottpoff(%%rip), %q0|%q0, %a1@gottpoff[rip]}",
+     operands);
+  if (!TARGET_MOVX || <MODE>mode == DImode || <MODE>mode == SImode)
+    return "mov{<imodesuffix>}\t{%%fs:(%q0), %0|%0, <iptrsize> PTR fs:[%q0]}";
+    return "movz{<imodesuffix>l|x}\t{%%fs:(%q0), %k0|%k0, <iptrsize> PTR fs:[%q0]}";
+}
+  [(set_attr "type" "multi")])
+
+(define_insn "*tls_initial_exec_x32_store"
+  [(set (mem:SWI1248x
+	  (unspec:SI
+	   [(match_operand:SI 0 "tls_symbolic_operand" "")]
+	   UNSPEC_TLS_IE_X32))
+  	(match_operand:SWI1248x 1 "register_operand" "r"))
+   (clobber (match_scratch:DI 2 "=&r"))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_X32"
+{
+  output_asm_insn
+    ("mov{q}\t{%a0@gottpoff(%%rip), %q2|%q2, %a0@gottpoff[rip]}",
+     operands);
+  return "mov{<imodesuffix>}\t{%1, %%fs:(%q2)|<iptrsize> PTR fs:[%q2], %1}";
+}
+  [(set_attr "type" "multi")])
+
 ;; GNU2 TLS patterns can be split.
 
 (define_expand "tls_dynamic_gnu2_32"