diff mbox

PATCH: Properly generate X32 IE sequence

Message ID CAMe9rOphPyFPMfHLyPFGJxdBtPyz4j4vuK5YLp-BEX6zjji2-g@mail.gmail.com
State New
Headers show

Commit Message

H.J. Lu March 19, 2012, 4:34 p.m. UTC
On Mon, Mar 19, 2012 at 9:19 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Mar 19, 2012 at 8:54 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Mon, Mar 19, 2012 at 8:51 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Sun, Mar 18, 2012 at 1:55 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>>> On Sun, Mar 18, 2012 at 5:01 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>>>
>>>>>> I am testing this patch.  OK for trunk if it passes all tests?
>>>>>
>>>>> No, force_reg will generate a pseudo, so this conversion is valid only
>>>>> for !can_create_pseudo ().
>>>>>
>>>>> At least for *tls_initial_exec_x32_store, you will need a temporary to
>>>>> split the pattern after reload.
>>>
>>> Here is the updated patch to add can_create_pseudo.  I also changed
>>> tls_initial_exec_x32 to take an input register operand as thread pointer.
>>>
>>>> Please try attached patch. It simply throws away all recent
>>>> complications w.r.t. to thread pointer and always handles TP in
>>>> DImode.
>>>>
>>>> The testcase:
>>>>
>>>> --cut here--
>>>> __thread int foo __attribute__ ((tls_model ("initial-exec")));
>>>>
>>>> void bar (int x)
>>>> {
>>>>  foo = x;
>>>> }
>>>>
>>>> int baz (void)
>>>> {
>>>>  return foo;
>>>> }
>>>> --cut here--
>>>>
>>>> Now compiles to:
>>>>
>>>> bar:
>>>>        movq    foo@gottpoff(%rip), %rax
>>>>        movl    %edi, %fs:(%rax)
>>>>        ret
>>>>
>>>> baz:
>>>>        movq    foo@gottpoff(%rip), %rax
>>>>        movl    %fs:(%rax), %eax
>>>>        ret
>>>>
>>>> In effect, this always generates %fs(%rDI) and emits REX prefix before
>>>> mov/add to satisfy brain-dead linkers.
>>>>
>>>> The patch is bootstrapping now on x86_64-pc-linux-gnu.
>>>>
>>>
>>> For
>>>
>>> --
>>> extern __thread char c;
>>> extern char y;
>>> void
>>> ie (void)
>>> {
>>>  y = c;
>>> }
>>> --
>>>
>>> Your patch generates:
>>>
>>>        movl    %fs:0, %eax
>>>        movq    c@gottpoff(%rip), %rdx
>>>        movzbl  (%rax,%rdx), %edx
>>>        movb    %dl, y(%rip)
>>>        ret
>>>
>>> It can be optimized to:
>>>
>>>        movq    c@gottpoff(%rip), %rax
>>>        movzbl  %fs:(%rax), %eax
>>>        movb    %al, y(%rip)
>>>        ret
>>>
>>
>> Combine failed:
>>
>> (set (reg:QI 63 [ c ])
>>    (mem/c:QI (plus:DI (zero_extend:DI (unspec:SI [
>>                        (const_int 0 [0])
>>                    ] UNSPEC_TP))
>>            (mem/u/c:DI (const:DI (unspec:DI [
>>                            (symbol_ref:SI ("c") [flags 0x60]
>> <var_decl 0x7ffff19b8140 c>)
>>                        ] UNSPEC_GOTNTPOFF)) [2 S8 A8])) [0 c+0 S1 A8]))
>>
>>
>
> Wrong testcase.  IT should be
>
> --
> extern __thread char c;
> extern __thread short w;
> extern char y;
> extern short i;
> void
> ie (void)
> {
>  y = c;
>  i = w;
> }
> ---
>
> I got
>
>        movl    %fs:0, %eax
>        movq    c@gottpoff(%rip), %rdx
>        movzbl  (%rax,%rdx), %edx
>        movb    %dl, y(%rip)
>        movq    w@gottpoff(%rip), %rdx
>        movzwl  (%rax,%rdx), %eax
>        movw    %ax, i(%rip)
>        ret
>
> It can be
>
>        movq    c@gottpoff(%rip), %rax
>        movzbl  %fs:(%rax), %eax
>        movb    %al, y(%rip)
>        movq    w@gottpoff(%rip), %rax
>        movzwl  %fs:(%rax), %eax
>        movw    %ax, i(%rip)
>        ret
>
>

How about this patch?  I changed 32 TP load to

(define_insn "*load_tp_x32_<mode>"
  [(set (match_operand:SWI48x 0 "register_operand" "=r")
        (unspec:SWI48x [(const_int 0)] UNSPEC_TP))]
  "TARGET_X32"
  "mov{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"
  [(set_attr "type" "imov")
   (set_attr "modrm" "0")
   (set_attr "length" "7")
   (set_attr "memory" "load")
   (set_attr "imm_disp" "false")])

and removed *load_tp_x32_zext.

Comments

Uros Bizjak March 19, 2012, 4:37 p.m. UTC | #1
On Mon, Mar 19, 2012 at 5:34 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

>>> Combine failed:
>>>
>>> (set (reg:QI 63 [ c ])
>>>    (mem/c:QI (plus:DI (zero_extend:DI (unspec:SI [
>>>                        (const_int 0 [0])
>>>                    ] UNSPEC_TP))
>>>            (mem/u/c:DI (const:DI (unspec:DI [
>>>                            (symbol_ref:SI ("c") [flags 0x60]
>>> <var_decl 0x7ffff19b8140 c>)
>>>                        ] UNSPEC_GOTNTPOFF)) [2 S8 A8])) [0 c+0 S1 A8]))
>>>
>>>
>>
>> Wrong testcase.  IT should be
>>
>> --
>> extern __thread char c;
>> extern __thread short w;
>> extern char y;
>> extern short i;
>> void
>> ie (void)
>> {
>>  y = c;
>>  i = w;
>> }
>> ---
>>
>> I got
>>
>>        movl    %fs:0, %eax
>>        movq    c@gottpoff(%rip), %rdx
>>        movzbl  (%rax,%rdx), %edx
>>        movb    %dl, y(%rip)
>>        movq    w@gottpoff(%rip), %rdx
>>        movzwl  (%rax,%rdx), %eax
>>        movw    %ax, i(%rip)
>>        ret
>>
>> It can be
>>
>>        movq    c@gottpoff(%rip), %rax
>>        movzbl  %fs:(%rax), %eax
>>        movb    %al, y(%rip)
>>        movq    w@gottpoff(%rip), %rax
>>        movzwl  %fs:(%rax), %eax
>>        movw    %ax, i(%rip)
>>        ret
>>
>>
>
> How about this patch?  I changed 32 TP load to
>
> (define_insn "*load_tp_x32_<mode>"
>  [(set (match_operand:SWI48x 0 "register_operand" "=r")
>        (unspec:SWI48x [(const_int 0)] UNSPEC_TP))]
>  "TARGET_X32"
>  "mov{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"
>  [(set_attr "type" "imov")
>   (set_attr "modrm" "0")
>   (set_attr "length" "7")
>   (set_attr "memory" "load")
>   (set_attr "imm_disp" "false")])
>
> and removed *load_tp_x32_zext.

No, your whole approach with splitters is wrong.

@@ -12747,11 +12747,11 @@
 (define_mode_attr tp_seg [(SI "gs") (DI "fs")])

 ;; Load and add the thread base pointer from %<tp_seg>:0.
-(define_insn "*load_tp_x32"
-  [(set (match_operand:SI 0 "register_operand" "=r")
-	(unspec:SI [(const_int 0)] UNSPEC_TP))]
+(define_insn "*load_tp_x32_<mode>"
+  [(set (match_operand:SWI48x 0 "register_operand" "=r")
+	(unspec:SWI48x [(const_int 0)] UNSPEC_TP))]
   "TARGET_X32"
-  "mov{l}\t{%%fs:0, %0|%0, DWORD PTR fs:0}"
+  "mov{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"

The result is a zero_extended SImode register, not a fake SImode register in DImode.

But as said, you should generate correct sequence from the beginning.

Uros.
H.J. Lu March 19, 2012, 4:47 p.m. UTC | #2
On Mon, Mar 19, 2012 at 9:37 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Mon, Mar 19, 2012 at 5:34 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>
>>>> Combine failed:
>>>>
>>>> (set (reg:QI 63 [ c ])
>>>>    (mem/c:QI (plus:DI (zero_extend:DI (unspec:SI [
>>>>                        (const_int 0 [0])
>>>>                    ] UNSPEC_TP))
>>>>            (mem/u/c:DI (const:DI (unspec:DI [
>>>>                            (symbol_ref:SI ("c") [flags 0x60]
>>>> <var_decl 0x7ffff19b8140 c>)
>>>>                        ] UNSPEC_GOTNTPOFF)) [2 S8 A8])) [0 c+0 S1 A8]))
>>>>
>>>>
>>>
>>> Wrong testcase.  IT should be
>>>
>>> --
>>> extern __thread char c;
>>> extern __thread short w;
>>> extern char y;
>>> extern short i;
>>> void
>>> ie (void)
>>> {
>>>  y = c;
>>>  i = w;
>>> }
>>> ---
>>>
>>> I got
>>>
>>>        movl    %fs:0, %eax
>>>        movq    c@gottpoff(%rip), %rdx
>>>        movzbl  (%rax,%rdx), %edx
>>>        movb    %dl, y(%rip)
>>>        movq    w@gottpoff(%rip), %rdx
>>>        movzwl  (%rax,%rdx), %eax
>>>        movw    %ax, i(%rip)
>>>        ret
>>>
>>> It can be
>>>
>>>        movq    c@gottpoff(%rip), %rax
>>>        movzbl  %fs:(%rax), %eax
>>>        movb    %al, y(%rip)
>>>        movq    w@gottpoff(%rip), %rax
>>>        movzwl  %fs:(%rax), %eax
>>>        movw    %ax, i(%rip)
>>>        ret
>>>
>>>
>>
>> How about this patch?  I changed 32 TP load to
>>
>> (define_insn "*load_tp_x32_<mode>"
>>  [(set (match_operand:SWI48x 0 "register_operand" "=r")
>>        (unspec:SWI48x [(const_int 0)] UNSPEC_TP))]
>>  "TARGET_X32"
>>  "mov{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"
>>  [(set_attr "type" "imov")
>>   (set_attr "modrm" "0")
>>   (set_attr "length" "7")
>>   (set_attr "memory" "load")
>>   (set_attr "imm_disp" "false")])
>>
>> and removed *load_tp_x32_zext.
>
> No, your whole approach with splitters is wrong.
>
> @@ -12747,11 +12747,11 @@
>  (define_mode_attr tp_seg [(SI "gs") (DI "fs")])
>
>  ;; Load and add the thread base pointer from %<tp_seg>:0.
> -(define_insn "*load_tp_x32"
> -  [(set (match_operand:SI 0 "register_operand" "=r")
> -       (unspec:SI [(const_int 0)] UNSPEC_TP))]
> +(define_insn "*load_tp_x32_<mode>"
> +  [(set (match_operand:SWI48x 0 "register_operand" "=r")
> +       (unspec:SWI48x [(const_int 0)] UNSPEC_TP))]
>   "TARGET_X32"
> -  "mov{l}\t{%%fs:0, %0|%0, DWORD PTR fs:0}"
> +  "mov{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"
>
> The result is zero_extended SImode register, not fake SImode register in DImore.
>
> But as said, you should generate correct sequence from the beginning.
>

For x32,  thread pointer is an unsigned 32bit value.

movl %fs:0, %eax

is the correct instruction to load thread pointer into EAX and RAX.
Uros Bizjak March 19, 2012, 4:49 p.m. UTC | #3
On Mon, Mar 19, 2012 at 5:47 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

> For x32,  thread pointer is an unsigned 32bit value.
>
> movl %fs:0, %eax
>
> is the correct instruction to load thread pointer into EAX and RAX.

So, where is ZERO_EXTEND RTX then?

Uros.
H.J. Lu March 19, 2012, 4:55 p.m. UTC | #4
On Mon, Mar 19, 2012 at 9:49 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Mon, Mar 19, 2012 at 5:47 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>
>> For x32,  thread pointer is an unsigned 32bit value.
>>
>> movl %fs:0, %eax
>>
>> is the correct instruction to load thread pointer into EAX and RAX.
>
> So, where is ZERO_EXTEND RTX then?
>

Thread pointer (TP) is an opaque value to GCC.  GCC needs to load
TP into a SImode or DImode register.  ZERO_EXTEND isn't needed
when there is a single instruction to load TP into a DImode register.
Uros Bizjak March 19, 2012, 5:01 p.m. UTC | #5
On Mon, Mar 19, 2012 at 5:55 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

>>> For x32,  thread pointer is an unsigned 32bit value.
>>>
>>> movl %fs:0, %eax
>>>
>>> is the correct instruction to load thread pointer into EAX and RAX.
>>
>> So, where is ZERO_EXTEND RTX then?
>>
>
> Thread pointer (TP) is an opaque value to GCC.  GCC needs to load
> TP into a SImode or DImode register.  ZERO_EXTEND isn't needed
> when there is a single instruction to load TP into a DImode register.

I don't agree with this explanation. The mode can't be SImode and
DImode. TP is either SImode or ZERO_EXTENDed to DImode, this is the
reason we went for all that TARGET_X32 stuff in TP load RTX.

Please test my proposed patch. If it works OK, I will commit it to SVN.

Thanks,
Uros.
Uros Bizjak March 19, 2012, 5:29 p.m. UTC | #6
On Mon, Mar 19, 2012 at 6:01 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>>> For x32,  thread pointer is an unsigned 32bit value.
>>>>
>>>> movl %fs:0, %eax
>>>>
>>>> is the correct instruction to load thread pointer into EAX and RAX.
>>>
>>> So, where is ZERO_EXTEND RTX then?
>>>
>>
>> Thread pointer (TP) is an opaque value to GCC.  GCC needs to load
>> TP into a SImode or DImode register.  ZERO_EXTEND isn't needed
>> when there is a single instruction to load TP into a DImode register.
>
> I don't agree with this explanation. The mode can't be SImode and
> DImode. TP is either SImode or ZERO_EXTENDed to DImode, this is the
> reason we went for all that TARGET_X32 stuff in TP load RTX.
>
> Please test my proposed patch. If it works OK, I will commit it to SVN.

The only acceptable way is to generate ZERO_EXTEND in place, so:

--cut here--
/* Build the RTX for the thread pointer in mode TP_MODE.  The UNSPEC_TP
   is always created in ptr_mode and widened afterwards if needed.  If
   TO_REG is true, the result is copied into a TP_MODE register.  */
static rtx
get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
{
  rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);

  /* When ptr_mode is not the requested mode, the only combination
     accepted is an SImode thread pointer widened to DImode; the
     widening is expressed as an explicit ZERO_EXTEND around the
     UNSPEC rather than by changing the mode of the UNSPEC itself.  */
  if (GET_MODE (tp) != tp_mode)
    {
      gcc_assert (GET_MODE (tp) == SImode);
      gcc_assert (tp_mode == DImode);

      tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
    }

  /* Callers that need the value in a register get a copy in TP_MODE;
     otherwise the bare (possibly zero-extended) UNSPEC is returned.  */
  if (to_reg)
    tp = copy_to_mode_reg (tp_mode, tp);

  return tp;
}
--cut here--

This will generate:

        movq    c@gottpoff(%rip), %rax
        movzbl  %fs:(%rax), %eax
        movb    %al, y(%rip)
        movq    w@gottpoff(%rip), %rax
        movzwl  %fs:(%rax), %eax
        movw    %ax, i(%rip)
        ret

Uros.
H.J. Lu March 19, 2012, 5:50 p.m. UTC | #7
On Mon, Mar 19, 2012 at 10:29 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Mon, Mar 19, 2012 at 6:01 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>>>> For x32,  thread pointer is an unsigned 32bit value.
>>>>>
>>>>> movl %fs:0, %eax
>>>>>
>>>>> is the correct instruction to load thread pointer into EAX and RAX.
>>>>
>>>> So, where is ZERO_EXTEND RTX then?
>>>>
>>>
>>> Thread pointer (TP) is an opaque value to GCC.  GCC needs to load
>>> TP into a SImode or DImode register.  ZERO_EXTEND isn't needed
>>> when there is a single instruction to load TP into a DImode register.
>>
>> I don't agree with this explanation. The mode can't be SImode and
>> DImode. TP is either SImode or ZERO_EXTENDed to DImode, this is the
>> reason we went for all that TARGET_X32 stuff in TP load RTX.

FWIW, TP maintained by OS is opaque to GCC and GCC mode doesn't
apply to the TP value maintained by OS.  The instruction pattern to load TP
into a register is provided by OS and is also opaque to GCC.  X32 OS provides
single instructions to load TP into SImode and DImode registers.  We
can load x32 TP into an SImode register and ZERO_EXTEND it to DImode.
Or we can use the OS provided instruction to load TP into DImode
register directly.

>> Please test my proposed patch. If it works OK, I will commit it to SVN.
>
> The onyl acceptable way is to generate ZERO_EXTEND in place, so:
>
> --cut here--
> static rtx
> get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
> {
>  rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
>
>  if (GET_MODE (tp) != tp_mode)
>    {
>      gcc_assert (GET_MODE (tp) == SImode);
>      gcc_assert (tp_mode == DImode);
>
>      tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
>    }
>
>  if (to_reg)
>    tp = copy_to_mode_reg (tp_mode, tp);
>
>  return tp;
> }
> --cut here--

This version works fine.

Thanks.
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9aa5ee7..66221e4 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -12483,15 +12483,12 @@  legitimize_pic_address (rtx orig, rtx reg)
 /* Load the thread pointer.  If TO_REG is true, force it into a register.  */
 
 static rtx
-get_thread_pointer (bool to_reg)
+get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
 {
-  rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
-
-  if (GET_MODE (tp) != Pmode)
-    tp = convert_to_mode (Pmode, tp, 1);
+  rtx tp = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
 
   if (to_reg)
-    tp = copy_addr_to_reg (tp);
+    tp = copy_to_mode_reg (tp_mode, tp);
 
   return tp;
 }
@@ -12543,6 +12540,7 @@  legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
 {
   rtx dest, base, off;
   rtx pic = NULL_RTX, tp = NULL_RTX;
+  enum machine_mode tp_mode = Pmode;
   int type;
 
   switch (model)
@@ -12568,7 +12566,7 @@  legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
 	  else
 	    emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
 
-	  tp = get_thread_pointer (true);
+	  tp = get_thread_pointer (Pmode, true);
 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
 
 	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
@@ -12618,7 +12616,7 @@  legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
 	  else
 	    emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
 
-	  tp = get_thread_pointer (true);
+	  tp = get_thread_pointer (Pmode, true);
 	  set_unique_reg_note (get_last_insn (), REG_EQUAL,
 			       gen_rtx_MINUS (Pmode, tmp, tp));
 	}
@@ -12664,27 +12662,18 @@  legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
     case TLS_MODEL_INITIAL_EXEC:
       if (TARGET_64BIT)
 	{
+	  tp_mode = DImode;
+
 	  if (TARGET_SUN_TLS)
 	    {
 	      /* The Sun linker took the AMD64 TLS spec literally
 		 and can only handle %rax as destination of the
 		 initial executable code sequence.  */
 
-	      dest = gen_reg_rtx (Pmode);
+	      dest = gen_reg_rtx (tp_mode);
 	      emit_insn (gen_tls_initial_exec_64_sun (dest, x));
 	      return dest;
 	    }
-	  else if (Pmode == SImode)
-	    {
-	      /* Always generate
-			movl %fs:0, %reg32
-			addl xgottpoff(%rip), %reg32
-		 to support linker IE->LE optimization and avoid
-		 fs:(%reg32) as memory operand.  */
-	      dest = gen_reg_rtx (Pmode);
-	      emit_insn (gen_tls_initial_exec_x32 (dest, x));
-	      return dest;
-	    }
 
 	  pic = NULL;
 	  type = UNSPEC_GOTNTPOFF;
@@ -12708,24 +12697,23 @@  legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
 	  type = UNSPEC_INDNTPOFF;
 	}
 
-      off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
-      off = gen_rtx_CONST (Pmode, off);
+      off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
+      off = gen_rtx_CONST (tp_mode, off);
       if (pic)
-	off = gen_rtx_PLUS (Pmode, pic, off);
-      off = gen_const_mem (Pmode, off);
+	off = gen_rtx_PLUS (tp_mode, pic, off);
+      off = gen_const_mem (tp_mode, off);
       set_mem_alias_set (off, ix86_GOT_alias_set ());
 
       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
 	{
-          base = get_thread_pointer (for_mov
-				     || !(TARGET_TLS_DIRECT_SEG_REFS
-					  && TARGET_TLS_INDIRECT_SEG_REFS));
-	  off = force_reg (Pmode, off);
-	  return gen_rtx_PLUS (Pmode, base, off);
+	  base = get_thread_pointer (tp_mode,
+				     for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
+	  off = force_reg (tp_mode, off);
+	  return gen_rtx_PLUS (tp_mode, base, off);
 	}
       else
 	{
-	  base = get_thread_pointer (true);
+	  base = get_thread_pointer (Pmode, true);
 	  dest = gen_reg_rtx (Pmode);
 	  emit_insn (ix86_gen_sub3 (dest, base, off));
 	}
@@ -12739,14 +12727,13 @@  legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
 
       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
 	{
-	  base = get_thread_pointer (for_mov
-				     || !(TARGET_TLS_DIRECT_SEG_REFS
-					  && TARGET_TLS_INDIRECT_SEG_REFS));
+	  base = get_thread_pointer (Pmode,
+				     for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
 	  return gen_rtx_PLUS (Pmode, base, off);
 	}
       else
 	{
-	  base = get_thread_pointer (true);
+	  base = get_thread_pointer (Pmode, true);
 	  dest = gen_reg_rtx (Pmode);
 	  emit_insn (ix86_gen_sub3 (dest, base, off));
 	}
@@ -13274,8 +13261,7 @@  ix86_delegitimize_tls_address (rtx orig_x)
   rtx x = orig_x, unspec;
   struct ix86_address addr;
 
-  if (!(TARGET_TLS_DIRECT_SEG_REFS
-	&& TARGET_TLS_INDIRECT_SEG_REFS))
+  if (!TARGET_TLS_DIRECT_SEG_REFS)
     return orig_x;
   if (MEM_P (x))
     x = XEXP (x, 0);
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 9e5ac00..3fcd209 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -467,9 +467,6 @@  extern int x86_prefetch_sse;
 #define TARGET_TLS_DIRECT_SEG_REFS_DEFAULT 0
 #endif
 
-/* Address override works only on the (%reg) part of %fs:(%reg).  */
-#define TARGET_TLS_INDIRECT_SEG_REFS (Pmode == word_mode)
-
 /* Fence to use after loop using storent.  */
 
 extern tree x86_mfence;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d23c67b..e167ceb 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12747,20 +12747,9 @@ 
 (define_mode_attr tp_seg [(SI "gs") (DI "fs")])
 
 ;; Load and add the thread base pointer from %<tp_seg>:0.
-(define_insn "*load_tp_x32"
-  [(set (match_operand:SI 0 "register_operand" "=r")
-	(unspec:SI [(const_int 0)] UNSPEC_TP))]
-  "TARGET_X32"
-  "mov{l}\t{%%fs:0, %0|%0, DWORD PTR fs:0}"
-  [(set_attr "type" "imov")
-   (set_attr "modrm" "0")
-   (set_attr "length" "7")
-   (set_attr "memory" "load")
-   (set_attr "imm_disp" "false")])
-
-(define_insn "*load_tp_x32_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
-	(zero_extend:DI (unspec:SI [(const_int 0)] UNSPEC_TP)))]
+(define_insn "*load_tp_x32_<mode>"
+  [(set (match_operand:SWI48x 0 "register_operand" "=r")
+	(unspec:SWI48x [(const_int 0)] UNSPEC_TP))]
   "TARGET_X32"
   "mov{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"
   [(set_attr "type" "imov")
@@ -12836,28 +12825,6 @@ 
 }
   [(set_attr "type" "multi")])
 
-;; When Pmode == SImode, there may be no REX prefix for ADD.  Avoid
-;; any instructions between MOV and ADD, which may interfere linker
-;; IE->LE optimization, since the last byte of the previous instruction
-;; before ADD may look like a REX prefix.  This also avoids
-;;	movl x@gottpoff(%rip), %reg32
-;;	movl $fs:(%reg32), %reg32
-;; Since address override works only on the (reg32) part in fs:(reg32),
-;; we can't use it as memory operand.
-(define_insn "tls_initial_exec_x32"
-  [(set (match_operand:SI 0 "register_operand" "=r")
-	(unspec:SI
-	 [(match_operand 1 "tls_symbolic_operand")]
-	 UNSPEC_TLS_IE_X32))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_X32"
-{
-  output_asm_insn
-    ("mov{l}\t{%%fs:0, %0|%0, DWORD PTR fs:0}", operands);
-  return "add{l}\t{%a1@gottpoff(%%rip), %0|%0, %a1@gottpoff[rip]}";
-}
-  [(set_attr "type" "multi")])
-
 ;; GNU2 TLS patterns can be split.
 
 (define_expand "tls_dynamic_gnu2_32"