diff mbox

PATCH: Properly generate X32 IE sequence

Message ID CAMe9rOrQSBoh5Diw4NJMOp_EepLXn_w3C_6ZLaQCzJuJOQqFLA@mail.gmail.com
State New
Headers show

Commit Message

H.J. Lu March 19, 2012, 3:51 p.m. UTC
On Sun, Mar 18, 2012 at 1:55 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Sun, Mar 18, 2012 at 5:01 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>
>>> I am testing this patch.  OK for trunk if it passes all tests?
>>
>> No, force_reg will generate a pseudo, so this conversion is valid only
>> for !can_create_pseudo ().
>>
>> At least for *tls_initial_exec_x32_store, you will need a temporary to
>> split the pattern after reload.

Here is the updated patch, which adds a can_create_pseudo_p () check.  I also
changed tls_initial_exec_x32 to take an input register operand as the thread
pointer.

> Please try attached patch. It simply throws away all recent
> complications w.r.t. to thread pointer and always handles TP in
> DImode.
>
> The testcase:
>
> --cut here--
> __thread int foo __attribute__ ((tls_model ("initial-exec")));
>
> void bar (int x)
> {
>  foo = x;
> }
>
> int baz (void)
> {
>  return foo;
> }
> --cut here--
>
> Now compiles to:
>
> bar:
>        movq    foo@gottpoff(%rip), %rax
>        movl    %edi, %fs:(%rax)
>        ret
>
> baz:
>        movq    foo@gottpoff(%rip), %rax
>        movl    %fs:(%rax), %eax
>        ret
>
> In effect, this always generates %fs(%rDI) and emits REX prefix before
> mov/add to satisfy brain-dead linkers.
>
> The patch is bootstrapping now on x86_64-pc-linux-gnu.
>

For

--
extern __thread char c;
extern char y;
void
ie (void)
{
  y = c;
}
--

Your patch generates:

	movl	%fs:0, %eax	
	movq	c@gottpoff(%rip), %rdx	
	movzbl	(%rax,%rdx), %edx	
	movb	%dl, y(%rip)	
	ret	

It can be optimized to:

        movq	c@gottpoff(%rip), %rax	
	movzbl	%fs:(%rax), %eax	
	movb	%al, y(%rip)	
	ret	

H.J.

Comments

H.J. Lu March 19, 2012, 3:54 p.m. UTC | #1
On Mon, Mar 19, 2012 at 8:51 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sun, Mar 18, 2012 at 1:55 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Sun, Mar 18, 2012 at 5:01 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>
>>>> I am testing this patch.  OK for trunk if it passes all tests?
>>>
>>> No, force_reg will generate a pseudo, so this conversion is valid only
>>> for !can_create_pseudo ().
>>>
>>> At least for *tls_initial_exec_x32_store, you will need a temporary to
>>> split the pattern after reload.
>
> Here is the updated patch to add can_create_pseudo.  I also changed
> tls_initial_exec_x32 to take an input register operand as thread pointer.
>
>> Please try attached patch. It simply throws away all recent
>> complications w.r.t. to thread pointer and always handles TP in
>> DImode.
>>
>> The testcase:
>>
>> --cut here--
>> __thread int foo __attribute__ ((tls_model ("initial-exec")));
>>
>> void bar (int x)
>> {
>>  foo = x;
>> }
>>
>> int baz (void)
>> {
>>  return foo;
>> }
>> --cut here--
>>
>> Now compiles to:
>>
>> bar:
>>        movq    foo@gottpoff(%rip), %rax
>>        movl    %edi, %fs:(%rax)
>>        ret
>>
>> baz:
>>        movq    foo@gottpoff(%rip), %rax
>>        movl    %fs:(%rax), %eax
>>        ret
>>
>> In effect, this always generates %fs(%rDI) and emits REX prefix before
>> mov/add to satisfy brain-dead linkers.
>>
>> The patch is bootstrapping now on x86_64-pc-linux-gnu.
>>
>
> For
>
> --
> extern __thread char c;
> extern char y;
> void
> ie (void)
> {
>  y = c;
> }
> --
>
> Your patch generates:
>
>        movl    %fs:0, %eax
>        movq    c@gottpoff(%rip), %rdx
>        movzbl  (%rax,%rdx), %edx
>        movb    %dl, y(%rip)
>        ret
>
> It can be optimized to:
>
>        movq    c@gottpoff(%rip), %rax
>        movzbl  %fs:(%rax), %eax
>        movb    %al, y(%rip)
>        ret
>

Combine failed:

(set (reg:QI 63 [ c ])
    (mem/c:QI (plus:DI (zero_extend:DI (unspec:SI [
                        (const_int 0 [0])
                    ] UNSPEC_TP))
            (mem/u/c:DI (const:DI (unspec:DI [
                            (symbol_ref:SI ("c") [flags 0x60]
<var_decl 0x7ffff19b8140 c>)
                        ] UNSPEC_GOTNTPOFF)) [2 S8 A8])) [0 c+0 S1 A8]))
H.J. Lu March 19, 2012, 4:19 p.m. UTC | #2
On Mon, Mar 19, 2012 at 8:54 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Mar 19, 2012 at 8:51 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Sun, Mar 18, 2012 at 1:55 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>> On Sun, Mar 18, 2012 at 5:01 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>>
>>>>> I am testing this patch.  OK for trunk if it passes all tests?
>>>>
>>>> No, force_reg will generate a pseudo, so this conversion is valid only
>>>> for !can_create_pseudo ().
>>>>
>>>> At least for *tls_initial_exec_x32_store, you will need a temporary to
>>>> split the pattern after reload.
>>
>> Here is the updated patch to add can_create_pseudo.  I also changed
>> tls_initial_exec_x32 to take an input register operand as thread pointer.
>>
>>> Please try attached patch. It simply throws away all recent
>>> complications w.r.t. to thread pointer and always handles TP in
>>> DImode.
>>>
>>> The testcase:
>>>
>>> --cut here--
>>> __thread int foo __attribute__ ((tls_model ("initial-exec")));
>>>
>>> void bar (int x)
>>> {
>>>  foo = x;
>>> }
>>>
>>> int baz (void)
>>> {
>>>  return foo;
>>> }
>>> --cut here--
>>>
>>> Now compiles to:
>>>
>>> bar:
>>>        movq    foo@gottpoff(%rip), %rax
>>>        movl    %edi, %fs:(%rax)
>>>        ret
>>>
>>> baz:
>>>        movq    foo@gottpoff(%rip), %rax
>>>        movl    %fs:(%rax), %eax
>>>        ret
>>>
>>> In effect, this always generates %fs(%rDI) and emits REX prefix before
>>> mov/add to satisfy brain-dead linkers.
>>>
>>> The patch is bootstrapping now on x86_64-pc-linux-gnu.
>>>
>>
>> For
>>
>> --
>> extern __thread char c;
>> extern char y;
>> void
>> ie (void)
>> {
>>  y = c;
>> }
>> --
>>
>> Your patch generates:
>>
>>        movl    %fs:0, %eax
>>        movq    c@gottpoff(%rip), %rdx
>>        movzbl  (%rax,%rdx), %edx
>>        movb    %dl, y(%rip)
>>        ret
>>
>> It can be optimized to:
>>
>>        movq    c@gottpoff(%rip), %rax
>>        movzbl  %fs:(%rax), %eax
>>        movb    %al, y(%rip)
>>        ret
>>
>
> Combine failed:
>
> (set (reg:QI 63 [ c ])
>    (mem/c:QI (plus:DI (zero_extend:DI (unspec:SI [
>                        (const_int 0 [0])
>                    ] UNSPEC_TP))
>            (mem/u/c:DI (const:DI (unspec:DI [
>                            (symbol_ref:SI ("c") [flags 0x60]
> <var_decl 0x7ffff19b8140 c>)
>                        ] UNSPEC_GOTNTPOFF)) [2 S8 A8])) [0 c+0 S1 A8]))
>
>

Wrong testcase.  It should be

--
extern __thread char c;
extern __thread short w;
extern char y;
extern short i;
void
ie (void)
{
  y = c;
  i = w;
}
--

I got

	movl	%fs:0, %eax	
	movq	c@gottpoff(%rip), %rdx	
	movzbl	(%rax,%rdx), %edx	
	movb	%dl, y(%rip)	
	movq	w@gottpoff(%rip), %rdx	
	movzwl	(%rax,%rdx), %eax	
	movw	%ax, i(%rip)	
	ret	

It can be

	movq	c@gottpoff(%rip), %rax	
	movzbl	%fs:(%rax), %eax	
	movb	%al, y(%rip)	
	movq	w@gottpoff(%rip), %rax	
	movzwl	%fs:(%rax), %eax	
	movw	%ax, i(%rip)	
	ret
Uros Bizjak March 19, 2012, 4:47 p.m. UTC | #3
On Mon, Mar 19, 2012 at 5:19 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

>        movl    %fs:0, %eax
>        movq    c@gottpoff(%rip), %rdx
>        movzbl  (%rax,%rdx), %edx
>        movb    %dl, y(%rip)
>        movq    w@gottpoff(%rip), %rdx
>        movzwl  (%rax,%rdx), %eax
>        movw    %ax, i(%rip)
>        ret
>
> It can be
>
>        movq    c@gottpoff(%rip), %rax
>        movzbl  %fs:(%rax), %eax
>        movb    %al, y(%rip)
>        movq    w@gottpoff(%rip), %rax
>        movzwl  %fs:(%rax), %eax
>        movw    %ax, i(%rip)
>        ret

This is just CSE in action. The compiler CSEd "movl %fs:0, %eax", since the
thread pointer has to be zero-extended before it can be used in an address.

Uros.
diff mbox

Patch

2012-03-19  H.J. Lu  <hongjiu.lu@intel.com>

	* config/i386/i386-protos.h (ix86_split_tls_initial_exec_x32): New.

	* config/i386/i386.c (legitimize_tls_address): Also pass thread
	pointer to gen_tls_initial_exec_x32.
	(ix86_split_tls_initial_exec_x32): New.

	* config/i386/i386.md (*load_tp_x32): Renamed to ...
	(*load_tp_x32_<mode>): This. Replace SI with SWI48x.
	(tls_initial_exec_x32): Add an input register operand as thread
	pointer.  Generate a REX prefix if needed.
	(*tls_initial_exec_x32_load): New.
	(*tls_initial_exec_x32_store): Likewise.

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 630112f..528eeaa 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -142,6 +142,7 @@  extern void ix86_split_lshr (rtx *, rtx, enum machine_mode);
 extern rtx ix86_find_base_term (rtx);
 extern bool ix86_check_movabs (rtx, int);
 extern void ix86_split_idivmod (enum machine_mode, rtx[], bool);
+extern void ix86_split_tls_initial_exec_x32 (rtx [], enum machine_mode, bool);
 
 extern rtx assign_386_stack_local (enum machine_mode, enum ix86_stack_slot);
 extern int ix86_attr_length_immediate_default (rtx, bool);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 78a366e..fb802ee 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -12671,13 +12671,14 @@  legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
 	    }
 	  else if (Pmode == SImode)
 	    {
-	      /* Always generate
-			movl %fs:0, %reg32
+	      /* Always generate a REX prefix for
 			addl xgottpoff(%rip), %reg32
-		 to support linker IE->LE optimization and avoid
-		 fs:(%reg32) as memory operand.  */
+		 to support linker IE->LE optimization.  */
 	      dest = gen_reg_rtx (Pmode);
-	      emit_insn (gen_tls_initial_exec_x32 (dest, x));
+	      base = get_thread_pointer (for_mov
+					 || !(TARGET_TLS_DIRECT_SEG_REFS
+					      && TARGET_TLS_INDIRECT_SEG_REFS));
+	      emit_insn (gen_tls_initial_exec_x32 (dest, base, x));
 	      return dest;
 	    }
 
@@ -12754,6 +12755,28 @@  legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
   return dest;
 }
 
+/* Split x32 TLS IE access in MODE.  Split load if LOAD is TRUE,
+   otherwise split store.  */
+
+void
+ix86_split_tls_initial_exec_x32 (rtx operands[],
+				 enum machine_mode mode, bool load)
+{
+  rtx base, mem;
+  rtx off = load ? operands[1] : operands[0];
+  off = gen_rtx_UNSPEC (DImode, gen_rtvec (1, off), UNSPEC_GOTNTPOFF);
+  off = gen_rtx_CONST (DImode, off);
+  off = gen_const_mem (DImode, off);
+  set_mem_alias_set (off, ix86_GOT_alias_set ());
+  base = gen_rtx_UNSPEC (DImode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
+  off = gen_rtx_PLUS (DImode, base, force_reg (DImode, off));
+  mem = gen_rtx_MEM (mode, off);
+  if (load)
+    emit_move_insn (operands[0], mem);
+  else
+    emit_move_insn (mem, operands[1]);
+}
+
 /* Create or return the unique __imp_DECL dllimport symbol corresponding
    to symbol DECL.  */
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eae26ae..1643792 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12747,11 +12747,11 @@ 
 (define_mode_attr tp_seg [(SI "gs") (DI "fs")])
 
 ;; Load and add the thread base pointer from %<tp_seg>:0.
-(define_insn "*load_tp_x32"
-  [(set (match_operand:SI 0 "register_operand" "=r")
-	(unspec:SI [(const_int 0)] UNSPEC_TP))]
+(define_insn "*load_tp_x32_<mode>"
+  [(set (match_operand:SWI48x 0 "register_operand" "=r")
+	(unspec:SWI48x [(const_int 0)] UNSPEC_TP))]
   "TARGET_X32"
-  "mov{l}\t{%%fs:0, %0|%0, DWORD PTR fs:0}"
+  "mov{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"
   [(set_attr "type" "imov")
    (set_attr "modrm" "0")
    (set_attr "length" "7")
@@ -12836,27 +12836,54 @@ 
 }
   [(set_attr "type" "multi")])
 
-;; When Pmode == SImode, there may be no REX prefix for ADD.  Avoid
-;; any instructions between MOV and ADD, which may interfere linker
-;; IE->LE optimization, since the last byte of the previous instruction
-;; before ADD may look like a REX prefix.  This also avoids
-;;	movl x@gottpoff(%rip), %reg32
-;;	movl $fs:(%reg32), %reg32
-;; Since address override works only on the (reg32) part in fs:(reg32),
-;; we can't use it as memory operand.
+;; When Pmode == SImode, there may be no REX prefix for ADD.  Make sure
+;; there is a REX prefix.
 (define_insn "tls_initial_exec_x32"
   [(set (match_operand:SI 0 "register_operand" "=r")
 	(unspec:SI
-	 [(match_operand 1 "tls_symbolic_operand" "")]
+	 [(match_operand:SI 1 "register_operand" "0")
+	  (match_operand 2 "tls_symbolic_operand" "")]
 	 UNSPEC_TLS_IE_X32))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_X32"
 {
-  output_asm_insn
-    ("mov{l}\t{%%fs:0, %0|%0, DWORD PTR fs:0}", operands);
-  return "add{l}\t{%a1@gottpoff(%%rip), %0|%0, %a1@gottpoff[rip]}";
+  if (!REX_INT_REG_P (operands[0]))
+    fputs ("\trex ", asm_out_file);
+  return "add{l}\t{%a2@gottpoff(%%rip), %0|%0, %a2@gottpoff[rip]}";
 }
-  [(set_attr "type" "multi")])
+  [(set_attr "type" "alu")
+   (set_attr "length" "7")
+   (set_attr "memory" "load")])
+
+(define_insn_and_split "*tls_initial_exec_x32_load"
+  [(set (match_operand:SWI1248x 0 "register_operand" "=r")
+        (mem:SWI1248x
+	  (unspec:SI
+	   [(unspec:SI [(const_int 0)] UNSPEC_TP)
+	    (match_operand 1 "tls_symbolic_operand" "")]
+	   UNSPEC_TLS_IE_X32)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_X32
+   && can_create_pseudo_p ()"
+  "#"
+  ""
+  [(const_int 0)]
+  "ix86_split_tls_initial_exec_x32 (operands, <MODE>mode, TRUE); DONE;")
+
+(define_insn_and_split "*tls_initial_exec_x32_store"
+  [(set (mem:SWI1248x
+	  (unspec:SI
+	   [(unspec:SI [(const_int 0)] UNSPEC_TP)
+	    (match_operand 0 "tls_symbolic_operand" "")]
+	   UNSPEC_TLS_IE_X32))
+  	(match_operand:SWI1248x 1 "register_operand" "r"))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_X32
+   && can_create_pseudo_p ()"
+  "#"
+  ""
+  [(const_int 0)]
+  "ix86_split_tls_initial_exec_x32 (operands, <MODE>mode, FALSE); DONE;")
 
 ;; GNU2 TLS patterns can be split.