diff mbox

-mcmodel=large -fpic TLS GD and LD support gcc + binutils (PR target/58067)

Message ID 20130813194235.GJ1814@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek Aug. 13, 2013, 7:42 p.m. UTC
Hi!

We right now ICE with -mcmodel=large -fpic on x86_64 on TLS GD and LD
sequences, because obviously we can't call __tls_get_addr@plt there from code
potentially more than 2GB away from the PLT slot.

The attached patches add support for that in gcc and also teach the linker
about those, because otherwise the linker will fail if you try to link such
-mcmodel=large -fpic code into binaries or PIEs.

To make transitions possible, we emit always
leaq foo@tlsgd(%rip), %rdi
movabsq $__tls_get_addr@pltoff, %rax
addq %rbx, %rax
call *%rax
resp.
leaq foo@tlsld(%rip), %rdi
movabsq $__tls_get_addr@pltoff, %rax
addq %rbx, %rax
call *%rax
sequences (22 bytes, 6 bytes longer than what we do for TLSGD for normal
libraries).

Bootstrapped/regtested on x86_64-linux and i686-linux; also attached are the
sources I've used to test all 3 different transitions.

Ok for trunk and 4.8 branch (and binutils trunk)?

	Jakub
2013-08-13  Jakub Jelinek  <jakub@redhat.com>

	PR target/58067
	* config/i386/i386.md (*tls_global_dynamic_64_largepic): New insn.
	(*tls_local_dynamic_base_64_largepic): Likewise.
	(tls_global_dynamic_64_<mode>, tls_local_dynamic_base_64_<mode>):
	Remove predicate from call operand.
	* config/i386/i386.c (ix86_tls_get_addr): For -mcmodel=large -fpic
	return sum of pic_offset_table_rtx and UNSPEC_PLTOFF of the symbol.
2013-08-13  Jakub Jelinek  <jakub@redhat.com>

	* elf64-x86-64.c (elf_x86_64_check_tls_transition): Allow
	64-bit -mcmodel=large -fpic TLS GD and LD sequences.
	(elf_x86_64_relocate_section): Handle -mcmodel=large -fpic
	TLS GD and LD sequences in GD->LE, GD->IE and LD->LE transitions.

--- bfd/elf64-x86-64.c.jj	2013-08-13 13:41:41.000000000 +0200
+++ bfd/elf64-x86-64.c	2013-08-13 16:11:28.902439602 +0200
@@ -1087,6 +1087,7 @@ elf_x86_64_check_tls_transition (bfd *ab
 {
   unsigned int val;
   unsigned long r_symndx;
+  bfd_boolean largepic = FALSE;
   struct elf_link_hash_entry *h;
   bfd_vma offset;
   struct elf_x86_64_link_hash_table *htab;
@@ -1124,16 +1125,32 @@ elf_x86_64_check_tls_transition (bfd *ab
 	     can transit to different access model.  For 32bit, only
 		leaq foo@tlsgd(%rip), %rdi
 		.word 0x6666; rex64; call __tls_get_addr
-	     can transit to different access model.  */
+	     can transit to different access model.  For largepic
+	     we also support:
+		leaq foo@tlsgd(%rip), %rdi
+		movabsq $__tls_get_addr@pltoff, %rax
+		addq %rbx, %rax
+		call *%rax.  */
 
 	  static const unsigned char call[] = { 0x66, 0x66, 0x48, 0xe8 };
 	  static const unsigned char leaq[] = { 0x66, 0x48, 0x8d, 0x3d };
 
-	  if ((offset + 12) > sec->size
-	      || memcmp (contents + offset + 4, call, 4) != 0)
+	  if ((offset + 12) > sec->size)
 	    return FALSE;
 
-	  if (ABI_64_P (abfd))
+	  if (memcmp (contents + offset + 4, call, 4) != 0)
+	    {
+	      if (!ABI_64_P (abfd)
+		  || (offset + 19) > sec->size
+		  || offset < 3
+		  || memcmp (contents + offset - 3, leaq + 1, 3) != 0
+		  || memcmp (contents + offset + 4, "\x48\xb8", 2) != 0
+		  || memcmp (contents + offset + 14, "\x48\x01\xd8\xff\xd0", 5)
+		     != 0)
+		return FALSE;
+	      largepic = TRUE;
+	    }
+	  else if (ABI_64_P (abfd))
 	    {
 	      if (offset < 4
 		  || memcmp (contents + offset - 4, leaq, 4) != 0)
@@ -1151,16 +1168,31 @@ elf_x86_64_check_tls_transition (bfd *ab
 	  /* Check transition from LD access model.  Only
 		leaq foo@tlsld(%rip), %rdi;
 		call __tls_get_addr
-	     can transit to different access model.  */
+	     can transit to different access model.  For largepic
+	     we also support:
+		leaq foo@tlsld(%rip), %rdi
+		movabsq $__tls_get_addr@pltoff, %rax
+		addq %rbx, %rax
+		call *%rax.  */
 
 	  static const unsigned char lea[] = { 0x48, 0x8d, 0x3d };
 
 	  if (offset < 3 || (offset + 9) > sec->size)
 	    return FALSE;
 
-	  if (memcmp (contents + offset - 3, lea, 3) != 0
-	      || 0xe8 != *(contents + offset + 4))
+	  if (memcmp (contents + offset - 3, lea, 3) != 0)
 	    return FALSE;
+
+	  if (0xe8 != *(contents + offset + 4))
+	    {
+	      if (!ABI_64_P (abfd)
+		  || (offset + 19) > sec->size
+		  || memcmp (contents + offset + 4, "\x48\xb8", 2) != 0
+		  || memcmp (contents + offset + 14, "\x48\x01\xd8\xff\xd0", 5)
+		     != 0)
+		return FALSE;
+	      largepic = TRUE;
+	    }
 	}
 
       r_symndx = htab->r_sym (rel[1].r_info);
@@ -1172,8 +1204,10 @@ elf_x86_64_check_tls_transition (bfd *ab
 	 may be versioned.  */
       return (h != NULL
 	      && h->root.root.string != NULL
-	      && (ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PC32
-		  || ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PLT32)
+	      && (largepic
+		  ? ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PLTOFF64
+		  : (ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PC32
+		     || ELF32_R_TYPE (rel[1].r_info) == R_X86_64_PLT32))
 	      && (strncmp (h->root.root.string,
 			   "__tls_get_addr", 14) == 0));
 
@@ -3947,8 +3981,26 @@ direct:
 		     .word 0x6666; rex64; call __tls_get_addr
 		     into:
 		     movl %fs:0, %eax
-		     leaq foo@tpoff(%rax), %rax */
-		  if (ABI_64_P (output_bfd))
+		     leaq foo@tpoff(%rax), %rax
+		     For largepic, change:
+		     leaq foo@tlsgd(%rip), %rdi
+		     movabsq $__tls_get_addr@pltoff, %rax
+		     addq %rbx, %rax
+		     call *%rax
+		     into:
+		     movq %fs:0, %rax
+		     leaq foo@tpoff(%rax), %rax
+		     nopw 0x0(%rax,%rax,1) */
+		  int largepic = 0;
+		  if (ABI_64_P (output_bfd)
+		      && contents[roff + 5] == (bfd_byte) '\xb8')
+		    {
+		      memcpy (contents + roff - 3,
+			      "\x64\x48\x8b\x04\x25\0\0\0\0\x48\x8d\x80"
+			      "\0\0\0\0\x66\x0f\x1f\x44\0", 22);
+		      largepic = 1;
+		    }
+		  else if (ABI_64_P (output_bfd))
 		    memcpy (contents + roff - 4,
 			    "\x64\x48\x8b\x04\x25\0\0\0\0\x48\x8d\x80\0\0\0",
 			    16);
@@ -3958,8 +4010,8 @@ direct:
 			    15);
 		  bfd_put_32 (output_bfd,
 			      elf_x86_64_tpoff (info, relocation),
-			      contents + roff + 8);
-		  /* Skip R_X86_64_PC32/R_X86_64_PLT32.  */
+			      contents + roff + 8 + largepic);
+		  /* Skip R_X86_64_PC32/R_X86_64_PLT32/R_X86_64_PLTOFF64.  */
 		  rel++;
 		  continue;
 		}
@@ -4194,8 +4246,26 @@ direct:
 		     .word 0x6666; rex64; call __tls_get_addr@plt
 		     into:
 		     movl %fs:0, %eax
-		     addq foo@gottpoff(%rip), %rax */
-		  if (ABI_64_P (output_bfd))
+		     addq foo@gottpoff(%rip), %rax
+		     For largepic, change:
+		     leaq foo@tlsgd(%rip), %rdi
+		     movabsq $__tls_get_addr@pltoff, %rax
+		     addq %rbx, %rax
+		     call *%rax
+		     into:
+		     movq %fs:0, %rax
+		     addq foo@gottpoff(%rip), %rax
+		     nopw 0x0(%rax,%rax,1) */
+		  int largepic = 0;
+		  if (ABI_64_P (output_bfd)
+		      && contents[roff + 5] == (bfd_byte) '\xb8')
+		    {
+		      memcpy (contents + roff - 3,
+			      "\x64\x48\x8b\x04\x25\0\0\0\0\x48\x03\x05"
+			      "\0\0\0\0\x66\x0f\x1f\x44\0", 22);
+		      largepic = 1;
+		    }
+		  else if (ABI_64_P (output_bfd))
 		    memcpy (contents + roff - 4,
 			    "\x64\x48\x8b\x04\x25\0\0\0\0\x48\x03\x05\0\0\0",
 			    16);
@@ -4207,12 +4277,13 @@ direct:
 		  relocation = (htab->elf.sgot->output_section->vma
 				+ htab->elf.sgot->output_offset + off
 				- roff
+				- largepic
 				- input_section->output_section->vma
 				- input_section->output_offset
 				- 12);
 		  bfd_put_32 (output_bfd, relocation,
-			      contents + roff + 8);
-		  /* Skip R_X86_64_PLT32.  */
+			      contents + roff + 8 + largepic);
+		  /* Skip R_X86_64_PLT32/R_X86_64_PLTOFF64.  */
 		  rel++;
 		  continue;
 		}
@@ -4274,16 +4345,29 @@ direct:
 		 For 64bit, we change it into:
 		 .word 0x6666; .byte 0x66; movq %fs:0, %rax.
 		 For 32bit, we change it into:
-		 nopl 0x0(%rax); movl %fs:0, %eax.  */
+		 nopl 0x0(%rax); movl %fs:0, %eax.
+		 For largepic, change:
+		 leaq foo@tlsgd(%rip), %rdi
+		 movabsq $__tls_get_addr@pltoff, %rax
+		 addq %rbx, %rax
+		 call *%rax
+		 into:
+		 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
+		 movq %fs:0, %rax */
 
 	      BFD_ASSERT (r_type == R_X86_64_TPOFF32);
-	      if (ABI_64_P (output_bfd))
+	      if (ABI_64_P (output_bfd)
+		  && contents[rel->r_offset + 5] == (bfd_byte) '\xb8')
+		memcpy (contents + rel->r_offset - 3,
+			"\x66\x66\x66\x66\x2e\x0f\x1f\x84\0\0\0\0\0"
+			"\x64\x48\x8b\x04\x25\0\0\0", 22);
+	      else if (ABI_64_P (output_bfd))
 		memcpy (contents + rel->r_offset - 3,
 			"\x66\x66\x66\x64\x48\x8b\x04\x25\0\0\0", 12);
 	      else
 		memcpy (contents + rel->r_offset - 3,
 			"\x0f\x1f\x40\x00\x64\x8b\x04\x25\0\0\0", 12);
-	      /* Skip R_X86_64_PC32/R_X86_64_PLT32.  */
+	      /* Skip R_X86_64_PC32/R_X86_64_PLT32/R_X86_64_PLTOFF64.  */
 	      rel++;
 	      continue;
 	    }
__thread int a;
static __thread int b;
extern __thread int c;

int
foo ()
{
  /* Add up the current values of the three thread-local variables
     (a, b and c), then bump each of them by one so the next call
     sees the incremented values.  */
  int sum = a + b + c;

  a += 1;
  b += 1;
  c += 1;
  return sum;
}

int
main ()
{
  int first;
  int second;

  /* Seed the thread-local variables; the first foo () call then
     returns 4 + 5 + 6 and the second one 5 + 6 + 7.  */
  a = 4;
  b = 5;
  c = 6;

  first = foo ();
  second = foo ();

  /* 15 + 18 == 33, so the exit status is 0 when both calls read and
     update the variables as expected.  */
  return first + second - 33;
}
__thread int c;

Comments

Uros Bizjak Aug. 14, 2013, 7:03 a.m. UTC | #1
On Tue, Aug 13, 2013 at 9:42 PM, Jakub Jelinek <jakub@redhat.com> wrote:

> We right now ICE with -mcmodel=large -fpic on x86_64 on TLS GD and LD
> sequences, because obviously we can't call __tls_get_addr@plt there from code
> potentially more than 2GB away from the PLT slot.
>
> The attached patches add support for that in gcc and also teaches linker
> about those, because otherwise the linker will fail if you try to link such
> -mcmodel=large -fpic code into binaries or PIEs.
>
> To make transitions possible, we emit always
> leaq foo@tlsgd(%rip), %rdi
> movabsq $__tls_get_addr@pltoff, %rax
> addq $rbx, %rax
> call *%rax
> resp.
> leaq foo@tlsld(%rip), %rdi
> movabsq $__tls_get_addr@pltoff, %rax
> addq $rbx, %rax
> call *%rax
> sequences (22 bytes, 6 bytes longer than what we do for TLSGD for normal
> libraries).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, attached is also the
> sources I've used to test all the 3 different transitions.
>
> Ok for trunk and 4.8 branch (and binutils trunk)?

The implementation for x86 is technically OK, but I wonder if these
sequences should be documented in some authoritative document about
TLS relocations. The "ELF Handling For Thread-Local Storage" document
[1] doesn't mention various code models for x86_64, so I was not able
to cross-check the implementation vs. documentation.

[1] http://www.akkadia.org/drepper/tls.pdf

Thanks,
Uros.
H.J. Lu Aug. 19, 2013, 5:38 p.m. UTC | #2
On Wed, Aug 14, 2013 at 12:03 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Tue, Aug 13, 2013 at 9:42 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>
>> We right now ICE with -mcmodel=large -fpic on x86_64 on TLS GD and LD
>> sequences, because obviously we can't call __tls_get_addr@plt there from code
>> potentially more than 2GB away from the PLT slot.
>>
>> The attached patches add support for that in gcc and also teaches linker
>> about those, because otherwise the linker will fail if you try to link such
>> -mcmodel=large -fpic code into binaries or PIEs.
>>
>> To make transitions possible, we emit always
>> leaq foo@tlsgd(%rip), %rdi
>> movabsq $__tls_get_addr@pltoff, %rax
>> addq $rbx, %rax
>> call *%rax
>> resp.
>> leaq foo@tlsld(%rip), %rdi
>> movabsq $__tls_get_addr@pltoff, %rax
>> addq $rbx, %rax
>> call *%rax
>> sequences (22 bytes, 6 bytes longer than what we do for TLSGD for normal
>> libraries).
>>
>> Bootstrapped/regtested on x86_64-linux and i686-linux, attached is also the
>> sources I've used to test all the 3 different transitions.
>>
>> Ok for trunk and 4.8 branch (and binutils trunk)?
>
> The implementation for x86 is technically OK, but I wonder if these
> sequences should be documented in some authoritative document about
> TLS relocations. The "ELF Handling For Thread-Local Storage" document
> [1] doesn't mention various code models fo x86_64, so I was not able
> to cross-check the implementaton vs. documentation.
>
> [1] http://www.akkadia.org/drepper/tls.pdf
>

I agree.  We need to document the TLS code sequences
for PIC/non-PIC medium/large models first.
Jakub Jelinek Aug. 28, 2013, 9:37 a.m. UTC | #3
On Wed, Aug 14, 2013 at 09:03:24AM +0200, Uros Bizjak wrote:
> The implementation for x86 is technically OK, but I wonder if these
> sequences should be documented in some authoritative document about
> TLS relocations. The "ELF Handling For Thread-Local Storage" document
> [1] doesn't mention various code models fo x86_64, so I was not able
> to cross-check the implementaton vs. documentation.
> 
> [1] http://www.akkadia.org/drepper/tls.pdf

Ping, are the patches ok for gcc trunk and binutils trunk?
Uli has kindly updated the docs some time ago.

	Jakub
Uros Bizjak Aug. 28, 2013, 9:39 a.m. UTC | #4
On Wed, Aug 28, 2013 at 11:37 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Wed, Aug 14, 2013 at 09:03:24AM +0200, Uros Bizjak wrote:
>> The implementation for x86 is technically OK, but I wonder if these
>> sequences should be documented in some authoritative document about
>> TLS relocations. The "ELF Handling For Thread-Local Storage" document
>> [1] doesn't mention various code models fo x86_64, so I was not able
>> to cross-check the implementaton vs. documentation.
>>
>> [1] http://www.akkadia.org/drepper/tls.pdf
>
> Ping, are the patches ok for gcc trunk and binutils trunk?
> Uli has kindly updated the docs some time ago.

OK for gcc.

Thanks,
Uros.
H.J. Lu Aug. 28, 2013, 3:30 p.m. UTC | #5
On Wed, Aug 28, 2013 at 2:37 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Wed, Aug 14, 2013 at 09:03:24AM +0200, Uros Bizjak wrote:
>> The implementation for x86 is technically OK, but I wonder if these
>> sequences should be documented in some authoritative document about
>> TLS relocations. The "ELF Handling For Thread-Local Storage" document
>> [1] doesn't mention various code models fo x86_64, so I was not able
>> to cross-check the implementaton vs. documentation.
>>
>> [1] http://www.akkadia.org/drepper/tls.pdf
>
> Ping, are the patches ok for gcc trunk and binutils trunk?
> Uli has kindly updated the docs some time ago.
>

Linker change is OK with testcases for GD and LD.

Thanks.
diff mbox

Patch

--- gcc/config/i386/i386.md.jj	2013-08-13 12:20:20.000000000 +0200
+++ gcc/config/i386/i386.md	2013-08-13 15:03:55.632194607 +0200
@@ -12303,11 +12303,33 @@  (define_insn "*tls_global_dynamic_64_<mo
    (set (attr "length")
 	(symbol_ref "TARGET_X32 ? 15 : 16"))])
 
+(define_insn "*tls_global_dynamic_64_largepic"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+	(call:DI
+	 (mem:QI (plus:DI (match_operand:DI 2 "register_operand" "b")
+			  (match_operand:DI 3 "immediate_operand" "i")))
+	 (match_operand 4)))
+   (unspec:DI [(match_operand 1 "tls_symbolic_operand")]
+	     UNSPEC_TLS_GD)]
+  "TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF
+   && GET_CODE (operands[3]) == CONST
+   && GET_CODE (XEXP (operands[3], 0)) == UNSPEC
+   && XINT (XEXP (operands[3], 0), 1) == UNSPEC_PLTOFF"
+{
+  output_asm_insn
+    ("lea{q}\t{%E1@tlsgd(%%rip), %%rdi|rdi, %E1@tlsgd[rip]}", operands);
+  output_asm_insn ("movabs{q}\t{%3, %%rax|rax, %3}", operands);
+  output_asm_insn ("add{q}\t{%2, %%rax|rax, %2}", operands);
+  return "call\t{*%%rax|rax}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "22")])
+
 (define_expand "tls_global_dynamic_64_<mode>"
   [(parallel
     [(set (match_operand:P 0 "register_operand")
 	  (call:P
-	   (mem:QI (match_operand 2 "constant_call_address_operand"))
+	   (mem:QI (match_operand 2))
 	   (const_int 0)))
      (unspec:P [(match_operand 1 "tls_symbolic_operand")]
 	       UNSPEC_TLS_GD)])]
@@ -12365,11 +12387,32 @@  (define_insn "*tls_local_dynamic_base_64
   [(set_attr "type" "multi")
    (set_attr "length" "12")])
 
+(define_insn "*tls_local_dynamic_base_64_largepic"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+	(call:DI
+	 (mem:QI (plus:DI (match_operand:DI 1 "register_operand" "b")
+			  (match_operand:DI 2 "immediate_operand" "i")))
+	 (match_operand 3)))
+   (unspec:DI [(const_int 0)] UNSPEC_TLS_LD_BASE)]
+  "TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF
+   && GET_CODE (operands[2]) == CONST
+   && GET_CODE (XEXP (operands[2], 0)) == UNSPEC
+   && XINT (XEXP (operands[2], 0), 1) == UNSPEC_PLTOFF"
+{
+  output_asm_insn
+    ("lea{q}\t{%&@tlsld(%%rip), %%rdi|rdi, %&@tlsld[rip]}", operands);
+  output_asm_insn ("movabs{q}\t{%2, %%rax|rax, %2}", operands);
+  output_asm_insn ("add{q}\t{%1, %%rax|rax, %1}", operands);
+  return "call\t{*%%rax|rax}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "22")])
+
 (define_expand "tls_local_dynamic_base_64_<mode>"
   [(parallel
      [(set (match_operand:P 0 "register_operand")
 	   (call:P
-	    (mem:QI (match_operand 1 "constant_call_address_operand"))
+	    (mem:QI (match_operand 1))
 	    (const_int 0)))
       (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)])]
   "TARGET_64BIT")
--- gcc/config/i386/i386.c.jj	2013-08-13 12:20:20.000000000 +0200
+++ gcc/config/i386/i386.c	2013-08-13 14:42:32.449334139 +0200
@@ -13220,6 +13220,14 @@  ix86_tls_get_addr (void)
       ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
     }
 
+  if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
+    {
+      rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
+				   UNSPEC_PLTOFF);
+      return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
+			   gen_rtx_CONST (Pmode, unspec));
+    }
+
   return ix86_tls_symbol;
 }