This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

[PATCH] Fix sparc64 .plt[32768+] handling, optimize


Hi!

sparc64 did not get .plt[32768+] slots right (addend must be added only when
storing the magic value into .plt slot, not when jumping into the function).
In addition to this, this patch optimizes:
LD_BIND_NOW=1 (and prelink conflicts) does not have to worry about thread
safety when writing .plt slots, so it can save setting %g1 and immediately
setting it for the second time (ie. one wasted insn and one wasted cycle).
Also, if plt destination is near (e.g. into the same library), we can use
branch instruction, plus optimize for the common library location
0xfffff80000000000 - 0xfffff800ffffffff.

2001-09-26  Jakub Jelinek  <jakub@redhat.com>

	* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_fixup_plt): Call
	sparc64_fixup_plt.
	(sparc64_fixup_plt): Moved from elf_machine_fixup_plt. Optimize
	near jumps and 0xfffff800XXXXXXXX target addresses, no thread safety
	for non-lazy binding. Fix .plt[32768+] handling.
	(elf_machine_plt_value): Don't add addend.
	(elf_machine_rela): Call sparc64_fixup_plt instead of
	elf_machine_fixup_plt.
	(elf_machine_runtime_setup, TRAMPOLINE_TEMPLATE): Optimize for
	dynamic linker at 0xfffff800XXXXXXXX.
	* sysdeps/sparc/sparc32/fpu/libm-test-ulps: Update.

--- libc/sysdeps/sparc/sparc64/dl-machine.h.jj	Wed Sep 12 09:57:12 2001
+++ libc/sysdeps/sparc/sparc64/dl-machine.h	Wed Sep 26 21:02:20 2001
@@ -84,41 +84,51 @@ elf_machine_load_address (void)
 
 /* We have 4 cases to handle.  And we code different code sequences
    for each one.  I love V9 code models...  */
-static inline Elf64_Addr
-elf_machine_fixup_plt (struct link_map *map, lookup_t t,
-		       const Elf64_Rela *reloc,
-		       Elf64_Addr *reloc_addr, Elf64_Addr value)
+static inline void
+sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
+		   Elf64_Addr *reloc_addr, Elf64_Addr value,
+		   Elf64_Addr high, int t)
 {
   unsigned int *insns = (unsigned int *) reloc_addr;
   Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
+  Elf64_Sxword disp = value - plt_vaddr;
 
   /* Now move plt_vaddr up to the call instruction.  */
-  plt_vaddr += (2 * 4);
+  plt_vaddr += ((t + 1) * 4);
 
   /* PLT entries .PLT32768 and above look always the same.  */
-  if (__builtin_expect (reloc->r_addend, 0) != 0)
+  if (__builtin_expect (high, 0) != 0)
     {
       *reloc_addr = value - map->l_addr;
     }
+  /* Near destination.  */
+  else if (disp >= -0x800000 && disp < 0x800000)
+    {
+      /* As this is just one instruction, it is thread safe and so
+	 we can avoid the unnecessary sethi FOO, %g1.
+	 b,a target  */
+      insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
   /* 32-bit Sparc style, the target is in the lower 32-bits of
      address space.  */
-  else if ((value >> 32) == 0)
+  else if (insns += t, (value >> 32) == 0)
     {
       /* sethi	%hi(target), %g1
 	 jmpl	%g1 + %lo(target), %g0  */
 
-      insns[2] = 0x81c06000 | (value & 0x3ff);
-      __asm __volatile ("flush %0 + 8" : : "r" (insns));
-
-      insns[1] = 0x03000000 | ((unsigned int)(value >> 10));
+      insns[1] = 0x81c06000 | (value & 0x3ff);
       __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x03000000 | ((unsigned int)(value >> 10));
+      __asm __volatile ("flush %0" : : "r" (insns));
     }
   /* We can also get somewhat simple sequences if the distance between
      the target and the PLT entry is within +/- 2GB.  */
   else if ((plt_vaddr > value
-	    && ((plt_vaddr - value) >> 32) == 0)
+	    && ((plt_vaddr - value) >> 31) == 0)
 	   || (value > plt_vaddr
-	       && ((value - plt_vaddr) >> 32) == 0))
+	       && ((value - plt_vaddr) >> 31) == 0))
     {
       unsigned int displacement;
 
@@ -131,14 +141,14 @@ elf_machine_fixup_plt (struct link_map *
 	 call	displacement
 	  mov	%g1, %o7  */
 
-      insns[3] = 0x9e100001;
-      __asm __volatile ("flush %0 + 12" : : "r" (insns));
-
-      insns[2] = 0x40000000 | (displacement >> 2);
+      insns[2] = 0x9e100001;
       __asm __volatile ("flush %0 + 8" : : "r" (insns));
 
-      insns[1] = 0x8210000f;
+      insns[1] = 0x40000000 | (displacement >> 2);
       __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[t] = 0x8210000f;
+      __asm __volatile ("flush %0" : : "r" (insns));
     }
   /* Worst case, ho hum...  */
   else
@@ -149,33 +159,62 @@ elf_machine_fixup_plt (struct link_map *
       /* ??? Some tricks can be stolen from the sparc64 egcs backend
 	     constant formation code I wrote.  -DaveM  */
 
-      /* sethi	%hh(value), %g1
-	 sethi	%lm(value), %g5
-	 or	%g1, %hm(value), %g1
-	 or	%g5, %lo(value), %g5
-	 sllx	%g1, 32, %g1
-	 jmpl	%g1 + %g5, %g0
-	  nop  */
-
-      insns[6] = 0x81c04005;
-      __asm __volatile ("flush %0 + 24" : : "r" (insns));
-
-      insns[5] = 0x83287020;
-      __asm __volatile ("flush %0 + 20" : : "r" (insns));
+      if (__builtin_expect (high32 & 0x3ff, 0))
+	{
+	  /* sethi	%hh(value), %g1
+	     sethi	%lm(value), %g5
+	     or		%g1, %hm(value), %g1
+	     or		%g5, %lo(value), %g5
+	     sllx	%g1, 32, %g1
+	     jmpl	%g1 + %g5, %g0
+	      nop  */
+
+	  insns[5] = 0x81c04005;
+	  __asm __volatile ("flush %0 + 20" : : "r" (insns));
+
+	  insns[4] = 0x83287020;
+	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
+
+	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
+	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+	  insns[2] = 0x82106000 | (high32 & 0x3ff);
+	}
+      else
+	{
+	  /* sethi	%hh(value), %g1
+	     sethi	%lm(value), %g5
+	     sllx	%g1, 32, %g1
+	     or		%g5, %lo(value), %g5
+	     jmpl	%g1 + %g5, %g0
+	      nop  */
+
+	  insns[4] = 0x81c04005;
+	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
 
-      insns[4] = 0x8a116000 | (low32 & 0x3ff);
-      __asm __volatile ("flush %0 + 16" : : "r" (insns));
+	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
+	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
 
-      insns[3] = 0x82106000 | (high32 & 0x3ff);
-      __asm __volatile ("flush %0 + 12" : : "r" (insns));
+	  insns[2] = 0x83287020;
+	}
 
-      insns[2] = 0x0b000000 | (low32 >> 10);
       __asm __volatile ("flush %0 + 8" : : "r" (insns));
 
-      insns[1] = 0x03000000 | (high32 >> 10);
+      insns[1] = 0x0b000000 | (low32 >> 10);
       __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x03000000 | (high32 >> 10);
+      __asm __volatile ("flush %0" : : "r" (insns));
     }
+}
 
+static inline Elf64_Addr
+elf_machine_fixup_plt (struct link_map *map, lookup_t t,
+		       const Elf64_Rela *reloc,
+		       Elf64_Addr *reloc_addr, Elf64_Addr value)
+{
+  sparc64_fixup_plt (map, reloc, reloc_addr, value + reloc->r_addend,
+		     reloc->r_addend, 1);
   return value;
 }
 
@@ -184,7 +223,10 @@ static inline Elf64_Addr
 elf_machine_plt_value (struct link_map *map, const Elf64_Rela *reloc,
 		       Elf64_Addr value)
 {
-  return value + reloc->r_addend;
+  /* Don't add addend here, but in elf_machine_fixup_plt instead.
+     value + reloc->r_addend is the value which should actually be
+     stored into .plt data slot.  */
+  return value;
 }
 
 #ifdef RESOLVE
@@ -329,7 +371,8 @@ elf_machine_rela (struct link_map *map, 
 	  break;
 #endif
 	case R_SPARC_JMP_SLOT:
-	  elf_machine_fixup_plt(map, 0, reloc, reloc_addr, value);
+	  sparc64_fixup_plt (map, reloc, reloc_addr, value,
+			     reloc->r_addend, 0);
 	  break;
 #ifndef RTLD_BOOTSTRAP
 	case R_SPARC_UA16:
@@ -425,6 +468,7 @@ elf_machine_runtime_setup (struct link_m
       extern void _dl_runtime_profile_1 (void);
       Elf64_Addr res0_addr, res1_addr;
       unsigned int *plt = (void *) D_PTR (l, l_info[DT_PLTGOT]);
+      int i = 0;
 
       if (! profile)
 	{
@@ -473,13 +517,21 @@ elf_machine_runtime_setup (struct link_m
        */
 
       plt[8 + 0] = 0x9de3bf40;
+      if (__builtin_expect (((res1_addr + 4) >> 32) & 0x3ff, 0))
+	i = 1;
+      else
+	res1_addr += 4;
       plt[8 + 1] = 0x21000000 | (res1_addr >> (64 - 22));
       plt[8 + 2] = 0x23000000 | ((res1_addr >> 10) & 0x003fffff);
-      plt[8 + 3] = 0xa0142000 | ((res1_addr >> 32) & 0x3ff);
+      if (__builtin_expect (i, 0))
+	plt[8 + 3] = 0xa0142000 | ((res1_addr >> 32) & 0x3ff);
+      else
+	plt[8 + 3] = 0xa12c3020;
       plt[8 + 4] = 0xa2146000 | (res1_addr & 0x3ff);
-      plt[8 + 5] = 0xa12c3020;
-      plt[8 + 6] = 0xadc40011;
-      plt[8 + 7] = 0x9330700c;
+      if (__builtin_expect (i, 0))
+	plt[8 + 5] = 0xa12c3020;
+      plt[8 + 5 + i] = 0xadc40011;
+      plt[8 + 6 + i] = 0x9330700c;
 
       /* Now put the magic cookie at the beginning of .PLT2
 	 Entry .PLT3 is unused by this implementation.  */
@@ -526,10 +578,11 @@ elf_machine_runtime_setup (struct link_m
 "\n"							\
 "	.globl	" #tramp_name "_1\n"			\
 "	.type	" #tramp_name "_1, @function\n"		\
-"	.align	32\n"					\
+"	! tramp_name_1 + 4 needs to be .align 32\n"	\
 "\t" #tramp_name "_1:\n"				\
+"	sub	%l6, 4, %l6\n"				\
 "	! srlx	%g1, 12, %o1 - Done in .PLT1\n"		\
-"	ldx	[%l6 + 8], %o0\n"			\
+"	ldx	[%l6 + 12], %o0\n"			\
 "	add	%o1, %o1, %o3\n"			\
 "	sub	%o1, 96, %o1	! No thanks to Sun for not obeying their own ABI\n" \
 "	mov	%i7, %o2\n"				\
--- libc/sysdeps/sparc/sparc32/fpu/libm-test-ulps.jj	Mon Jun 25 10:35:00 2001
+++ libc/sysdeps/sparc/sparc32/fpu/libm-test-ulps	Wed Sep 26 17:51:50 2001
@@ -451,6 +451,16 @@ ifloat: 1
 Test "j0 (2.0) == 0.22389077914123566805":
 float: 2
 ifloat: 2
+Test "j0 (4.0) == -3.9714980986384737228659076845169804197562E-1"
+double: 1
+idouble: 1
+float: 1
+ifloat: 1
+Test "j0 (-4.0) == -3.9714980986384737228659076845169804197562E-1"
+double: 1
+idouble: 1
+float: 1
+ifloat: 1
 Test "j0 (8.0) == 0.17165080713755390609":
 float: 1
 ifloat: 1

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]