[v2,5/6] Refactor hp-timing rtld usage
diff mbox series

Message ID 20190218211128.1869-5-adhemerval.zanella@linaro.org
State New
Headers show
Series
  • [v2,1/6] nptl: Remove pthread_clock_gettime pthread_clock_settime
Related show

Commit Message

Adhemerval Zanella Feb. 18, 2019, 9:11 p.m. UTC
Changes from previous version:

  - Add rtld_timer_start, rtld_timer_stop, and rtld_timer_accum to simplify
    the profiling timer calculation.  It allows remove the declaration of
    some temporary variables.

  - Simplify print_statistics and refactor HP_TIMING_PRINT to remove 'cycle'
    print.

---

This patch refactor how hp-timing is used on loader code for statistics
report.  The HP_TIMING_AVAIL and HP_SMALL_TIMING_AVAIL are removed and
HP_TIMING_INLINE is used instead to check for hp-timing avaliability.
For alpha, which only defines HP_SMALL_TIMING_AVAIL, the HP_TIMING_INLINE
is set iff for IS_IN(rtld).

Checked on aarch64-linux-gnu, x86_64-linux-gnu, and i686-linux-gnu. I also
checked the builds for all afected ABIs.

	* benchtests/bench-timing.h: Replace HP_TIMING_AVAIL with
	HP_TIMING_INLINE.
	* nptl/descr.h: Likewise.
	* elf/rtld.c (RLTD_TIMING_DECLARE, RTLD_TIMING_NOW, RTLD_TIMING_DIFF,
	RTLD_TIMING_ACCUM_NT, RTLD_TIMING_SET): Define.
	(dl_start_final_info, _dl_start_final, dl_main, print_statistics):
	Abstract hp-timing usage with RTLD_* macros.
	* sysdeps/alpha/hp-timing.h (HP_TIMING_INLINE): Define iff IS_IN(rtld).
	(HP_TIMING_AVAIL, HP_SMALL_TIMING_AVAIL): Remove.
	* sysdeps/generic/hp-timing.h (HP_TIMING_AVAIL, HP_SMALL_TIMING_AVAIL,
	HP_TIMING_NONAVAIL): Likewise.
	* sysdeps/ia64/hp-timing.h (HP_TIMING_AVAIL, HP_SMALL_TIMING_AVAIL):
	Likewise.
	* sysdeps/powerpc/powerpc32/power4/hp-timing.h (HP_TIMING_AVAIL,
	HP_SMALL_TIMING_AVAIL): Likewise.
	* sysdeps/powerpc/powerpc64/hp-timing.h (HP_TIMING_AVAIL,
	HP_SMALL_TIMING_AVAIL): Likewise.
	* sysdeps/sparc/sparc32/sparcv9/hp-timing.h (HP_TIMING_AVAIL,
	HP_SMALL_TIMING_AVAIL): Likewise.
	* sysdeps/sparc/sparc64/hp-timing.h (HP_TIMING_AVAIL,
	HP_SMALL_TIMING_AVAIL): Likewise.
	* sysdeps/x86/hp-timing.h (HP_TIMING_AVAIL, HP_SMALL_TIMING_AVAIL):
	Likewise.
	* sysdeps/generic/hp-timing-common.h: Update comment with
	HP_TIMING_AVAIL removal.
	(HP_TIMING_PRINT_SIZE): New define.
	(HP_TIMING_PRINT): Remove cycle printing.
---
 benchtests/bench-timing.h                    |   2 +-
 elf/rtld.c                                   | 281 +++++++++----------
 nptl/descr.h                                 |   2 +-
 sysdeps/alpha/hp-timing.h                    |  18 +-
 sysdeps/generic/hp-timing-common.h           |   8 +-
 sysdeps/generic/hp-timing.h                  |   5 -
 sysdeps/ia64/hp-timing.h                     |   4 -
 sysdeps/powerpc/powerpc32/power4/hp-timing.h |   4 -
 sysdeps/powerpc/powerpc64/hp-timing.h        |   4 -
 sysdeps/sparc/sparc32/sparcv9/hp-timing.h    |   2 -
 sysdeps/sparc/sparc64/hp-timing.h            |   2 -
 sysdeps/x86/hp-timing.h                      |   4 -
 12 files changed, 144 insertions(+), 192 deletions(-)

Patch
diff mbox series

diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h
index 41b7324527..93fe379f99 100644
--- a/benchtests/bench-timing.h
+++ b/benchtests/bench-timing.h
@@ -21,7 +21,7 @@ 
 #include <hp-timing.h>
 #include <stdint.h>
 
-#if HP_TIMING_AVAIL && !defined USE_CLOCK_GETTIME
+#if HP_TIMING_INLINE && !defined USE_CLOCK_GETTIME
 # define GL(x) _##x
 # define GLRO(x) _##x
 typedef hp_timing_t timing_t;
diff --git a/elf/rtld.c b/elf/rtld.c
index 1f124b31fc..18d30074ea 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -46,6 +46,49 @@ 
 
 #include <assert.h>
 
+/* Only enables rtld profiling for architectures which provides non generic
+   hp-timing support.  The generic support requires either syscall
+   (clock_gettime), which will incur in extra overhead  on loading time.
+   Using vDSO is also an option, but it will require extra support on loader
+   to setup the vDSO pointer before its usage.  */
+#if HP_TIMING_INLINE
+# define RLTD_TIMING_DECLARE(var, classifier,...) \
+  classifier hp_timing_t var __VA_ARGS__
+# define RTLD_TIMING_VAR(var)        RLTD_TIMING_DECLARE (var, )
+# define RTLD_TIMING_SET(var, value) (var) = (value)
+# define RTLD_TIMING_REF(var)        &(var)
+
+static inline void
+rtld_timer_start (hp_timing_t *var)
+{
+  HP_TIMING_NOW (*var);
+}
+
+static inline void
+rtld_timer_stop (hp_timing_t *var, hp_timing_t start)
+{
+  hp_timing_t stop;
+  HP_TIMING_NOW (stop);
+  HP_TIMING_DIFF (*var, start, stop);
+}
+
+static inline void
+rtld_timer_accum (hp_timing_t *sum, hp_timing_t start)
+{
+  hp_timing_t stop;
+  rtld_timer_stop (&stop, start);
+  HP_TIMING_ACCUM_NT(*sum, stop);
+}
+#else
+# define RLTD_TIMING_DECLARE(var, classifier...)
+# define RTLD_TIMING_SET(var, value)
+# define RTLD_TIMING_VAR(var)
+# define RTLD_TIMING_REF(var)			 0
+# define rtld_timer_start(var)
+# define rtld_timer_stop(var, start)
+# define rtld_timer_accum(sum, start)
+#endif
+
 /* Avoid PLT use for our local calls at startup.  */
 extern __typeof (__mempcpy) __mempcpy attribute_hidden;
 
@@ -62,7 +105,7 @@  static void print_missing_version (int errcode, const char *objname,
 				   const char *errsting);
 
 /* Print the various times we collected.  */
-static void print_statistics (hp_timing_t *total_timep);
+static void print_statistics (const hp_timing_t *total_timep);
 
 /* Add audit objects.  */
 static void process_dl_audit (char *str);
@@ -303,11 +346,9 @@  static struct libname_list _dl_rtld_libname;
 static struct libname_list _dl_rtld_libname2;
 
 /* Variable for statistics.  */
-#ifndef HP_TIMING_NONAVAIL
-static hp_timing_t relocate_time;
-static hp_timing_t load_time attribute_relro;
-static hp_timing_t start_time attribute_relro;
-#endif
+RLTD_TIMING_DECLARE (relocate_time, static);
+RLTD_TIMING_DECLARE (load_time,     static, attribute_relro);
+RLTD_TIMING_DECLARE (start_time,    static, attribute_relro);
 
 /* Additional definitions needed by TLS initialization.  */
 #ifdef TLS_INIT_HELPER
@@ -335,9 +376,7 @@  static ElfW(Addr) _dl_start_final (void *arg);
 struct dl_start_final_info
 {
   struct link_map l;
-#if !defined HP_TIMING_NONAVAIL && HP_TIMING_INLINE
-  hp_timing_t start_time;
-#endif
+  RTLD_TIMING_VAR (start_time);
 };
 static ElfW(Addr) _dl_start_final (void *arg,
 				   struct dl_start_final_info *info);
@@ -371,16 +410,11 @@  _dl_start_final (void *arg, struct dl_start_final_info *info)
 {
   ElfW(Addr) start_addr;
 
-  if (HP_SMALL_TIMING_AVAIL)
-    {
-      /* If it hasn't happen yet record the startup time.  */
-      if (! HP_TIMING_INLINE)
-	HP_TIMING_NOW (start_time);
-#if !defined DONT_USE_BOOTSTRAP_MAP && !defined HP_TIMING_NONAVAIL
-      else
-	start_time = info->start_time;
+  /* If it hasn't happen yet record the startup time.  */
+  rtld_timer_start (&start_time);
+#if !defined DONT_USE_BOOTSTRAP_MAP
+  RTLD_TIMING_SET (start_time, info->start_time);
 #endif
-    }
 
   /* Transfer data about ourselves to the permanent link_map structure.  */
 #ifndef DONT_USE_BOOTSTRAP_MAP
@@ -412,27 +446,11 @@  _dl_start_final (void *arg, struct dl_start_final_info *info)
      entry point on the same stack we entered on.  */
   start_addr = _dl_sysdep_start (arg, &dl_main);
 
-#ifndef HP_TIMING_NONAVAIL
-  hp_timing_t rtld_total_time;
-  if (HP_SMALL_TIMING_AVAIL)
-    {
-      hp_timing_t end_time;
-
-      /* Get the current time.  */
-      HP_TIMING_NOW (end_time);
-
-      /* Compute the difference.  */
-      HP_TIMING_DIFF (rtld_total_time, start_time, end_time);
-    }
-#endif
-
   if (__glibc_unlikely (GLRO(dl_debug_mask) & DL_DEBUG_STATISTICS))
     {
-#ifndef HP_TIMING_NONAVAIL
-      print_statistics (&rtld_total_time);
-#else
-      print_statistics (NULL);
-#endif
+      RTLD_TIMING_VAR (rtld_total_time);
+      rtld_timer_stop (&rtld_total_time, start_time);
+      print_statistics (RTLD_TIMING_REF(rtld_total_time));
     }
 
   return start_addr;
@@ -457,11 +475,10 @@  _dl_start (void *arg)
 #define RESOLVE_MAP(sym, version, flags) BOOTSTRAP_MAP
 #include "dynamic-link.h"
 
-  if (HP_TIMING_INLINE && HP_SMALL_TIMING_AVAIL)
 #ifdef DONT_USE_BOOTSTRAP_MAP
-    HP_TIMING_NOW (start_time);
+  rtld_timer_start (&start_time);
 #else
-    HP_TIMING_NOW (info.start_time);
+  rtld_timer_start (&info.start_time);
 #endif
 
   /* Partly clean the `bootstrap_map' structure up.  Don't use
@@ -1078,11 +1095,6 @@  dl_main (const ElfW(Phdr) *phdr,
   unsigned int i;
   bool prelinked = false;
   bool rtld_is_main = false;
-#ifndef HP_TIMING_NONAVAIL
-  hp_timing_t start;
-  hp_timing_t stop;
-  hp_timing_t diff;
-#endif
   void *tcbp = NULL;
 
   GL(dl_init_static_tls) = &_dl_nothread_init_static_tls;
@@ -1258,12 +1270,11 @@  of this helper program; chances are you did not intend to run this program.\n\
 	}
       else
 	{
-	  HP_TIMING_NOW (start);
+	  RTLD_TIMING_VAR (start);
+	  rtld_timer_start (&start);
 	  _dl_map_object (NULL, rtld_progname, lt_executable, 0,
 			  __RTLD_OPENEXEC, LM_ID_BASE);
-	  HP_TIMING_NOW (stop);
-
-	  HP_TIMING_DIFF (load_time, start, stop);
+	  rtld_timer_stop (&load_time, start);
 	}
 
       /* Now the map for the main executable is available.  */
@@ -1666,20 +1677,18 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
 
   if (__glibc_unlikely (preloadlist != NULL))
     {
-      HP_TIMING_NOW (start);
+      RTLD_TIMING_VAR (start);
+      rtld_timer_start (&start);
       npreloads += handle_preload_list (preloadlist, main_map, "LD_PRELOAD");
-      HP_TIMING_NOW (stop);
-      HP_TIMING_DIFF (diff, start, stop);
-      HP_TIMING_ACCUM_NT (load_time, diff);
+      rtld_timer_accum (&load_time, start);
     }
 
   if (__glibc_unlikely (preloadarg != NULL))
     {
-      HP_TIMING_NOW (start);
+      RTLD_TIMING_VAR (start);
+      rtld_timer_start (&start);
       npreloads += handle_preload_list (preloadarg, main_map, "--preload");
-      HP_TIMING_NOW (stop);
-      HP_TIMING_DIFF (diff, start, stop);
-      HP_TIMING_ACCUM_NT (load_time, diff);
+      rtld_timer_accum (&load_time, start);
     }
 
   /* There usually is no ld.so.preload file, it should only be used
@@ -1739,7 +1748,8 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
 	      file[file_size - 1] = '\0';
 	    }
 
-	  HP_TIMING_NOW (start);
+	  RTLD_TIMING_VAR (start);
+	  rtld_timer_start (&start);
 
 	  if (file != problem)
 	    {
@@ -1757,9 +1767,7 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
 	      npreloads += do_preload (p, main_map, preload_file);
 	    }
 
-	  HP_TIMING_NOW (stop);
-	  HP_TIMING_DIFF (diff, start, stop);
-	  HP_TIMING_ACCUM_NT (load_time, diff);
+	  rtld_timer_accum (&load_time, start);
 
 	  /* We don't need the file anymore.  */
 	  __munmap (file, file_size);
@@ -1783,11 +1791,12 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
   /* Load all the libraries specified by DT_NEEDED entries.  If LD_PRELOAD
      specified some libraries to load, these are inserted before the actual
      dependencies in the executable's searchlist for symbol resolution.  */
-  HP_TIMING_NOW (start);
-  _dl_map_object_deps (main_map, preloads, npreloads, mode == trace, 0);
-  HP_TIMING_NOW (stop);
-  HP_TIMING_DIFF (diff, start, stop);
-  HP_TIMING_ACCUM_NT (load_time, diff);
+  {
+    RTLD_TIMING_VAR (start);
+    rtld_timer_start (&start);
+    _dl_map_object_deps (main_map, preloads, npreloads, mode == trace, 0);
+    rtld_timer_accum (&load_time, start);
+  }
 
   /* Mark all objects as being in the global scope.  */
   for (i = main_map->l_searchlist.r_nlist; i > 0; )
@@ -2180,12 +2189,10 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
       if (main_map->l_info [ADDRIDX (DT_GNU_CONFLICT)] != NULL)
 	{
 	  ElfW(Rela) *conflict, *conflictend;
-#ifndef HP_TIMING_NONAVAIL
-	  hp_timing_t start;
-	  hp_timing_t stop;
-#endif
 
-	  HP_TIMING_NOW (start);
+	  RTLD_TIMING_VAR (start);
+	  rtld_timer_start (&start);
+
 	  assert (main_map->l_info [VALIDX (DT_GNU_CONFLICTSZ)] != NULL);
 	  conflict = (ElfW(Rela) *)
 	    main_map->l_info [ADDRIDX (DT_GNU_CONFLICT)]->d_un.d_ptr;
@@ -2193,8 +2200,8 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
 	    ((char *) conflict
 	     + main_map->l_info [VALIDX (DT_GNU_CONFLICTSZ)]->d_un.d_val);
 	  _dl_resolve_conflicts (main_map, conflict, conflictend);
-	  HP_TIMING_NOW (stop);
-	  HP_TIMING_DIFF (relocate_time, start, stop);
+
+	  rtld_timer_stop (&relocate_time, start);
 	}
 
 
@@ -2222,15 +2229,12 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
 	 know that because it is self-contained).  */
 
       int consider_profiling = GLRO(dl_profile) != NULL;
-#ifndef HP_TIMING_NONAVAIL
-      hp_timing_t start;
-      hp_timing_t stop;
-#endif
 
       /* If we are profiling we also must do lazy reloaction.  */
       GLRO(dl_lazy) |= consider_profiling;
 
-      HP_TIMING_NOW (start);
+      RTLD_TIMING_VAR (start);
+      rtld_timer_start (&start);
       unsigned i = main_map->l_searchlist.r_nlist;
       while (i-- > 0)
 	{
@@ -2257,9 +2261,7 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
 	  if (l->l_tls_blocksize != 0 && tls_init_tp_called)
 	    _dl_add_to_slotinfo (l);
 	}
-      HP_TIMING_NOW (stop);
-
-      HP_TIMING_DIFF (relocate_time, start, stop);
+      rtld_timer_stop (&relocate_time, start);
 
       /* Now enable profiling if needed.  Like the previous call,
 	 this has to go here because the calls it makes should use the
@@ -2302,19 +2304,14 @@  ERROR: '%s': cannot process note segment.\n", _dl_argv[0]);
 	 re-relocation, we might call a user-supplied function
 	 (e.g. calloc from _dl_relocate_object) that uses TLS data.  */
 
-#ifndef HP_TIMING_NONAVAIL
-      hp_timing_t start;
-      hp_timing_t stop;
-      hp_timing_t add;
-#endif
+      RTLD_TIMING_VAR (start);
+      rtld_timer_start (&start);
 
-      HP_TIMING_NOW (start);
       /* Mark the link map as not yet relocated again.  */
       GL(dl_rtld_map).l_relocated = 0;
       _dl_relocate_object (&GL(dl_rtld_map), main_map->l_scope, 0, 0);
-      HP_TIMING_NOW (stop);
-      HP_TIMING_DIFF (add, start, stop);
-      HP_TIMING_ACCUM_NT (relocate_time, add);
+
+      rtld_timer_accum (&relocate_time, start);
     }
 
   /* Do any necessary cleanups for the startup OS interface code.
@@ -2746,46 +2743,51 @@  process_envvars (enum mode *modep)
     }
 }
 
+#if HP_TIMING_INLINE
+static void
+print_statistics_item (const char *title, hp_timing_t time,
+		       hp_timing_t total)
+{
+  char cycles[HP_TIMING_PRINT_SIZE];
+  HP_TIMING_PRINT (cycles, sizeof (cycles), time);
+
+  char relative[3 * sizeof (hp_timing_t) + 2];
+  char *cp = _itoa ((1000ULL * time) / total, relative + sizeof (relative),
+		    10, 0);
+  /* Sets the decimal point.  */
+  char *wp = relative;
+  switch (relative + sizeof (relative) - cp)
+    {
+    case 3:
+      *wp++ = *cp++;
+      /* Fall through.  */
+    case 2:
+      *wp++ = *cp++;
+      /* Fall through.  */
+    case 1:
+      *wp++ = '.';
+      *wp++ = *cp++;
+    }
+  *wp = '\0';
+  _dl_debug_printf ("%s: %s cycles (%s%%)\n", title, cycles, relative);
+}
+#endif
 
 /* Print the various times we collected.  */
 static void
 __attribute ((noinline))
-print_statistics (hp_timing_t *rtld_total_timep)
+print_statistics (const hp_timing_t *rtld_total_timep)
 {
-#ifndef HP_TIMING_NONAVAIL
-  char buf[200];
-  char *cp;
-  char *wp;
-
-  /* Total time rtld used.  */
-  if (HP_SMALL_TIMING_AVAIL)
-    {
-      HP_TIMING_PRINT (buf, sizeof (buf), *rtld_total_timep);
-      _dl_debug_printf ("\nruntime linker statistics:\n"
-			"  total startup time in dynamic loader: %s\n", buf);
-
-      /* Print relocation statistics.  */
-      char pbuf[30];
-      HP_TIMING_PRINT (buf, sizeof (buf), relocate_time);
-      cp = _itoa ((1000ULL * relocate_time) / *rtld_total_timep,
-		  pbuf + sizeof (pbuf), 10, 0);
-      wp = pbuf;
-      switch (pbuf + sizeof (pbuf) - cp)
-	{
-	case 3:
-	  *wp++ = *cp++;
-	  /* Fall through.  */
-	case 2:
-	  *wp++ = *cp++;
-	  /* Fall through.  */
-	case 1:
-	  *wp++ = '.';
-	  *wp++ = *cp++;
-	}
-      *wp = '\0';
-      _dl_debug_printf ("\
-	    time needed for relocation: %s (%s%%)\n", buf, pbuf);
-    }
+#if HP_TIMING_INLINE
+  {
+    char cycles[HP_TIMING_PRINT_SIZE];
+    HP_TIMING_PRINT (cycles, sizeof (cycles), *rtld_total_timep);
+    _dl_debug_printf ("\nruntime linker statistics:\n"
+		      "  total startup time in dynamic loader: %s cycles\n",
+		      cycles);
+    print_statistics_item ("            time needed for relocation",
+			   relocate_time, *rtld_total_timep);
+  }
 #endif
 
   unsigned long int num_relative_relocations = 0;
@@ -2826,31 +2828,8 @@  print_statistics (hp_timing_t *rtld_total_timep)
 		    GL(dl_num_cache_relocations),
 		    num_relative_relocations);
 
-#ifndef HP_TIMING_NONAVAIL
-  /* Time spend while loading the object and the dependencies.  */
-  if (HP_SMALL_TIMING_AVAIL)
-    {
-      char pbuf[30];
-      HP_TIMING_PRINT (buf, sizeof (buf), load_time);
-      cp = _itoa ((1000ULL * load_time) / *rtld_total_timep,
-		  pbuf + sizeof (pbuf), 10, 0);
-      wp = pbuf;
-      switch (pbuf + sizeof (pbuf) - cp)
-	{
-	case 3:
-	  *wp++ = *cp++;
-	  /* Fall through.  */
-	case 2:
-	  *wp++ = *cp++;
-	  /* Fall through.  */
-	case 1:
-	  *wp++ = '.';
-	  *wp++ = *cp++;
-	}
-      *wp = '\0';
-      _dl_debug_printf ("\
-	   time needed to load objects: %s (%s%%)\n",
-				buf, pbuf);
-    }
+#if HP_TIMING_INLINE
+  print_statistics_item ("           time needed to load objects",
+			 load_time, *rtld_total_timep);
 #endif
 }
diff --git a/nptl/descr.h b/nptl/descr.h
index cb7d4c2282..b4db99fe3f 100644
--- a/nptl/descr.h
+++ b/nptl/descr.h
@@ -342,7 +342,7 @@  struct pthread
   /* Lock for synchronizing setxid calls.  */
   unsigned int setxid_futex;
 
-#if HP_TIMING_AVAIL
+#if HP_TIMING_INLINE
   hp_timing_t cpuclock_offset_ununsed;
 #endif
 
diff --git a/sysdeps/alpha/hp-timing.h b/sysdeps/alpha/hp-timing.h
index 481132663c..08d8d6627c 100644
--- a/sysdeps/alpha/hp-timing.h
+++ b/sysdeps/alpha/hp-timing.h
@@ -17,16 +17,13 @@ 
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef _HP_TIMING_H
-#define _HP_TIMING_H	1
+#ifndef _HP_TIMING_ALPHA_H
+#define _HP_TIMING_ALPHA_H	1
 
+#if IS_IN(rtld)
 /* We always have the timestamp register, but it's got only a 4 second
    range.  Use it for ld.so profiling only.  */
-#define HP_TIMING_AVAIL		(0)
-#define HP_SMALL_TIMING_AVAIL	(1)
-
-/* We indeed have inlined functions.  */
-#define HP_TIMING_INLINE	(1)
+# define HP_TIMING_INLINE	(1)
 
 /* We use 32 bit values for the times.  */
 typedef unsigned int hp_timing_t;
@@ -34,13 +31,16 @@  typedef unsigned int hp_timing_t;
 /* The "rpcc" instruction returns a 32-bit counting half and a 32-bit
    "virtual cycle counter displacement".  Subtracting the two gives us
    a virtual cycle count.  */
-#define HP_TIMING_NOW(VAR) \
+# define HP_TIMING_NOW(VAR) \
   do {									      \
     unsigned long int x_;						      \
     asm volatile ("rpcc %0" : "=r"(x_));				      \
     (VAR) = (int) (x_) - (int) (x_ >> 32);				      \
   } while (0)
+# include <hp-timing-common.h>
 
-#include <hp-timing-common.h>
+#else
+# include <sysdeps/generic/hp-timing.h>
+#endif /* IS_IN(rtld)  */
 
 #endif	/* hp-timing.h */
diff --git a/sysdeps/generic/hp-timing-common.h b/sysdeps/generic/hp-timing-common.h
index 0ffb853444..8749d25647 100644
--- a/sysdeps/generic/hp-timing-common.h
+++ b/sysdeps/generic/hp-timing-common.h
@@ -20,8 +20,6 @@ 
 /* In case a platform supports timers in the hardware the following macros
    and types must be defined:
 
-   - HP_TIMING_AVAIL: test for availability.
-
    - HP_TIMING_INLINE: this macro is non-zero if the functionality is not
      implemented using function calls but instead uses some inlined code
      which might simply consist of a few assembler instructions.  We have to
@@ -47,16 +45,16 @@ 
 /* Accumulate ADD into SUM.  No attempt is made to be thread-safe.  */
 #define HP_TIMING_ACCUM_NT(Sum, Diff)		((Sum) += (Diff))
 
+#define HP_TIMING_PRINT_SIZE (3 * sizeof (hp_timing_t) + 1)
+
 /* Write a decimal representation of the timing value into the given string.  */
 #define HP_TIMING_PRINT(Dest, Len, Val) 				\
   do {									\
-    char __buf[20];							\
+    char __buf[HP_TIMING_PRINT_SIZE];					\
     char *__dest = (Dest);						\
     size_t __len = (Len);						\
     char *__cp = _itoa ((Val), __buf + sizeof (__buf), 10, 0);		\
     size_t __cp_len = MIN (__buf + sizeof (__buf) - __cp, __len);	\
     memcpy (__dest, __cp, __cp_len);					\
-    memcpy (__dest + __cp_len, " cycles",				\
-	    MIN (__len - __cp_len, sizeof (" cycles")));		\
     __dest[__len - 1] = '\0';						\
   } while (0)
diff --git a/sysdeps/generic/hp-timing.h b/sysdeps/generic/hp-timing.h
index fb2a6036fc..278998d2c2 100644
--- a/sysdeps/generic/hp-timing.h
+++ b/sysdeps/generic/hp-timing.h
@@ -25,8 +25,6 @@ 
    the system call might be too high.  */
 
 /* Provide dummy definitions.  */
-#define HP_TIMING_AVAIL		(0)
-#define HP_SMALL_TIMING_AVAIL	(0)
 #define HP_TIMING_INLINE	(0)
 typedef int hp_timing_t;
 #define HP_TIMING_NOW(var)
@@ -34,7 +32,4 @@  typedef int hp_timing_t;
 #define HP_TIMING_ACCUM_NT(Sum, Diff)
 #define HP_TIMING_PRINT(Buf, Len, Val)
 
-/* Since this implementation is not available we tell the user about it.  */
-#define HP_TIMING_NONAVAIL	1
-
 #endif	/* hp-timing.h */
diff --git a/sysdeps/ia64/hp-timing.h b/sysdeps/ia64/hp-timing.h
index 17ef023a11..2ca248a530 100644
--- a/sysdeps/ia64/hp-timing.h
+++ b/sysdeps/ia64/hp-timing.h
@@ -20,10 +20,6 @@ 
 #ifndef _HP_TIMING_H
 #define _HP_TIMING_H	1
 
-/* We always assume having the timestamp register.  */
-#define HP_TIMING_AVAIL		(1)
-#define HP_SMALL_TIMING_AVAIL	(1)
-
 /* We indeed have inlined functions.  */
 #define HP_TIMING_INLINE	(1)
 
diff --git a/sysdeps/powerpc/powerpc32/power4/hp-timing.h b/sysdeps/powerpc/powerpc32/power4/hp-timing.h
index 0d77aa0992..ed2ca9bc7a 100644
--- a/sysdeps/powerpc/powerpc32/power4/hp-timing.h
+++ b/sysdeps/powerpc/powerpc32/power4/hp-timing.h
@@ -20,10 +20,6 @@ 
 #ifndef _HP_TIMING_H
 #define _HP_TIMING_H	1
 
-/* We always assume having the timestamp register.  */
-#define HP_TIMING_AVAIL		(1)
-#define HP_SMALL_TIMING_AVAIL	(1)
-
 /* We indeed have inlined functions.  */
 #define HP_TIMING_INLINE	(1)
 
diff --git a/sysdeps/powerpc/powerpc64/hp-timing.h b/sysdeps/powerpc/powerpc64/hp-timing.h
index fb9ac1ce2a..01678cd634 100644
--- a/sysdeps/powerpc/powerpc64/hp-timing.h
+++ b/sysdeps/powerpc/powerpc64/hp-timing.h
@@ -20,10 +20,6 @@ 
 #ifndef _HP_TIMING_H
 #define _HP_TIMING_H	1
 
-/* We always assume having the timestamp register.  */
-#define HP_TIMING_AVAIL		(1)
-#define HP_SMALL_TIMING_AVAIL	(1)
-
 /* We indeed have inlined functions.  */
 #define HP_TIMING_INLINE	(1)
 
diff --git a/sysdeps/sparc/sparc32/sparcv9/hp-timing.h b/sysdeps/sparc/sparc32/sparcv9/hp-timing.h
index 6a4ab08679..3270c5b40a 100644
--- a/sysdeps/sparc/sparc32/sparcv9/hp-timing.h
+++ b/sysdeps/sparc/sparc32/sparcv9/hp-timing.h
@@ -20,8 +20,6 @@ 
 #ifndef _HP_TIMING_H
 #define _HP_TIMING_H	1
 
-#define HP_TIMING_AVAIL		(1)
-#define HP_SMALL_TIMING_AVAIL	(1)
 #define HP_TIMING_INLINE	(1)
 
 typedef unsigned long long int hp_timing_t;
diff --git a/sysdeps/sparc/sparc64/hp-timing.h b/sysdeps/sparc/sparc64/hp-timing.h
index db95c02a8d..0f02b2a98f 100644
--- a/sysdeps/sparc/sparc64/hp-timing.h
+++ b/sysdeps/sparc/sparc64/hp-timing.h
@@ -20,8 +20,6 @@ 
 #ifndef _HP_TIMING_H
 #define _HP_TIMING_H	1
 
-#define HP_TIMING_AVAIL		(1)
-#define HP_SMALL_TIMING_AVAIL	(1)
 #define HP_TIMING_INLINE	(1)
 
 typedef unsigned long int hp_timing_t;
diff --git a/sysdeps/x86/hp-timing.h b/sysdeps/x86/hp-timing.h
index 9b6a998bcd..ccb9f9250a 100644
--- a/sysdeps/x86/hp-timing.h
+++ b/sysdeps/x86/hp-timing.h
@@ -22,10 +22,6 @@ 
 #include <isa.h>
 
 #if MINIMUM_ISA == 686 || MINIMUM_ISA == 8664
-/* We always assume having the timestamp register.  */
-# define HP_TIMING_AVAIL	(1)
-# define HP_SMALL_TIMING_AVAIL	(1)
-
 /* We indeed have inlined functions.  */
 # define HP_TIMING_INLINE	(1)