diff mbox

RFC: Rewrite x86-64 IFUNC selector in C

Message ID CAOVZoAOYJ7zNWRZkgEj2Pq=v=GHs2j4XkuFw8077M3vZnxBy0w@mail.gmail.com
State New
Headers show

Commit Message

Erich Elsen May 25, 2017, 9:25 p.m. UTC
Ok, I'll get started then.

Are there any general comments about the attached conversion for
memcpy?  Just so I don't repeat the same wrong thing many times.

Thanks,
Erich

On Thu, May 25, 2017 at 5:04 AM, Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
> On 25/05/2017 02:50, Siddhesh Poyarekar wrote:
>> On Thursday 25 May 2017 03:19 AM, H.J. Lu wrote:
>>> Most of time, C is easier to maintain than assembly code.  I think
>>> it is a good idea in general.  What do other people think?
>>
>> +1, C > assembly wherever possible.  Use assembly only if the compiler
>> is incapable of producing the output you want (and if that actually
>> matters) and then reevaluate every few releases.
>>
>> Siddhesh
>>
>
>> +1 as well, now we have the required infrastructure and other ports
>> are already using C implementation I see no reason to keep x86 using
>> assembly ones.

Comments

H.J. Lu May 25, 2017, 9:38 p.m. UTC | #1
On Thu, May 25, 2017 at 2:25 PM, Erich Elsen <eriche@google.com> wrote:
> Ok, I'll get started then.
>
> Are there any general comments about the attached conversion for
> memcpy?  Just so I don't repeat the same wrong thing many times.

You missed:

/* Define multiple versions only for the definition in lib and for
   DSO.  In static binaries we need memcpy before the initialization
   happened.  */
#if defined SHARED && IS_IN (libc)

+typedef void * (*memcpy_fn)(void *, const void *, size_t);
+
+extern void * __memcpy_erms(void *dest, const void *src, size_t n);
+extern void * __memcpy_sse2_unaligned(void *dest, const void *src, size_t n);
+extern void * __memcpy_sse2_unaligned_erms(void *dest, const void
*src, size_t n);
+extern void * __memcpy_ssse3(void *dest, const void *src, size_t n);
+extern void * __memcpy_ssse3_back(void *dest, const void *src, size_t n);
+extern void * __memcpy_avx_unaligned(void *dest, const void *src, size_t n);
+extern void * __memcpy_avx_unaligned_erms(void *dest, const void
*src, size_t n);
+extern void * __memcpy_avx512_unaligned(void *dest, const void *src, size_t n);
+extern void * __memcpy_avx512_unaligned_erms(void *dest, const void
*src, size_t n);

Please use something similar to multiarch/strstr.c:

/* Redefine strstr so that the compiler won't complain about the type
   mismatch with the IFUNC selector in strong_alias, below.  */
#undef  strstr
#define strstr __redirect_strstr
#include <string.h>
#undef  strstr
...
extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;

+/* Defined in cacheinfo.c */
+extern long int __x86_shared_cache_size attribute_hidden;
+extern long int __x86_shared_cache_size_half attribute_hidden;
+extern long int __x86_data_cache_size attribute_hidden;
+extern long int __x86_data_cache_size_half attribute_hidden;
+extern long int __x86_shared_non_temporal_threshold attribute_hidden;

Remove them.
static void * select_memcpy_impl(void) {
+  const struct cpu_features* cpu_features_struct_p = __get_cpu_features ();
+
+  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Prefer_ERMS)) {
+    return __memcpy_erms;
+  }
+
+  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, AVX512F_Usable)) {
+    if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Prefer_No_VZEROUPPER))
+      return __memcpy_avx512_unaligned_erms;
+    return __memcpy_avx512_unaligned;
+  }
+
+  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, AVX_Fast_Unaligned_Load)) {
+    if (CPU_FEATURES_CPU_P(cpu_features_struct_p, ERMS)) {
+      return __memcpy_avx_unaligned_erms;
+
+    }
+    return __memcpy_avx_unaligned;
+  }
+  else {
+    if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Fast_Unaligned_Copy)) {
+      if (CPU_FEATURES_CPU_P(cpu_features_struct_p, ERMS)) {
+        return __memcpy_sse2_unaligned_erms;
+
+      }
+      return __memcpy_sse2_unaligned;
+    }
+    else {
+      if (!CPU_FEATURES_CPU_P(cpu_features_struct_p, SSSE3)) {
+        return __memcpy_sse2_unaligned;
+
+      }
+      if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Fast_Copy_Backward)) {
+        return __memcpy_ssse3_back;
+
+      }
+      return __memcpy_ssse3;
+    }
+  }
+}

Please

1. Fix formatting.
2. Remove unnecessary {}.
3. Don't use "else".

+void *__new_memcpy(void *dest, const void *src, size_t n)
+  __attribute__ ((ifunc ("select_memcpy_impl")));

Use "typeof" here.
Adhemerval Zanella Netto May 25, 2017, 9:55 p.m. UTC | #2
On 25/05/2017 18:38, H.J. Lu wrote:
> On Thu, May 25, 2017 at 2:25 PM, Erich Elsen <eriche@google.com> wrote:
>> Ok, I'll get started then.
>>
>> Are there any general comments about the attached conversion for
>> memcpy?  Just so I don't repeat the same wrong thing many times.
> 
> You missed:
> 
> /* Define multiple versions only for the definition in lib and for
>    DSO.  In static binaries we need memcpy before the initialization
>    happened.  */
> #if defined SHARED && IS_IN (libc)
> 
> +typedef void * (*memcpy_fn)(void *, const void *, size_t);
> +
> +extern void * __memcpy_erms(void *dest, const void *src, size_t n);
> +extern void * __memcpy_sse2_unaligned(void *dest, const void *src, size_t n);
> +extern void * __memcpy_sse2_unaligned_erms(void *dest, const void
> *src, size_t n);
> +extern void * __memcpy_ssse3(void *dest, const void *src, size_t n);
> +extern void * __memcpy_ssse3_back(void *dest, const void *src, size_t n);
> +extern void * __memcpy_avx_unaligned(void *dest, const void *src, size_t n);
> +extern void * __memcpy_avx_unaligned_erms(void *dest, const void
> *src, size_t n);
> +extern void * __memcpy_avx512_unaligned(void *dest, const void *src, size_t n);
> +extern void * __memcpy_avx512_unaligned_erms(void *dest, const void
> *src, size_t n);
> 
> Please use something similar to multiarch/strstr.c:
> 
> /* Redefine strstr so that the compiler won't complain about the type
>    mismatch with the IFUNC selector in strong_alias, below.  */
> #undef  strstr
> #define strstr __redirect_strstr
> #include <string.h>
> #undef  strstr
> ...
> extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> 
> +/* Defined in cacheinfo.c */
> +extern long int __x86_shared_cache_size attribute_hidden;
> +extern long int __x86_shared_cache_size_half attribute_hidden;
> +extern long int __x86_data_cache_size attribute_hidden;
> +extern long int __x86_data_cache_size_half attribute_hidden;
> +extern long int __x86_shared_non_temporal_threshold attribute_hidden;

It seems it will be used not only for memcpy, so I would suggest to add
on a common header on multiarch.

> 
> Remove them.
> static void * select_memcpy_impl(void) {
> +  const struct cpu_features* cpu_features_struct_p = __get_cpu_features ();
> +
> +  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Prefer_ERMS)) {
> +    return __memcpy_erms;
> +  }
> +
> +  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, AVX512F_Usable)) {
> +    if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Prefer_No_VZEROUPPER))
> +      return __memcpy_avx512_unaligned_erms;
> +    return __memcpy_avx512_unaligned;
> +  }
> +
> +  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, AVX_Fast_Unaligned_Load)) {
> +    if (CPU_FEATURES_CPU_P(cpu_features_struct_p, ERMS)) {
> +      return __memcpy_avx_unaligned_erms;
> +
> +    }
> +    return __memcpy_avx_unaligned;
> +  }
> +  else {
> +    if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Fast_Unaligned_Copy)) {
> +      if (CPU_FEATURES_CPU_P(cpu_features_struct_p, ERMS)) {
> +        return __memcpy_sse2_unaligned_erms;
> +
> +      }
> +      return __memcpy_sse2_unaligned;
> +    }
> +    else {
> +      if (!CPU_FEATURES_CPU_P(cpu_features_struct_p, SSSE3)) {
> +        return __memcpy_sse2_unaligned;
> +
> +      }
> +      if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Fast_Copy_Backward)) {
> +        return __memcpy_ssse3_back;
> +
> +      }
> +      return __memcpy_ssse3;
> +    }
> +  }
> +}
> 
> Please
> 
> 1. Fix formatting.
> 2. Remove unnecessary {}.
> 3. Don't use "else".
> 
> +void *__new_memcpy(void *dest, const void *src, size_t n)
> +  __attribute__ ((ifunc ("select_memcpy_impl")));
> 
> Use "typeof" here.

We have the libc_ifunc{_redirect} to handle the __attribute__ ((ifunc)) support
from compiler.  I think you can use:

# include <string.h>

// extern __typeof (memcpy) __memcpy_<each supported one> attribute_hidden;

static void *memcpy_selector (void)
{
  // fill me.
}

libc_ifunc_hidden (memcpy, memcpy, memcpy_selector);
libc_hidden_def (memcpy)
diff mbox

Patch

From a2957f5a0b21f9588e8756228b11b86f886b0f4c Mon Sep 17 00:00:00 2001
From: Erich Elsen <eriche@google.com>
Date: Tue, 23 May 2017 12:29:24 -0700
Subject: [PATCH] add memcpy.c

---
 sysdeps/x86_64/multiarch/memcpy.c | 70 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/memcpy.c

diff --git a/sysdeps/x86_64/multiarch/memcpy.c b/sysdeps/x86_64/multiarch/memcpy.c
new file mode 100644
index 0000000000..b0ff8c71fd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy.c
@@ -0,0 +1,70 @@ 
+#include "cpu-features.h"
+#include "init-arch.h"
+#include "shlib-compat.h"
+#include <stdlib.h>
+
+typedef void * (*memcpy_fn)(void *, const void *, size_t);
+
+extern void * __memcpy_erms(void *dest, const void *src, size_t n);
+extern void * __memcpy_sse2_unaligned(void *dest, const void *src, size_t n);
+extern void * __memcpy_sse2_unaligned_erms(void *dest, const void *src, size_t n);
+extern void * __memcpy_ssse3(void *dest, const void *src, size_t n);
+extern void * __memcpy_ssse3_back(void *dest, const void *src, size_t n);
+extern void * __memcpy_avx_unaligned(void *dest, const void *src, size_t n);
+extern void * __memcpy_avx_unaligned_erms(void *dest, const void *src, size_t n);
+extern void * __memcpy_avx512_unaligned(void *dest, const void *src, size_t n);
+extern void * __memcpy_avx512_unaligned_erms(void *dest, const void *src, size_t n);
+
+/* Defined in cacheinfo.c */
+extern long int __x86_shared_cache_size attribute_hidden;
+extern long int __x86_shared_cache_size_half attribute_hidden;
+extern long int __x86_data_cache_size attribute_hidden;
+extern long int __x86_data_cache_size_half attribute_hidden;
+extern long int __x86_shared_non_temporal_threshold attribute_hidden;
+
+static void * select_memcpy_impl(void) {
+  const struct cpu_features* cpu_features_struct_p = __get_cpu_features ();
+
+  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Prefer_ERMS)) {
+    return __memcpy_erms;
+  }
+
+  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, AVX512F_Usable)) {
+    if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Prefer_No_VZEROUPPER))
+      return __memcpy_avx512_unaligned_erms;
+    return __memcpy_avx512_unaligned;
+  }
+
+  if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, AVX_Fast_Unaligned_Load)) {
+    if (CPU_FEATURES_CPU_P(cpu_features_struct_p, ERMS)) {
+      return __memcpy_avx_unaligned_erms;
+
+    }
+    return __memcpy_avx_unaligned;
+  }
+  else {
+    if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Fast_Unaligned_Copy)) {
+      if (CPU_FEATURES_CPU_P(cpu_features_struct_p, ERMS)) {
+        return __memcpy_sse2_unaligned_erms;
+
+      }
+      return __memcpy_sse2_unaligned;
+    }
+    else {
+      if (!CPU_FEATURES_CPU_P(cpu_features_struct_p, SSSE3)) {
+        return __memcpy_sse2_unaligned;
+
+      }
+      if (CPU_FEATURES_ARCH_P(cpu_features_struct_p, Fast_Copy_Backward)) {
+        return __memcpy_ssse3_back;
+
+      }
+      return __memcpy_ssse3;
+    }
+  }
+}
+
+void *__new_memcpy(void *dest, const void *src, size_t n)
+  __attribute__ ((ifunc ("select_memcpy_impl")));
+
+versioned_symbol(libc, __new_memcpy, memcpy, GLIBC_2_14);
-- 
2.13.0.219.gdb65acc882-goog