diff mbox

[v3,1/3] cutils: add avx2 instruction optimization

Message ID 1449576535-3369-2-git-send-email-liang.z.li@intel.com
State New
Headers show

Commit Message

Li, Liang Z Dec. 8, 2015, 12:08 p.m. UTC
buffer_find_nonzero_offset() is a hot function during live migration.
Now it use SSE2 intructions for optimization. For platform supports
AVX2 instructions, use the AVX2 instructions for optimization can help
to improve the performance about 30% comparing to SSE2.
Zero page check can be faster with this optimization, the test result
shows that for an 8GB RAM idle guest, this patch can help to shorten
the total live migration time about 6%.

This patch use the ifunc mechanism to select the proper function when
running, for platform supports AVX2, excute the AVX2 instructions,
else, excute the original code.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 include/qemu-common.h   | 13 +++++-----
 util/Makefile.objs      |  2 ++
 util/buffer-zero-avx2.c | 54 ++++++++++++++++++++++++++++++++++++++++
 util/cutils.c           | 65 +++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 125 insertions(+), 9 deletions(-)
 create mode 100644 util/buffer-zero-avx2.c

Comments

Richard Henderson Dec. 8, 2015, 4:09 p.m. UTC | #1
On 12/08/2015 04:08 AM, Liang Li wrote:
> +++ b/util/buffer-zero-avx2.c
> @@ -0,0 +1,54 @@
> +#include "qemu-common.h"
> +
> +#if defined CONFIG_IFUNC && defined CONFIG_AVX2
> +#include <immintrin.h>
> +#define AVX2_VECTYPE        __m256i
> +#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> +#define AVX2_ALL_EQ(v1, v2) \
> +    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
> +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> +
> +inline bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> +                   * sizeof(AVX2_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
> +}

I'm not keen on adding a new file for this.  You ought to be able to use
__attribute__((target("-mavx2"))) on any compiler that supports the
command-line option.  Which means you can do this all in one file with static
functions.

Nor am I keen on marking a function inline when we know it must be out-of-line
because of the ifunc usage.


r~
Li, Liang Z Dec. 9, 2015, 9:32 a.m. UTC | #2
> On 12/08/2015 04:08 AM, Liang Li wrote:
> > +++ b/util/buffer-zero-avx2.c
> > @@ -0,0 +1,54 @@
> > +#include "qemu-common.h"
> > +
> > +#if defined CONFIG_IFUNC && defined CONFIG_AVX2 #include
> > +<immintrin.h>
> > +#define AVX2_VECTYPE        __m256i
> > +#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> > +#define AVX2_ALL_EQ(v1, v2) \
> > +    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) ==
> 0xFFFFFFFF)
> > +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> > +
> > +inline bool
> > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> > +{
> > +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> > +                   * sizeof(AVX2_VECTYPE)) == 0
> > +            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); }
> 
> I'm not keen on adding a new file for this.  You ought to be able to use
> __attribute__((target("-mavx2"))) on any compiler that supports the
> command-line option.  Which means you can do this all in one file with static
> functions.
> 

I think you means the ' __attribute__((target("avx2")))', I have tried this way, the issue here is:
 without the ' -mavx2' option for gcc, there are compiling error:  '__m256i undeclared', the __attribute__((target("avx2")))
can't solve this issue.  Any idea?

If I put these avx2 Intrinsics and the sse2 Intrinsics in a single file, the sse2  Intrinsics will be compiled to the avx2 instructions, this is not we want.

> Nor am I keen on marking a function inline when we know it must be out-of-
> line because of the ifunc usage.

Inline can be removed.

Thanks 

Liang
> 
> r~
Richard Henderson Dec. 9, 2015, 2:57 p.m. UTC | #3
On 12/09/2015 01:32 AM, Li, Liang Z wrote:
> I think you means the ' __attribute__((target("avx2")))', I have tried this way, the issue here is:
>   without the ' -mavx2' option for gcc, there are compiling error:  '__m256i undeclared', the __attribute__((target("avx2")))
> can't solve this issue.  Any idea?

You're right that you can't use the normal __m256i, as it doesn't get declared. 
  But you can define the same type within the function itself.

Which is a simple matter of

   typedef long long __m256i __attribute__((vector_size(32)));

 From there, you might as well rely on other gcc extensions to instead write

    __m256i tmp0 = p[i + 0] | p[i + 1];

rather than obfuscating the code with AVX2_VEC_OR.



r~
Li, Liang Z Dec. 10, 2015, 1:10 a.m. UTC | #4
> On 12/09/2015 01:32 AM, Li, Liang Z wrote:
> > I think you means the ' __attribute__((target("avx2")))', I have tried this
> way, the issue here is:
> >   without the ' -mavx2' option for gcc, there are compiling error:
> > '__m256i undeclared', the __attribute__((target("avx2"))) can't solve this
> issue.  Any idea?
> 
> You're right that you can't use the normal __m256i, as it doesn't get declared.
>   But you can define the same type within the function itself.
> 
> Which is a simple matter of
> 
>    typedef long long __m256i __attribute__((vector_size(32)));
> 
>  From there, you might as well rely on other gcc extensions to instead write
> 
>     __m256i tmp0 = p[i + 0] | p[i + 1];
> 
> rather than obfuscating the code with AVX2_VEC_OR.
> 
>
Comparing this way to  putting the related code to a separate file, I think the latter is more simple.

Thanks
Liang
Paolo Bonzini Dec. 10, 2015, 9:03 a.m. UTC | #5
On 09/12/2015 15:57, Richard Henderson wrote:
>> I think you means the ' __attribute__((target("avx2")))', I have tried
>> this way, the issue here is:
>>   without the ' -mavx2' option for gcc, there are compiling error: 
>> '__m256i undeclared', the __attribute__((target("avx2")))
>> can't solve this issue.  Any idea?
> 
> You're right that you can't use the normal __m256i, as it doesn't get
> declared.

It should be declared.  *intrin.h uses #pragma GCC target and always
defines all vector types.

In fact, the following compiles for me with just "gcc foo.c" under
GCC 5.x:

#include <immintrin.h>

// #if defined CONFIG_IFUNC && defined CONFIG_AVX2
#pragma GCC push_options
#pragma GCC target("avx2")
#define AVX2_VECTYPE        __m256i
#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
#define AVX2_ALL_EQ(v1, v2) \
    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))

size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
{
    const AVX2_VECTYPE *p = buf;
    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
    size_t i;

    if (!len) {
        return 0;
    }

    for (i = 0; i < 4; i++) {
        if (!AVX2_ALL_EQ(p[i], zero)) {
            return i * sizeof(AVX2_VECTYPE);
        }
    }

    for (i = 4; i < len / sizeof(AVX2_VECTYPE); i += 4) {
        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
            break;
        }
    }

    return i * sizeof(AVX2_VECTYPE);
}

#pragma GCC pop_options
// #endif

so perhaps the configure test is testing the wrong thing?

Paolo
Li, Liang Z Dec. 10, 2015, 9:22 a.m. UTC | #6
> >>   without the ' -mavx2' option for gcc, there are compiling error:
> >> '__m256i undeclared', the __attribute__((target("avx2"))) can't solve
> >> this issue.  Any idea?
> >
> > You're right that you can't use the normal __m256i, as it doesn't get
> > declared.
> 
> It should be declared.  *intrin.h uses #pragma GCC target and always defines
> all vector types.
> 
> In fact, the following compiles for me with just "gcc foo.c" under GCC 5.x:
> 
> #include <immintrin.h>
> 
> // #if defined CONFIG_IFUNC && defined CONFIG_AVX2 #pragma GCC
> push_options #pragma GCC target("avx2")
> #define AVX2_VECTYPE        __m256i
> #define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> #define AVX2_ALL_EQ(v1, v2) \
>     (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
> #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> 
> size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) {
>     const AVX2_VECTYPE *p = buf;
>     const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
>     size_t i;
> 
>     if (!len) {
>         return 0;
>     }
> 
>     for (i = 0; i < 4; i++) {
>         if (!AVX2_ALL_EQ(p[i], zero)) {
>             return i * sizeof(AVX2_VECTYPE);
>         }
>     }
> 
>     for (i = 4; i < len / sizeof(AVX2_VECTYPE); i += 4) {
>         AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
>         AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
>         AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
>         AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
>         AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
>         AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
>         if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
>             break;
>         }
>     }
> 
>     return i * sizeof(AVX2_VECTYPE);
> }
> 
> #pragma GCC pop_options
> // #endif
> 
> so perhaps the configure test is testing the wrong thing?
> 
> Paolo

Hi Paolo,

what's your opinion?  putting the AVX2 related code to util/cutils.c and use the "#pragma ..." you referred?
The configure test is ok, it use the "-mavx2".

Liang
Paolo Bonzini Dec. 10, 2015, 9:51 a.m. UTC | #7
On 10/12/2015 10:22, Li, Liang Z wrote:
>>>>   without the ' -mavx2' option for gcc, there are compiling error:
>>>> '__m256i undeclared', the __attribute__((target("avx2"))) can't solve
>>>> this issue.  Any idea?
>>>
>>> You're right that you can't use the normal __m256i, as it doesn't get
>>> declared.
>>
>> It should be declared.  *intrin.h uses #pragma GCC target and always defines
>> all vector types.
>>
>> In fact, the following compiles for me with just "gcc foo.c" under GCC 5.x:
>>
>> #include <immintrin.h>
>>
>> // #if defined CONFIG_IFUNC && defined CONFIG_AVX2 #pragma GCC
>> push_options #pragma GCC target("avx2")
>> #define AVX2_VECTYPE        __m256i
>> #define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
>> #define AVX2_ALL_EQ(v1, v2) \
>>     (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
>> #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
>>
>> size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) {
>>     const AVX2_VECTYPE *p = buf;
>>     const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
>>     size_t i;
>>
>>     if (!len) {
>>         return 0;
>>     }
>>
>>     for (i = 0; i < 4; i++) {
>>         if (!AVX2_ALL_EQ(p[i], zero)) {
>>             return i * sizeof(AVX2_VECTYPE);
>>         }
>>     }
>>
>>     for (i = 4; i < len / sizeof(AVX2_VECTYPE); i += 4) {
>>         AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
>>         AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
>>         AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
>>         AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
>>         AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
>>         AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
>>         if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
>>             break;
>>         }
>>     }
>>
>>     return i * sizeof(AVX2_VECTYPE);
>> }
>>
>> #pragma GCC pop_options
>> // #endif
>>
>> so perhaps the configure test is testing the wrong thing?
>>
>> Paolo
> 
> Hi Paolo,
> 
> what's your opinion?  putting the AVX2 related code to util/cutils.c and use the "#pragma ..." you referred?

Yes, that's best.  And you can keep using __m256i if you prefer that.

Paolo
diff mbox

Patch

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 405364f..be8ba79 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -484,15 +484,14 @@  void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+bool can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+#endif
+
 /*
  * helper to parse debug environment variables
  */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 89dd80e..a130b35 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -1,4 +1,5 @@ 
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
+util-obj-$(CONFIG_AVX2) += buffer-zero-avx2.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
@@ -30,3 +31,4 @@  util-obj-y += qemu-coroutine-sleep.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
 util-obj-y += timed-average.o
+buffer-zero-avx2.o-cflags      := $(AVX2_CFLAGS)
diff --git a/util/buffer-zero-avx2.c b/util/buffer-zero-avx2.c
new file mode 100644
index 0000000..b9da0e3
--- /dev/null
+++ b/util/buffer-zero-avx2.c
@@ -0,0 +1,54 @@ 
+#include "qemu-common.h"
+
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+#include <immintrin.h>
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(AVX2_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    const AVX2_VECTYPE *p = buf;
+    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!AVX2_ALL_EQ(p[i], zero)) {
+            return i * sizeof(AVX2_VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(AVX2_VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(AVX2_VECTYPE);
+}
+
+#endif
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..3631c02 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -26,6 +26,7 @@ 
 #include <math.h>
 #include <limits.h>
 #include <errno.h>
+#include <cpuid.h>
 
 #include "qemu/sockets.h"
 #include "qemu/iov.h"
@@ -161,6 +162,14 @@  int qemu_fdatasync(int fd)
 #endif
 }
 
+static inline bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -181,13 +190,13 @@  int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
     const VECTYPE *p = buf;
     const VECTYPE zero = (VECTYPE){0};
     size_t i;
 
-    assert(can_use_buffer_find_nonzero_offset(buf, len));
+    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
     if (!len) {
         return 0;
@@ -216,6 +225,58 @@  size_t buffer_find_nonzero_offset(const void *buf, size_t len)
     return i * sizeof(VECTYPE);
 }
 
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+/* old compiler maynot define bit_AVX2 */
+#ifndef bit_AVX2
+#define bit_AVX2 (1 << 5)
+#endif
+
+static bool avx2_support(void)
+{
+    int a, b, c, d;
+
+    if (__get_cpuid_max(0, NULL) < 7) {
+        return false;
+    }
+
+    __cpuid_count(7, 0, a, b, c, d);
+    return b & bit_AVX2;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
+size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
+
+static void *buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+static void *can_use_buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        can_use_buffer_find_nonzero_offset_avx2 :
+        can_use_buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+#else
+
+inline bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return buffer_find_nonzero_offset_inner(buf, len);
+}
+#endif
+
 /*
  * Checks if a buffer is all zeroes
  *