diff mbox

[v2,1/2] cutils: add avx2 instruction optimization

Message ID 1447123907-26750-2-git-send-email-liang.z.li@intel.com
State New
Headers show

Commit Message

Li, Liang Z Nov. 10, 2015, 2:51 a.m. UTC
buffer_find_nonzero_offset() is a hot function during live migration.
Now it use SSE2 intructions for optimization. For platform supports
AVX2 instructions, use the AVX2 instructions for optimization can help
to improve the performance about 30% comparing to SSE2.
Zero page check can be faster with this optimization, the test result
shows that for an 8GB RAM idle guest, this patch can help to shorten
the total live migration time about 6%.

This patch use the ifunc mechanism to select the proper function when
running, for platform supports AVX2, excute the AVX2 instructions,
else, excute the original code.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 include/qemu-common.h | 28 +++++++++++++++------
 util/Makefile.objs    |  2 ++
 util/avx2.c           | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
 util/cutils.c         | 53 +++++++++++++++++++++++++++++++++++++--
 4 files changed, 143 insertions(+), 9 deletions(-)
 create mode 100644 util/avx2.c

Comments

Paolo Bonzini Nov. 12, 2015, 10:08 a.m. UTC | #1
On 10/11/2015 03:51, Liang Li wrote:
> buffer_find_nonzero_offset() is a hot function during live migration.
> Now it use SSE2 intructions for optimization. For platform supports
> AVX2 instructions, use the AVX2 instructions for optimization can help
> to improve the performance about 30% comparing to SSE2.
> Zero page check can be faster with this optimization, the test result
> shows that for an 8GB RAM idle guest, this patch can help to shorten
> the total live migration time about 6%.
> 
> This patch use the ifunc mechanism to select the proper function when
> running, for platform supports AVX2, excute the AVX2 instructions,
> else, excute the original code.
> 
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> ---
>  include/qemu-common.h | 28 +++++++++++++++------
>  util/Makefile.objs    |  2 ++
>  util/avx2.c           | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  util/cutils.c         | 53 +++++++++++++++++++++++++++++++++++++--
>  4 files changed, 143 insertions(+), 9 deletions(-)
>  create mode 100644 util/avx2.c
> 
> diff --git a/include/qemu-common.h b/include/qemu-common.h
> index 2f74540..9fa7501 100644
> --- a/include/qemu-common.h
> +++ b/include/qemu-common.h
> @@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
>  #endif
>  
>  #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
> -static inline bool
> -can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> -{
> -    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> -                   * sizeof(VECTYPE)) == 0
> -            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
> -}
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
> +
>  size_t buffer_find_nonzero_offset(const void *buf, size_t len);
>  
> +extern bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
> +
> +extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
> +
> +extern bool
> +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len);
> +
> +extern size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len);
> +
> +__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function");
> +__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function");
> +
> +
> +void *can_use_buffer_find_nonzero_offset_ifunc(void) \
> +                     __asm__("can_use_buffer_find_nonzero_offset");
> +
> +void *buffer_find_nonzero_offset_ifunc(void) \
> +                     __asm__("buffer_find_nonzero_offset");
>  /*
>   * helper to parse debug environment variables
>   */
> diff --git a/util/Makefile.objs b/util/Makefile.objs
> index d7cc399..6aacad7 100644
> --- a/util/Makefile.objs
> +++ b/util/Makefile.objs
> @@ -1,4 +1,5 @@
>  util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
> +util-obj-y += avx2.o
>  util-obj-$(CONFIG_POSIX) += compatfd.o
>  util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
>  util-obj-$(CONFIG_POSIX) += mmap-alloc.o
> @@ -29,3 +30,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
>  util-obj-y += qemu-coroutine-sleep.o
>  util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
>  util-obj-y += buffer.o
> +avx2.o-cflags      := $(AVX2_CFLAGS)
> diff --git a/util/avx2.c b/util/avx2.c
> new file mode 100644
> index 0000000..0e6915a
> --- /dev/null
> +++ b/util/avx2.c
> @@ -0,0 +1,69 @@
> +#include "qemu-common.h"
> +
> +#ifdef __AVX2__
> +#include <immintrin.h>
> +#define AVX2_VECTYPE        __m256i
> +#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> +#define AVX2_ALL_EQ(v1, v2) \
> +    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
> +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> +
> +inline bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> +                   * sizeof(AVX2_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
> +}
> +
> +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> +    const AVX2_VECTYPE *p = buf;
> +    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
> +    size_t i;
> +
> +    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
> +
> +    if (!len) {
> +        return 0;
> +    }
> +
> +    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
> +        if (!AVX2_ALL_EQ(p[i], zero)) {
> +            return i * sizeof(AVX2_VECTYPE);
> +        }
> +    }
> +
> +    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
> +         i < len / sizeof(AVX2_VECTYPE);
> +         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
> +        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
> +        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
> +        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
> +        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
> +        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
> +        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
> +        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
> +            break;
> +        }
> +    }
> +
> +    return i * sizeof(AVX2_VECTYPE);
> +}
> +
> +#else
> +/* use the original functions if avx2 is not enabled when buiding*/
> +
> +inline bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> +    return can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> +    return buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +#endif
> +
> diff --git a/util/cutils.c b/util/cutils.c
> index cfeb848..cd478ce 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -26,6 +26,7 @@
>  #include <math.h>
>  #include <limits.h>
>  #include <errno.h>
> +#include <cpuid.h>
>  
>  #include "qemu/sockets.h"
>  #include "qemu/iov.h"
> @@ -161,6 +162,54 @@ int qemu_fdatasync(int fd)
>  #endif
>  }
>  
> +/* old compiler maynot define bit_AVX2 */
> +#ifndef bit_AVX2
> +#define bit_AVX2 (1 << 5)
> +#endif
> +
> +static inline bool avx2_support(void)
> +{
> +    int a, b, c, d;
> +
> +    if (__get_cpuid_max(0, NULL) < 7) {
> +        printf("max cpuid < 7\n");
> +        return false;
> +    }
> +
> +    __cpuid_count(7, 0, a, b, c, d);
> +    printf("b = %x\n", b);
> +    return b & bit_AVX2;
> +}
> +
> +void *buffer_find_nonzero_offset_ifunc(void)
> +{
> +    printf("deciding %s\n", __func__);
> +
> +    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
> +        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
> +
> +    return func;
> +}
> +
> +void *can_use_buffer_find_nonzero_offset_ifunc(void)
> +{
> +    printf("deciding %s\n", __func__);
> +
> +    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
> +        can_use_buffer_find_nonzero_offset_avx2 :
> +        can_use_buffer_find_nonzero_offset_inner;
> +
> +    return func;
> +}
> +
> +inline bool
> +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> +                   * sizeof(VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
> +}
> +
>  /*
>   * Searches for an area with non-zero content in a buffer
>   *
> @@ -181,13 +230,13 @@ int qemu_fdatasync(int fd)
>   * If the buffer is all zero the return value is equal to len.
>   */
>  
> -size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
>  {
>      const VECTYPE *p = buf;
>      const VECTYPE zero = (VECTYPE){0};
>      size_t i;
>  
> -    assert(can_use_buffer_find_nonzero_offset(buf, len));
> +    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
>  
>      if (!len) {
>          return 0;
> 

The main issue here is that you are not testing whether the compiler 
supports gnu_indirect_function.

I suggest that you start by moving the functions to util/buffer-zero.c

Then the structure should be something like

#ifdef CONFIG_HAVE_AVX2
#include <immintrin.h>
#endif

... define buffer_find_nonzero_offset_inner ...
... define can_use_buffer_find_nonzero_offset_inner ...

#if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2
... define buffer_find_nonzero_offset_avx2 ...
... define can_use_buffer_find_nonzero_offset_avx2 ...
... define the indirect functions ...
#else
... define buffer_find_nonzero_offset that just calls buffer_find_nonzero_offset_inner ...
... define can_use_buffer_find_nonzero_offset that just calls can_use_buffer_find_nonzero_offset_inner ...
#endif

Thanks,

Paolo
Li, Liang Z Nov. 12, 2015, 10:12 a.m. UTC | #2
> 

> The main issue here is that you are not testing whether the compiler supports

> gnu_indirect_function.

> 

> I suggest that you start by moving the functions to util/buffer-zero.c

> 

> Then the structure should be something like

> 

> #ifdef CONFIG_HAVE_AVX2

> #include <immintrin.h>

> #endif

> 

> ... define buffer_find_nonzero_offset_inner ...

> ... define can_use_buffer_find_nonzero_offset_inner ...

> 

> #if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2 ...

> define buffer_find_nonzero_offset_avx2 ...

> ... define can_use_buffer_find_nonzero_offset_avx2 ...

> ... define the indirect functions ...

> #else

> ... define buffer_find_nonzero_offset that just calls

> buffer_find_nonzero_offset_inner ...

> ... define can_use_buffer_find_nonzero_offset that just calls

> can_use_buffer_find_nonzero_offset_inner ...

> #endif

> 

> Thanks,

> 

> Paolo


Got it, thanks.

Liang
Juan Quintela Nov. 12, 2015, 11:30 a.m. UTC | #3
Paolo Bonzini <pbonzini@redhat.com> wrote:

>
> The main issue here is that you are not testing whether the compiler 
> supports gnu_indirect_function.
>
> I suggest that you start by moving the functions to util/buffer-zero.c
>
> Then the structure should be something like
>
> #ifdef CONFIG_HAVE_AVX2
> #include <immintrin.h>
> #endif
>
> ... define buffer_find_nonzero_offset_inner ...
> ... define can_use_buffer_find_nonzero_offset_inner ...
>
> #if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2
> ... define buffer_find_nonzero_offset_avx2 ...
> ... define can_use_buffer_find_nonzero_offset_avx2 ...
> ... define the indirect functions ...
> #else
> ... define buffer_find_nonzero_offset that just calls
> buffer_find_nonzero_offset_inner ...
> ... define can_use_buffer_find_nonzero_offset that just calls
> can_use_buffer_find_nonzero_offset_inner ...
> #endif

My understanding for this was that glibc is better than hand made asm,
and paolo4_memzero (or whatever was it called) was the best approach.
And just remove SSE.  Have I missed something?


Later, Juan.
Richard Henderson Nov. 12, 2015, 2:43 p.m. UTC | #4
On 11/10/2015 03:51 AM, Liang Li wrote:
> +__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function");
> +__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function");
> +
> +
> +void *can_use_buffer_find_nonzero_offset_ifunc(void) \
> +                     __asm__("can_use_buffer_find_nonzero_offset");
> +
> +void *buffer_find_nonzero_offset_ifunc(void) \
> +                     __asm__("buffer_find_nonzero_offset");


Not keen on this.  You can use the ifunc attribute instead of inline asm, and 
the target attribute to enable per-function use of avx2.  And if neither are 
supported, due to compiler limitations, I don't think you should attempt to 
work around that.


r~
Li, Liang Z Nov. 13, 2015, 2:49 a.m. UTC | #5
> > This patch use the ifunc mechanism to select the proper function when

> > running, for platform supports AVX2, excute the AVX2 instructions,

> > else, excute the original code.

> >

> > Signed-off-by: Liang Li <liang.z.li@intel.com>

> > ---

> >  include/qemu-common.h | 28 +++++++++++++++------

> >  util/Makefile.objs    |  2 ++

> >  util/avx2.c           | 69

> +++++++++++++++++++++++++++++++++++++++++++++++++++

> >  util/cutils.c         | 53 +++++++++++++++++++++++++++++++++++++--

> >  4 files changed, 143 insertions(+), 9 deletions(-)  create mode

> > 100644 util/avx2.c

> >

> > diff --git a/include/qemu-common.h b/include/qemu-common.h index

> > 2f74540..9fa7501 100644

> > --- a/include/qemu-common.h

> > +++ b/include/qemu-common.h

> > @@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp,

> > const char *prefix, size_t size);  #endif

> >

> >  #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline

> > bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len)

> > -{

> > -    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR

> > -                   * sizeof(VECTYPE)) == 0

> > -            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);

> > -}

> > +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);

> > +

> >  size_t buffer_find_nonzero_offset(const void *buf, size_t len);

> >

> > +extern bool

> > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);

> > +

> > +extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t

> > +len);

> > +

> > +extern bool

> > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t

> > +len);

> > +

> > +extern size_t buffer_find_nonzero_offset_inner(const void *buf,

> > +size_t len);

> > +

> > +__asm__(".type can_use_buffer_find_nonzero_offset,

> > +\%gnu_indirect_function"); __asm__(".type buffer_find_nonzero_offset,

> > +\%gnu_indirect_function");

> > +

> > +

> > +void *can_use_buffer_find_nonzero_offset_ifunc(void) \

> > +                     __asm__("can_use_buffer_find_nonzero_offset");

> > +

> > +void *buffer_find_nonzero_offset_ifunc(void) \

> > +                     __asm__("buffer_find_nonzero_offset");

> >  /*

> >   * helper to parse debug environment variables

> >   */

> > diff --git a/util/Makefile.objs b/util/Makefile.objs index

> > d7cc399..6aacad7 100644

> > --- a/util/Makefile.objs

> > +++ b/util/Makefile.objs

> > @@ -1,4 +1,5 @@

> >  util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o

> > +util-obj-y += avx2.o

> >  util-obj-$(CONFIG_POSIX) += compatfd.o

> >  util-obj-$(CONFIG_POSIX) += event_notifier-posix.o

> >  util-obj-$(CONFIG_POSIX) += mmap-alloc.o @@ -29,3 +30,4 @@ util-obj-y

> > += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o

> > util-obj-y += qemu-coroutine-sleep.o  util-obj-y +=

> > coroutine-$(CONFIG_COROUTINE_BACKEND).o

> >  util-obj-y += buffer.o

> > +avx2.o-cflags      := $(AVX2_CFLAGS)

> > diff --git a/util/avx2.c b/util/avx2.c new file mode 100644 index

> > 0000000..0e6915a

> > --- /dev/null

> > +++ b/util/avx2.c

> > @@ -0,0 +1,69 @@

> > +#include "qemu-common.h"

> > +

> > +#ifdef __AVX2__

> > +#include <immintrin.h>

> > +#define AVX2_VECTYPE        __m256i

> > +#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))

> > +#define AVX2_ALL_EQ(v1, v2) \

> > +    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)

> > +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))

> > +

> > +inline bool

> > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)

> > +{

> > +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR

> > +                   * sizeof(AVX2_VECTYPE)) == 0

> > +            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); }

> > +

> > +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) {

> > +    const AVX2_VECTYPE *p = buf;

> > +    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};

> > +    size_t i;

> > +

> > +    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));

> > +

> > +    if (!len) {

> > +        return 0;

> > +    }

> > +

> > +    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {

> > +        if (!AVX2_ALL_EQ(p[i], zero)) {

> > +            return i * sizeof(AVX2_VECTYPE);

> > +        }

> > +    }

> > +

> > +    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;

> > +         i < len / sizeof(AVX2_VECTYPE);

> > +         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {

> > +        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);

> > +        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);

> > +        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);

> > +        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);

> > +        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);

> > +        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);

> > +        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {

> > +            break;

> > +        }

> > +    }

> > +

> > +    return i * sizeof(AVX2_VECTYPE);

> > +}

> > +

> > +#else

> > +/* use the original functions if avx2 is not enabled when buiding*/

> > +

> > +inline bool

> > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)

> > +{

> > +    return can_use_buffer_find_nonzero_offset_inner(buf, len); }

> > +

> > +inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t

> > +len) {

> > +    return buffer_find_nonzero_offset_inner(buf, len); }

> > +

> > +#endif

> > +

> > diff --git a/util/cutils.c b/util/cutils.c index cfeb848..cd478ce

> > 100644

> > --- a/util/cutils.c

> > +++ b/util/cutils.c

> > @@ -26,6 +26,7 @@

> >  #include <math.h>

> >  #include <limits.h>

> >  #include <errno.h>

> > +#include <cpuid.h>

> >

> >  #include "qemu/sockets.h"

> >  #include "qemu/iov.h"

> > @@ -161,6 +162,54 @@ int qemu_fdatasync(int fd)  #endif  }

> >

> > +/* old compiler maynot define bit_AVX2 */ #ifndef bit_AVX2 #define

> > +bit_AVX2 (1 << 5) #endif

> > +

> > +static inline bool avx2_support(void) {

> > +    int a, b, c, d;

> > +

> > +    if (__get_cpuid_max(0, NULL) < 7) {

> > +        printf("max cpuid < 7\n");

> > +        return false;

> > +    }

> > +

> > +    __cpuid_count(7, 0, a, b, c, d);

> > +    printf("b = %x\n", b);

> > +    return b & bit_AVX2;

> > +}

> > +

> > +void *buffer_find_nonzero_offset_ifunc(void)

> > +{

> > +    printf("deciding %s\n", __func__);

> > +

> > +    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?

> > +        buffer_find_nonzero_offset_avx2 :

> > + buffer_find_nonzero_offset_inner;

> > +

> > +    return func;

> > +}

> > +

> > +void *can_use_buffer_find_nonzero_offset_ifunc(void)

> > +{

> > +    printf("deciding %s\n", __func__);

> > +

> > +    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?

> > +        can_use_buffer_find_nonzero_offset_avx2 :

> > +        can_use_buffer_find_nonzero_offset_inner;

> > +

> > +    return func;

> > +}

> > +

> > +inline bool

> > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)

> > +{

> > +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR

> > +                   * sizeof(VECTYPE)) == 0

> > +            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); }

> > +

> >  /*

> >   * Searches for an area with non-zero content in a buffer

> >   *

> > @@ -181,13 +230,13 @@ int qemu_fdatasync(int fd)

> >   * If the buffer is all zero the return value is equal to len.

> >   */

> >

> > -size_t buffer_find_nonzero_offset(const void *buf, size_t len)

> > +size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)

> >  {

> >      const VECTYPE *p = buf;

> >      const VECTYPE zero = (VECTYPE){0};

> >      size_t i;

> >

> > -    assert(can_use_buffer_find_nonzero_offset(buf, len));

> > +    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));

> >

> >      if (!len) {

> >          return 0;

> >

> 

> The main issue here is that you are not testing whether the compiler supports

> gnu_indirect_function.

> 

> I suggest that you start by moving the functions to util/buffer-zero.c

> 

> Then the structure should be something like

> 

> #ifdef CONFIG_HAVE_AVX2

> #include <immintrin.h>

> #endif

> 

> ... define buffer_find_nonzero_offset_inner ...

> ... define can_use_buffer_find_nonzero_offset_inner ...


> #if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2 ...

> define buffer_find_nonzero_offset_avx2 ...

> ... define can_use_buffer_find_nonzero_offset_avx2 ...

> ... define the indirect functions ...

> #else

> ... define buffer_find_nonzero_offset that just calls

> buffer_find_nonzero_offset_inner ...

> ... define can_use_buffer_find_nonzero_offset that just calls

> can_use_buffer_find_nonzero_offset_inner ...

> #endif

> 

> Thanks,

> 

> Paolo


The buffer_find_nonzero_offset_inner  & buffer_find_nonzero_offset_avx2  can't defined in the same .c file.
Or, if the '-maxv2' is enabled, the " buffer_find_nonzero_offset_inner  ()" will be compiled to AVX2 instructions.

Liang
Paolo Bonzini Nov. 13, 2015, 9:30 a.m. UTC | #6
> > ... define buffer_find_nonzero_offset_inner ...
> > ... define can_use_buffer_find_nonzero_offset_inner ...
> 
> > #if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2 ...
> > define buffer_find_nonzero_offset_avx2 ...
> > ... define can_use_buffer_find_nonzero_offset_avx2 ...
> > ... define the indirect functions ...
> > #else
> > ... define buffer_find_nonzero_offset that just calls
> > buffer_find_nonzero_offset_inner ...
> > ... define can_use_buffer_find_nonzero_offset that just calls
> > can_use_buffer_find_nonzero_offset_inner ...
> > #endif
> > 
> > Thanks,
> > 
> > Paolo
> 
> The buffer_find_nonzero_offset_inner  & buffer_find_nonzero_offset_avx2
> can't defined in the same .c file.
> Or, if the '-maxv2' is enabled, the " buffer_find_nonzero_offset_inner  ()"
> will be compiled to AVX2 instructions.

You can use __attribute__((__target__("avx2"))) on the avx2 version,
instead of compiling the whole file with -mavx2.

Paolo
diff mbox

Patch

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 2f74540..9fa7501 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -484,15 +484,29 @@  void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
+
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
+extern bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+
+extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+
+extern bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len);
+
+extern size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len);
+
+__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function");
+__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function");
+
+
+void *can_use_buffer_find_nonzero_offset_ifunc(void) \
+                     __asm__("can_use_buffer_find_nonzero_offset");
+
+void *buffer_find_nonzero_offset_ifunc(void) \
+                     __asm__("buffer_find_nonzero_offset");
 /*
  * helper to parse debug environment variables
  */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index d7cc399..6aacad7 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -1,4 +1,5 @@ 
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
+util-obj-y += avx2.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
@@ -29,3 +30,4 @@  util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 util-obj-y += qemu-coroutine-sleep.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
+avx2.o-cflags      := $(AVX2_CFLAGS)
diff --git a/util/avx2.c b/util/avx2.c
new file mode 100644
index 0000000..0e6915a
--- /dev/null
+++ b/util/avx2.c
@@ -0,0 +1,69 @@ 
+#include "qemu-common.h"
+
+#ifdef __AVX2__
+#include <immintrin.h>
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(AVX2_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    const AVX2_VECTYPE *p = buf;
+    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!AVX2_ALL_EQ(p[i], zero)) {
+            return i * sizeof(AVX2_VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(AVX2_VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(AVX2_VECTYPE);
+}
+
+#else
+/* use the original functions if avx2 is not enabled when buiding*/
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return buffer_find_nonzero_offset_inner(buf, len);
+}
+
+#endif
+
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..cd478ce 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -26,6 +26,7 @@ 
 #include <math.h>
 #include <limits.h>
 #include <errno.h>
+#include <cpuid.h>
 
 #include "qemu/sockets.h"
 #include "qemu/iov.h"
@@ -161,6 +162,54 @@  int qemu_fdatasync(int fd)
 #endif
 }
 
+/* old compiler maynot define bit_AVX2 */
+#ifndef bit_AVX2
+#define bit_AVX2 (1 << 5)
+#endif
+
+static inline bool avx2_support(void)
+{
+    int a, b, c, d;
+
+    if (__get_cpuid_max(0, NULL) < 7) {
+        printf("max cpuid < 7\n");
+        return false;
+    }
+
+    __cpuid_count(7, 0, a, b, c, d);
+    printf("b = %x\n", b);
+    return b & bit_AVX2;
+}
+
+void *buffer_find_nonzero_offset_ifunc(void)
+{
+    printf("deciding %s\n", __func__);
+
+    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+void *can_use_buffer_find_nonzero_offset_ifunc(void)
+{
+    printf("deciding %s\n", __func__);
+
+    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        can_use_buffer_find_nonzero_offset_avx2 :
+        can_use_buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+inline bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -181,13 +230,13 @@  int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
     const VECTYPE *p = buf;
     const VECTYPE zero = (VECTYPE){0};
     size_t i;
 
-    assert(can_use_buffer_find_nonzero_offset(buf, len));
+    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
     if (!len) {
         return 0;