[RFC,v2,1/3] target-arm: Use Neon for zero checking
diff mbox

Message ID 1460023087-31509-2-git-send-email-vijayak@caviumnetworks.com
State New
Headers show

Commit Message

vijayak@caviumnetworks.com April 7, 2016, 9:58 a.m. UTC
From: Vijay <vijayak@cavium.com>

Use Neon instructions to perform zero checking of
buffer. This helps in reducing downtime during
live migration.

Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
Signed-off-by: Suresh <ksuresh@caviumnetworks.com>
---
 util/cutils.c |   74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

Comments

Paolo Bonzini April 7, 2016, 10:30 a.m. UTC | #1
> +#elif defined __aarch64__
> +#include "arm_neon.h"
> +
> +#define NEON_VECTYPE               uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2)    (vld1q_u64(&v1) | vld1q_u64(&v2))

Why is the load and orr necessary?  Is ((v1) | (v2)) enough?

> +#define NEON_ORR(v1, v2)           ((v1) | (v2))
> +#define NEON_NOT_EQ_ZERO(v1) \
> +        ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16

Unless you have numbers saying that a 16-unroll is better than an 8-unroll
(and then you should put those in the commit message), you do not need to
duplicate code, just add aarch64 definitions for the existing code.

---

I've now read the rest of the patches, and you're adding prefetch code
that is ARM-specific.  Please provide numbers separately for each
patch, not just in the cover letter.  The cover letter is lost when the
patch is committed, while the commit messages remain.

On top of this, "With these changes, total migration time comes down
from 10 seconds to 2.5 seconds" is not a reproducible experiment.
What was the RAM size?  Was the guest just booted and idle, or was
there a workload?  What was the host?

Thanks,

Paolo

> +/*
> + * Zero page/buffer checking using SIMD(Neon)
> + */
> +
> +static bool
> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
> +                   * sizeof(NEON_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
> +}
> +
> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    size_t i;
> +    NEON_VECTYPE qword0, qword1, qword2, qword3, qword4, qword5, qword6;
> +    uint64_t const *data = buf;
> +
> +    if (!len) {
> +        return 0;
> +    }
> +
> +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
> +    len /= sizeof(unsigned long);
> +
> +    for (i = 0; i < len; i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON)
> {
> +        qword0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +        qword1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +        qword2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +        qword3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +        qword4 = NEON_ORR(qword0, qword1);
> +        qword5 = NEON_ORR(qword2, qword3);
> +        qword6 = NEON_ORR(qword4, qword5);
> +
> +        if (NEON_NOT_EQ_ZERO(qword6)) {
> +            break;
> +        }
> +    }
> +
> +    return i * sizeof(unsigned long);
> +}
> +
> +static inline bool neon_support(void)
> +{
> +    /*
> +     * Check if neon feature is supported.
> +     * By default neon is supported for aarch64.
> +     */
> +    return true;

Then everything below this function is not necessary.

Paolo

> +}
> +
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf,
> len) :
> +           can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
> +           buffer_find_nonzero_offset_inner(buf, len);
> +}
>  #else
>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>  {
> --
> 1.7.9.5
> 
>
Peter Maydell April 7, 2016, 10:44 a.m. UTC | #2
On 7 April 2016 at 11:30, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>> +#elif defined __aarch64__
>> +#include "arm_neon.h"
>> +
>> +#define NEON_VECTYPE               uint64x2_t
>> +#define NEON_LOAD_N_ORR(v1, v2)    (vld1q_u64(&v1) | vld1q_u64(&v2))
>
> Why is the load and orr necessary?  Is ((v1) | (v2)) enough?
>
>> +#define NEON_ORR(v1, v2)           ((v1) | (v2))
>> +#define NEON_NOT_EQ_ZERO(v1) \
>> +        ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
>> +
>> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
>
> Unless you have numbers saying that a 16-unroll is better than an 8-unroll
> (and then you should put those in the commit message), you do not need to
> duplicate code, just add aarch64 definitions for the existing code.

This pure-neon code is also not doing the initial short-loop to
test for non-zero buffers, which means it's not an apples-to-apples
comparison. It seems unlikely that workload balances are going
to be different on ARM vs x86 such that it's worth doing the
small loop on one but not the other. (This is also why it's helpful
to explain your benchmarking method -- the short loop will slow
things down for some cases like "large and untouched RAM", but be
faster again for cases like "large RAM of which most pages have
been dirtied".)

thanks
-- PMM
Peter Maydell April 7, 2016, 10:44 a.m. UTC | #3
On 7 April 2016 at 10:58,  <vijayak@caviumnetworks.com> wrote:
> From: Vijay <vijayak@cavium.com>
>
> Use Neon instructions to perform zero checking of
> buffer. This helps in reducing downtime during
> live migration.
>
> Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
> Signed-off-by: Suresh <ksuresh@caviumnetworks.com>
> ---
>  util/cutils.c |   74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 74 insertions(+)
>
> diff --git a/util/cutils.c b/util/cutils.c
> index 43d1afb..bb61c91 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -352,6 +352,80 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
>      return func;
>  }
>  #pragma GCC pop_options
> +
> +#elif defined __aarch64__
> +#include "arm_neon.h"
> +
> +#define NEON_VECTYPE               uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2)    (vld1q_u64(&v1) | vld1q_u64(&v2))
> +#define NEON_ORR(v1, v2)           ((v1) | (v2))
> +#define NEON_NOT_EQ_ZERO(v1) \
> +        ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16

This says 16 lots of loads of uint64x2_t...

> +    for (i = 0; i < len; i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON) {
> +        qword0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +        qword1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +        qword2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +        qword3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +        qword4 = NEON_ORR(qword0, qword1);
> +        qword5 = NEON_ORR(qword2, qword3);
> +        qword6 = NEON_ORR(qword4, qword5);

...but the loop is only loading 8 lots of uint64x2_t.


thanks
-- PMM
Richard Henderson April 9, 2016, 10:45 p.m. UTC | #4
On 04/07/2016 02:58 AM, vijayak@caviumnetworks.com wrote:
> +#elif defined __aarch64__
> +#include "arm_neon.h"

A better test is __NEON__, which asserts that neon is available at compile time 
(which will be true basically always for aarch64), and then you don't need a 
runtime test for neon.

You also get support for armv7 with neon.

> +#define NEON_VECTYPE               uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2)    (vld1q_u64(&v1) | vld1q_u64(&v2))
> +#define NEON_ORR(v1, v2)           ((v1) | (v2))
> +#define NEON_NOT_EQ_ZERO(v1) \
> +        ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))

FWIW, I think that vmaxvq_u32 would be a better reduction for aarch64. 
Extracting the individual lanes isn't as efficient as one would like.

For armv7, folding via vget_lane_u64(vget_high_u64(v1) | vget_low_u64(v1), 0) 
is probably best.


r~
Peter Maydell April 11, 2016, 10:40 a.m. UTC | #5
On 9 April 2016 at 23:45, Richard Henderson <rth@twiddle.net> wrote:
> On 04/07/2016 02:58 AM, vijayak@caviumnetworks.com wrote:
>>
>> +#elif defined __aarch64__
>> +#include "arm_neon.h"
>
>
> A better test is __NEON__, which asserts that neon is available at compile
> time (which will be true basically always for aarch64), and then you don't
> need a runime test for neon.

You don't need a runtime test for neon on aarch64 anyway, because
it will always be present.

> You also get support for armv7 with neon.

But if you do care about armv7 then you do need a runtime test,
because the defacto standard compile options are for armhf which
has FP but doesn't assume Neon.

Personally I think we should not worry about armv7 here, because
it's not actually a likely virtualization server platform, and
we shouldn't include code in QEMU we're not even compile testing.
So I think __aarch64__ here is fine.

thanks
-- PMM

Patch
diff mbox

diff --git a/util/cutils.c b/util/cutils.c
index 43d1afb..bb61c91 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -352,6 +352,80 @@  static void *can_use_buffer_find_nonzero_offset_ifunc(void)
     return func;
 }
 #pragma GCC pop_options
+
+#elif defined __aarch64__
+#include "arm_neon.h"
+
+#define NEON_VECTYPE               uint64x2_t
+#define NEON_LOAD_N_ORR(v1, v2)    (vld1q_u64(&v1) | vld1q_u64(&v2))
+#define NEON_ORR(v1, v2)           ((v1) | (v2))
+#define NEON_NOT_EQ_ZERO(v1) \
+        ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
+
+#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
+
+/*
+ * Zero page/buffer checking using SIMD(Neon)
+ */
+
+static bool
+can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
+                   * sizeof(NEON_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+    size_t i;
+    NEON_VECTYPE qword0, qword1, qword2, qword3, qword4, qword5, qword6;
+    uint64_t const *data = buf;
+
+    if (!len) {
+        return 0;
+    }
+
+    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
+    len /= sizeof(unsigned long);
+
+    for (i = 0; i < len; i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON) {
+        qword0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
+        qword1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
+        qword2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
+        qword3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
+        qword4 = NEON_ORR(qword0, qword1);
+        qword5 = NEON_ORR(qword2, qword3);
+        qword6 = NEON_ORR(qword4, qword5);
+
+        if (NEON_NOT_EQ_ZERO(qword6)) {
+            break;
+        }
+    }
+
+    return i * sizeof(unsigned long);
+}
+
+static inline bool neon_support(void)
+{
+    /*
+     * Check if neon feature is supported.
+     * By default neon is supported for aarch64.
+     */
+    return true;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
+           can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
+           buffer_find_nonzero_offset_inner(buf, len);
+}
 #else
 bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
 {