diff mbox

[PATCHv5,03/10] cutils: add a function to find non-zero content in a buffer

Message ID 1364291919-19563-4-git-send-email-pl@kamp.de
State New
Headers show

Commit Message

Peter Lieven March 26, 2013, 9:58 a.m. UTC
this adds buffer_find_nonzero_offset() which is a SSE2/Altivec
optimized function that searches for non-zero content in a
buffer.

the function starts full unrolling only after the first few chunks have
been checked one by one. analyzing real memory page data has revealed
that non-zero pages are non-zero within the first 256-512 bits in
most cases. as this function is also heavily used to check for zero memory
pages this tweak has been made to avoid the high setup costs of the fully
unrolled check for non-zero pages.

due to the optimizations used in the function there are restrictions
on buffer address and search length. the function
can_use_buffer_find_nonzero_content() can be used to check if
the function can be used safely.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 include/qemu-common.h |   13 ++++++++++++
 util/cutils.c         |   55 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

--
1.7.9.5

Comments

Juan Quintela March 26, 2013, 10:38 a.m. UTC | #1
Peter Lieven <pl@kamp.de> wrote:
> this adds buffer_find_nonzero_offset() which is a SSE2/Altivec
> optimized function that searches for non-zero content in a
> buffer.
>
> the function starts full unrolling only after the first few chunks have
> been checked one by one. analyzing real memory page data has revealed
> that non-zero pages are non-zero within the first 256-512 bits in
> most cases. as this function is also heavily used to check for zero memory
> pages this tweak has been made to avoid the high setup costs of the fully
> unrolled check for non-zero pages.
>
> due to the optimizations used in the function there are restrictions
> on buffer address and search length. the function
> can_use_buffer_find_nonzero_content() can be used to check if
> the function can be used safely.
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  include/qemu-common.h |   13 ++++++++++++
>  util/cutils.c         |   55 +++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 68 insertions(+)
>
> diff --git a/include/qemu-common.h b/include/qemu-common.h
> index 9022646..7c7c244 100644
> --- a/include/qemu-common.h
> +++ b/include/qemu-common.h
> @@ -472,4 +472,17 @@ void hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
>  #define ALL_EQ(v1, v2) ((v1) == (v2))
>  #endif
>
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
> +static inline bool
> +can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    if (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> +               * sizeof(VECTYPE)) == 0
> +        && ((uintptr_t) buf) % sizeof(VECTYPE) == 0) {
> +        return true;
> +    }
> +    return false;
> +}
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len);
> +
>  #endif
> diff --git a/util/cutils.c b/util/cutils.c
> index 1439da4..0314a18 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -143,6 +143,61 @@ int qemu_fdatasync(int fd)
>  }
>
>  /*
> + * Searches for an area with non-zero content in a buffer
> + *
> + * Attention! The len must be a multiple of
> + * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
> + * and addr must be a multiple of sizeof(VECTYPE) due to
> + * restriction of optimizations in this function.
> + *
> + * can_use_buffer_find_nonzero_offset() can be used to check
> + * these requirements.
> + *
> + * The return value is the offset of the non-zero area rounded
> + * down to a multiple of sizeof(VECTYPE) for the first
> + * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
> + * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
> + * afterwards.
> + *
> + * If the buffer is all zero the return value is equal to len.
> + */
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    VECTYPE *p = (VECTYPE *)buf;
> +    VECTYPE zero = ZERO_SPLAT;

If you have to resplit it anyways, what about changing this to:

-    VECTYPE *p = (VECTYPE *)buf;
-    VECTYPE zero = ZERO_SPLAT;
+    const VECTYPE *p = buf;
+    const VECTYPE zero = ZERO_SPLAT;
     size_t i;

From the "I hate casts" film?

Thanks, Juan.
Juan Quintela March 26, 2013, 10:41 a.m. UTC | #2
Peter Lieven <pl@kamp.de> wrote:
> this adds buffer_find_nonzero_offset() which is a SSE2/Altivec
> optimized function that searches for non-zero content in a
> buffer.
>
> the function starts full unrolling only after the first few chunks have
> been checked one by one. analyzing real memory page data has revealed
> that non-zero pages are non-zero within the first 256-512 bits in
> most cases. as this function is also heavily used to check for zero memory
> pages this tweak has been made to avoid the high setup costs of the fully
> unrolled check for non-zero pages.
>
> due to the optimizations used in the function there are restrictions
> on buffer address and search length. the function
> can_use_buffer_find_nonzero_content() can be used to check if
> the function can be used safely.
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  include/qemu-common.h |   13 ++++++++++++
>  util/cutils.c         |   55 +++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 68 insertions(+)
>
> diff --git a/include/qemu-common.h b/include/qemu-common.h
> index 9022646..7c7c244 100644
> --- a/include/qemu-common.h
> +++ b/include/qemu-common.h
> @@ -472,4 +472,17 @@ void hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
>  #define ALL_EQ(v1, v2) ((v1) == (v2))
>  #endif
>
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
> +static inline bool
> +can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    if (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> +               * sizeof(VECTYPE)) == 0
> +        && ((uintptr_t) buf) % sizeof(VECTYPE) == 0) {
> +        return true;
> +    }
> +    return false;
> +}

This can be spelled as:

return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
               * sizeof(VECTYPE)) == 0
       && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);;

But I don't care too much.
Peter Lieven March 26, 2013, 10:42 a.m. UTC | #3
Am 26.03.2013 um 11:38 schrieb Juan Quintela <quintela@redhat.com>:

> Peter Lieven <pl@kamp.de> wrote:
>> this adds buffer_find_nonzero_offset() which is a SSE2/Altivec
>> optimized function that searches for non-zero content in a
>> buffer.
>> 
>> the function starts full unrolling only after the first few chunks have
>> been checked one by one. analyzing real memory page data has revealed
>> that non-zero pages are non-zero within the first 256-512 bits in
>> most cases. as this function is also heavily used to check for zero memory
>> pages this tweak has been made to avoid the high setup costs of the fully
>> unrolled check for non-zero pages.
>> 
>> due to the optimizations used in the function there are restrictions
>> on buffer address and search length. the function
>> can_use_buffer_find_nonzero_content() can be used to check if
>> the function can be used safely.
>> 
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>> include/qemu-common.h |   13 ++++++++++++
>> util/cutils.c         |   55 +++++++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 68 insertions(+)
>> 
>> diff --git a/include/qemu-common.h b/include/qemu-common.h
>> index 9022646..7c7c244 100644
>> --- a/include/qemu-common.h
>> +++ b/include/qemu-common.h
>> @@ -472,4 +472,17 @@ void hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
>> #define ALL_EQ(v1, v2) ((v1) == (v2))
>> #endif
>> 
>> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
>> +static inline bool
>> +can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>> +{
>> +    if (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
>> +               * sizeof(VECTYPE)) == 0
>> +        && ((uintptr_t) buf) % sizeof(VECTYPE) == 0) {
>> +        return true;
>> +    }
>> +    return false;
>> +}
>> +size_t buffer_find_nonzero_offset(const void *buf, size_t len);
>> +
>> #endif
>> diff --git a/util/cutils.c b/util/cutils.c
>> index 1439da4..0314a18 100644
>> --- a/util/cutils.c
>> +++ b/util/cutils.c
>> @@ -143,6 +143,61 @@ int qemu_fdatasync(int fd)
>> }
>> 
>> /*
>> + * Searches for an area with non-zero content in a buffer
>> + *
>> + * Attention! The len must be a multiple of
>> + * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
>> + * and addr must be a multiple of sizeof(VECTYPE) due to
>> + * restriction of optimizations in this function.
>> + *
>> + * can_use_buffer_find_nonzero_offset() can be used to check
>> + * these requirements.
>> + *
>> + * The return value is the offset of the non-zero area rounded
>> + * down to a multiple of sizeof(VECTYPE) for the first
>> + * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
>> + * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
>> + * afterwards.
>> + *
>> + * If the buffer is all zero the return value is equal to len.
>> + */
>> +
>> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
>> +{
>> +    VECTYPE *p = (VECTYPE *)buf;
>> +    VECTYPE zero = ZERO_SPLAT;
> 
> If you have to resplit it anyways, what about changing this to:
> 
> -    VECTYPE *p = (VECTYPE *)buf;
> -    VECTYPE zero = ZERO_SPLAT;
> +    const VECTYPE *p = buf;
> +    const VECTYPE zero = ZERO_SPLAT;

i would change it to 

const VECTYPE zero = (VECTYPE) {0};

and drop patch 2 again.

Peter


>     size_t i;
> 
> From the "I hate casts" film?
> 
> Thanks, Juan.
diff mbox

Patch

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 9022646..7c7c244 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -472,4 +472,17 @@  void hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #define ALL_EQ(v1, v2) ((v1) == (v2))
 #endif

+#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
+static inline bool
+can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    if (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+               * sizeof(VECTYPE)) == 0
+        && ((uintptr_t) buf) % sizeof(VECTYPE) == 0) {
+        return true;
+    }
+    return false;
+}
+size_t buffer_find_nonzero_offset(const void *buf, size_t len);
+
 #endif
diff --git a/util/cutils.c b/util/cutils.c
index 1439da4..0314a18 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -143,6 +143,61 @@  int qemu_fdatasync(int fd)
 }

 /*
+ * Searches for an area with non-zero content in a buffer
+ *
+ * Attention! The len must be a multiple of
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
+ * and addr must be a multiple of sizeof(VECTYPE) due to
+ * restriction of optimizations in this function.
+ *
+ * can_use_buffer_find_nonzero_offset() can be used to check
+ * these requirements.
+ *
+ * The return value is the offset of the non-zero area rounded
+ * down to a multiple of sizeof(VECTYPE) for the first
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
+ * afterwards.
+ *
+ * If the buffer is all zero the return value is equal to len.
+ */
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    VECTYPE *p = (VECTYPE *)buf;
+    VECTYPE zero = ZERO_SPLAT;
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!ALL_EQ(p[i], zero)) {
+            return i * sizeof(VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        VECTYPE tmp0 = p[i + 0] | p[i + 1];
+        VECTYPE tmp1 = p[i + 2] | p[i + 3];
+        VECTYPE tmp2 = p[i + 4] | p[i + 5];
+        VECTYPE tmp3 = p[i + 6] | p[i + 7];
+        VECTYPE tmp01 = tmp0 | tmp1;
+        VECTYPE tmp23 = tmp2 | tmp3;
+        if (!ALL_EQ(tmp01 | tmp23, zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(VECTYPE);
+}
+
+/*
  * Checks if a buffer is all zeroes
  *
  * Attention! The len must be a multiple of 4 * sizeof(long) due to