diff mbox

[RFC,v2,1/2] utils: Add helper to read arm MIDR_EL1 register

Message ID 1471348968-4614-2-git-send-email-vijay.kilari@gmail.com
State New
Headers show

Commit Message

Vijay Kilari Aug. 16, 2016, 12:02 p.m. UTC
From: Vijaya Kumar K <Vijaya.Kumar@cavium.com>

Add helper API to read MIDR_EL1 registers to fetch
cpu identification information. This helps in
adding errata's and architecture specific features.

This is implemented only for arm architecture.

Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
---
 include/qemu/aarch64-cpuid.h |  9 +++++
 util/Makefile.objs           |  1 +
 util/aarch64-cpuid.c         | 94 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+)

Comments

Paolo Bonzini Aug. 17, 2016, 1:39 p.m. UTC | #1
On 16/08/2016 14:02, vijay.kilari@gmail.com wrote:
> From: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
> 
> Add helper API to read MIDR_EL1 registers to fetch
> cpu identification information. This helps in
> adding errata's and architecture specific features.
> 
> This is implemented only for arm architecture.
> 
> Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
> ---
>  include/qemu/aarch64-cpuid.h |  9 +++++
>  util/Makefile.objs           |  1 +
>  util/aarch64-cpuid.c         | 94 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 104 insertions(+)
> 
> diff --git a/include/qemu/aarch64-cpuid.h b/include/qemu/aarch64-cpuid.h
> new file mode 100644
> index 0000000..3c11057
> --- /dev/null
> +++ b/include/qemu/aarch64-cpuid.h
> @@ -0,0 +1,9 @@
> +#ifndef QEMU_AARCH64_CPUID_H
> +#define QEMU_AARCH64_CPUID_H
> +
> +#if defined (__aarch64__)
> +uint64_t get_aarch64_cpu_id(void);
> +bool is_thunderx_pass2_cpu(void);
> +#endif
> +
> +#endif
> diff --git a/util/Makefile.objs b/util/Makefile.objs
> index 96cb1e0..aa07bc3 100644
> --- a/util/Makefile.objs
> +++ b/util/Makefile.objs
> @@ -35,3 +35,4 @@ util-obj-y += log.o
>  util-obj-y += qdist.o
>  util-obj-y += qht.o
>  util-obj-y += range.o
> +util-obj-y += aarch64-cpuid.o
> diff --git a/util/aarch64-cpuid.c b/util/aarch64-cpuid.c
> new file mode 100644
> index 0000000..42af704
> --- /dev/null
> +++ b/util/aarch64-cpuid.c
> @@ -0,0 +1,94 @@
> +/*
> + * Dealing with arm cpu identification information.
> + *
> + * Copyright (C) 2016 Cavium, Inc.
> + *
> + * Authors:
> + *  Vijaya Kumar K <Vijaya.Kumar@cavium.com>
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2.1
> + * or later.  See the COPYING.LIB file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "qemu/cutils.h"
> +#include "qemu/aarch64-cpuid.h"
> +
> +#if defined (__aarch64__)
> +#define MIDR_IMPLEMENTER_SHIFT  24
> +#define MIDR_IMPLEMENTER_MASK   (0xffULL << MIDR_IMPLEMENTER_SHIFT)
> +#define MIDR_ARCHITECTURE_SHIFT 16
> +#define MIDR_ARCHITECTURE_MASK  (0xf << MIDR_ARCHITECTURE_SHIFT)
> +#define MIDR_PARTNUM_SHIFT      4
> +#define MIDR_PARTNUM_MASK       (0xfff << MIDR_PARTNUM_SHIFT)
> +
> +#define MIDR_CPU_PART(imp, partnum) \
> +        (((imp)                 << MIDR_IMPLEMENTER_SHIFT)  | \
> +        (0xf                    << MIDR_ARCHITECTURE_SHIFT) | \
> +        ((partnum)              << MIDR_PARTNUM_SHIFT))
> +
> +#define ARM_CPU_IMP_CAVIUM        0x43
> +#define CAVIUM_CPU_PART_THUNDERX  0x0A1
> +
> +#define MIDR_THUNDERX_PASS2  \
> +               MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
> +#define CPU_MODEL_MASK  (MIDR_IMPLEMENTER_MASK | MIDR_ARCHITECTURE_MASK | \
> +                         MIDR_PARTNUM_MASK)
> +
> +static uint64_t qemu_read_aarch64_midr_el1(void)
> +{
> +#ifdef CONFIG_LINUX
> +    const char *file = "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1";
> +    char *buf;
> +    uint64_t midr = 0;
> +
> +#define BUF_SIZE 32
> +    buf = g_malloc0(BUF_SIZE);
> +    if (!buf) {
> +        return 0;
> +    }
> +
> +    if (!g_file_get_contents(file, &buf, 0, NULL)) {
> +        goto out;
> +    }
> +
> +    if (qemu_strtoull(buf, NULL, 0, &midr) < 0) {
> +        goto out;
> +    }
> +
> +out:
> +    g_free(buf);
> +
> +    return midr;
> +#else
> +    return 0;
> +#endif
> +}
> +
> +static bool is_thunderx_cpu;
> +static uint64_t aarch64_midr_val;
> +uint64_t get_aarch64_cpu_id(void)
> +{
> +#ifdef CONFIG_LINUX
> +    static bool cpu_info_read;
> +
> +    if (unlikely(!cpu_info_read)) {
> +        aarch64_midr_val = qemu_read_aarch64_midr_el1();
> +        aarch64_midr_val &= CPU_MODEL_MASK;
> +        cpu_info_read = 1;
> +        if (aarch64_midr_val == MIDR_THUNDERX_PASS2) {
> +            is_thunderx_cpu = 1;
> +        }
> +    }
> +    return aarch64_midr_val;
> +#else
> +    return 0;
> +#endif
> +}
> +
> +bool is_thunderx_pass2_cpu(void)
> +{
> +   return is_thunderx_cpu;

This can be:

   return get_aarch64_cpu_id() == MIDR_THUNDERX_PASS2;

without the is_thunderx_cpu variable.

Paolo

> +}
> +#endif
>
Vijay Kilari Aug. 18, 2016, 7:56 a.m. UTC | #2
On Wed, Aug 17, 2016 at 7:09 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 16/08/2016 14:02, vijay.kilari@gmail.com wrote:
>> From: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
>>
>> Add helper API to read MIDR_EL1 registers to fetch
>> cpu identification information. This helps in
>> adding errata's and architecture specific features.
>>
>> This is implemented only for arm architecture.
>>
>> Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
>> ---
>>  include/qemu/aarch64-cpuid.h |  9 +++++
>>  util/Makefile.objs           |  1 +
>>  util/aarch64-cpuid.c         | 94 ++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 104 insertions(+)
>>
>> diff --git a/include/qemu/aarch64-cpuid.h b/include/qemu/aarch64-cpuid.h
>> new file mode 100644
>> index 0000000..3c11057
>> --- /dev/null
>> +++ b/include/qemu/aarch64-cpuid.h
>> @@ -0,0 +1,9 @@
>> +#ifndef QEMU_AARCH64_CPUID_H
>> +#define QEMU_AARCH64_CPUID_H
>> +
>> +#if defined (__aarch64__)
>> +uint64_t get_aarch64_cpu_id(void);
>> +bool is_thunderx_pass2_cpu(void);
>> +#endif
>> +
>> +#endif
>> diff --git a/util/Makefile.objs b/util/Makefile.objs
>> index 96cb1e0..aa07bc3 100644
>> --- a/util/Makefile.objs
>> +++ b/util/Makefile.objs
>> @@ -35,3 +35,4 @@ util-obj-y += log.o
>>  util-obj-y += qdist.o
>>  util-obj-y += qht.o
>>  util-obj-y += range.o
>> +util-obj-y += aarch64-cpuid.o
>> diff --git a/util/aarch64-cpuid.c b/util/aarch64-cpuid.c
>> new file mode 100644
>> index 0000000..42af704
>> --- /dev/null
>> +++ b/util/aarch64-cpuid.c
>> @@ -0,0 +1,94 @@
>> +/*
>> + * Dealing with arm cpu identification information.
>> + *
>> + * Copyright (C) 2016 Cavium, Inc.
>> + *
>> + * Authors:
>> + *  Vijaya Kumar K <Vijaya.Kumar@cavium.com>
>> + *
>> + * This work is licensed under the terms of the GNU LGPL, version 2.1
>> + * or later.  See the COPYING.LIB file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +#include "qemu/cutils.h"
>> +#include "qemu/aarch64-cpuid.h"
>> +
>> +#if defined (__aarch64__)
>> +#define MIDR_IMPLEMENTER_SHIFT  24
>> +#define MIDR_IMPLEMENTER_MASK   (0xffULL << MIDR_IMPLEMENTER_SHIFT)
>> +#define MIDR_ARCHITECTURE_SHIFT 16
>> +#define MIDR_ARCHITECTURE_MASK  (0xf << MIDR_ARCHITECTURE_SHIFT)
>> +#define MIDR_PARTNUM_SHIFT      4
>> +#define MIDR_PARTNUM_MASK       (0xfff << MIDR_PARTNUM_SHIFT)
>> +
>> +#define MIDR_CPU_PART(imp, partnum) \
>> +        (((imp)                 << MIDR_IMPLEMENTER_SHIFT)  | \
>> +        (0xf                    << MIDR_ARCHITECTURE_SHIFT) | \
>> +        ((partnum)              << MIDR_PARTNUM_SHIFT))
>> +
>> +#define ARM_CPU_IMP_CAVIUM        0x43
>> +#define CAVIUM_CPU_PART_THUNDERX  0x0A1
>> +
>> +#define MIDR_THUNDERX_PASS2  \
>> +               MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
>> +#define CPU_MODEL_MASK  (MIDR_IMPLEMENTER_MASK | MIDR_ARCHITECTURE_MASK | \
>> +                         MIDR_PARTNUM_MASK)
>> +
>> +static uint64_t qemu_read_aarch64_midr_el1(void)
>> +{
>> +#ifdef CONFIG_LINUX
>> +    const char *file = "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1";
>> +    char *buf;
>> +    uint64_t midr = 0;
>> +
>> +#define BUF_SIZE 32
>> +    buf = g_malloc0(BUF_SIZE);
>> +    if (!buf) {
>> +        return 0;
>> +    }
>> +
>> +    if (!g_file_get_contents(file, &buf, 0, NULL)) {
>> +        goto out;
>> +    }
>> +
>> +    if (qemu_strtoull(buf, NULL, 0, &midr) < 0) {
>> +        goto out;
>> +    }
>> +
>> +out:
>> +    g_free(buf);
>> +
>> +    return midr;
>> +#else
>> +    return 0;
>> +#endif
>> +}
>> +
>> +static bool is_thunderx_cpu;
>> +static uint64_t aarch64_midr_val;
>> +uint64_t get_aarch64_cpu_id(void)
>> +{
>> +#ifdef CONFIG_LINUX
>> +    static bool cpu_info_read;
>> +
>> +    if (unlikely(!cpu_info_read)) {
>> +        aarch64_midr_val = qemu_read_aarch64_midr_el1();
>> +        aarch64_midr_val &= CPU_MODEL_MASK;
>> +        cpu_info_read = 1;
>> +        if (aarch64_midr_val == MIDR_THUNDERX_PASS2) {
>> +            is_thunderx_cpu = 1;
>> +        }
>> +    }
>> +    return aarch64_midr_val;
>> +#else
>> +    return 0;
>> +#endif
>> +}
>> +
>> +bool is_thunderx_pass2_cpu(void)
>> +{
>> +   return is_thunderx_cpu;
>
> This can be:
>
>    return get_aarch64_cpu_id() == MIDR_THUNDERX_PASS2;
>
> without the is_thunderx_cpu variable.

The get_aarch_cpu_id() has check " if (unlikely(!cpu_info_read)) ".
If we call get_aarch_cpu_id() from is_thunderx_pass2_cpu() which is
called from inside the loop, we will be adding one additional check.

What I observed is having extra check inside the loop is adding 100 to
200ms overhead
on live migration time. So I added this variable extra is_thunderx_cpu
static variable
to make it simple single check.

>
> Paolo
>
>> +}
>> +#endif
>>
Paolo Bonzini Aug. 18, 2016, 8:50 a.m. UTC | #3
On 18/08/2016 09:56, Vijay Kilari wrote:
> The get_aarch_cpu_id() has check " if (unlikely(!cpu_info_read)) ".
> If we call get_aarch_cpu_id() from is_thunderx_pass2_cpu() which is
> called from inside the loop, we will be adding one additional check.

On the other hand, you are making an assumption that the caller of
is_thunderx_pass2_cpu() calls get_aarch64_cpu_id() first, and not
documenting it anywhere.

And given that you shouldn't call _any_ function from inside such a hot
loop, your solution is inferior on both counts.

Paolo

> What I observed is having extra check inside the loop is adding 100 to
> 200ms overhead
> on live migration time. So I added this variable extra is_thunderx_cpu
> static variable
> to make it simple single check.
Vijay Kilari Aug. 18, 2016, 9:01 a.m. UTC | #4
On Thu, Aug 18, 2016 at 2:20 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 18/08/2016 09:56, Vijay Kilari wrote:
>> The get_aarch_cpu_id() has check " if (unlikely(!cpu_info_read)) ".
>> If we call get_aarch_cpu_id() from is_thunderx_pass2_cpu() which is
>> called from inside the loop, we will be adding one additional check.
>
> On the other hand, you are making an assumption that the caller of
> is_thunderx_pass2_cpu() calls get_aarch64_cpu_id() first, and not
> documenting it anywhere.
>
> And given that you shouldn't call _any_ function from inside such a hot
> loop, your solution is inferior on both counts.

Yes, but I could not think of better way to get rid of this check. However
as Richard suggested (in another email), to drop this check and let prefetch
be called for all the arm64 architectures. But I don't have any other
arm64 platform
to check the impact of it.

>
> Paolo
>
>> What I observed is having extra check inside the loop is adding 100 to
>> 200ms overhead
>> on live migration time. So I added this variable extra is_thunderx_cpu
>> static variable
>> to make it simple single check.
>
Paolo Bonzini Aug. 18, 2016, 9:39 a.m. UTC | #5
On 18/08/2016 11:01, Vijay Kilari wrote:
> On Thu, Aug 18, 2016 at 2:20 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>> On 18/08/2016 09:56, Vijay Kilari wrote:
>>> The get_aarch_cpu_id() has check " if (unlikely(!cpu_info_read)) ".
>>> If we call get_aarch_cpu_id() from is_thunderx_pass2_cpu() which is
>>> called from inside the loop, we will be adding one additional check.
>>
>> On the other hand, you are making an assumption that the caller of
>> is_thunderx_pass2_cpu() calls get_aarch64_cpu_id() first, and not
>> documenting it anywhere.
>>
>> And given that you shouldn't call _any_ function from inside such a hot
>> loop, your solution is inferior on both counts.
> 
> Yes, but I could not think of better way to get rid of this check.

    bool need_aa64_prefetch = is_thunderx_pass2();
    for (...) {
         if (need_aa64_prefetch) {
             ...
         }
    }

The check on cpu_info_read is done just once.

Paolo

 However
> as Richard suggested (in another email), to drop this check and let prefetch
> be called for all the arm64 architectures. But I don't have any other
> arm64 platform
> to check the impact of it.
> 
>>
>> Paolo
>>
>>> What I observed is having extra check inside the loop is adding 100 to
>>> 200ms overhead
>>> on live migration time. So I added this variable extra is_thunderx_cpu
>>> static variable
>>> to make it simple single check.
>>
Richard Henderson Aug. 18, 2016, 2:04 p.m. UTC | #6
On 08/18/2016 02:39 AM, Paolo Bonzini wrote:
>
>
> On 18/08/2016 11:01, Vijay Kilari wrote:
>> On Thu, Aug 18, 2016 at 2:20 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>>
>>> On 18/08/2016 09:56, Vijay Kilari wrote:
>>>> The get_aarch_cpu_id() has check " if (unlikely(!cpu_info_read)) ".
>>>> If we call get_aarch_cpu_id() from is_thunderx_pass2_cpu() which is
>>>> called from inside the loop, we will be adding one additional check.
>>>
>>> On the other hand, you are making an assumption that the caller of
>>> is_thunderx_pass2_cpu() calls get_aarch64_cpu_id() first, and not
>>> documenting it anywhere.
>>>
>>> And given that you shouldn't call _any_ function from inside such a hot
>>> loop, your solution is inferior on both counts.
>>
>> Yes, but I could not think of better way to get rid of this check.
>
>     bool need_aa64_prefetch = is_thunderx_pass2();
>     for (...) {
>          if (need_aa64_prefetch) {
>              ...
>          }
>     }
>
> The check on cpu_info_read is done just once.

Supposing a check is required at all, this is still inferior to either

(1) If completely outside the loop,

   if (is_thunderx_pass2()) {
       for (...)
         ...
   } else {
       for (...)
   }

or (2) ifunc, so that we only check once, not every invocation.


r~
Peter Maydell Aug. 18, 2016, 2:14 p.m. UTC | #7
On 18 August 2016 at 15:04, Richard Henderson <rth@twiddle.net> wrote:
> or (2) ifunc

While we're on the subject, can somebody explain to me why we
use ifuncs at all? I couldn't work out why it would be better than
just using a straightforward function pointer -- when I tried single
stepping through things the ifunc approach still seemed to indirect
through some table or other so it wasn't actually resolving to
a direct function call anyway.

thanks
-- PMM
Richard Henderson Aug. 18, 2016, 2:46 p.m. UTC | #8
On 08/18/2016 07:14 AM, Peter Maydell wrote:
> On 18 August 2016 at 15:04, Richard Henderson <rth@twiddle.net> wrote:
>> or (2) ifunc
>
> While we're on the subject, can somebody explain to me why we
> use ifuncs at all? I couldn't work out why it would be better than
> just using a straightforward function pointer -- when I tried single
> stepping through things the ifunc approach still seemed to indirect
> through some table or other so it wasn't actually resolving to
> a direct function call anyway.

No reason, I suppose.

It's particularly helpful for libraries, where we don't really want the 
overhead of the initialization when it's not used.

But (1) we don't have many of these and (2) we really don't care *that* much 
about startup time.

So a simple function pointer initialized by a constructor has the same effect.


r~
Peter Maydell Aug. 18, 2016, 2:56 p.m. UTC | #9
On 18 August 2016 at 15:46, Richard Henderson <rth@twiddle.net> wrote:
> On 08/18/2016 07:14 AM, Peter Maydell wrote:
>> While we're on the subject, can somebody explain to me why we
>> use ifuncs at all? I couldn't work out why it would be better than
>> just using a straightforward function pointer -- when I tried single
>> stepping through things the ifunc approach still seemed to indirect
>> through some table or other so it wasn't actually resolving to
>> a direct function call anyway.

> No reason, I suppose.
>
> It's particularly helpful for libraries, where we don't really want the
> overhead of the initialization when it's not used.

Ah, I see.

> But (1) we don't have many of these and (2) we really don't care *that* much
> about startup time.
>
> So a simple function pointer initialized by a constructor has the same
> effect.

That seems like it would be a worthwhile change since
(a) I think it's easier to understand than ifunc magic
(b) it means we don't unnecessarily restrict ourselves to a libc
with ifunc support (musl libc doesn't do ifuncs, for instance)

thanks
-- PMM
Vijay Kilari Aug. 19, 2016, 9:05 a.m. UTC | #10
On Thu, Aug 18, 2016 at 8:26 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 18 August 2016 at 15:46, Richard Henderson <rth@twiddle.net> wrote:
>> On 08/18/2016 07:14 AM, Peter Maydell wrote:
>>> While we're on the subject, can somebody explain to me why we
>>> use ifuncs at all? I couldn't work out why it would be better than
>>> just using a straightforward function pointer -- when I tried single
>>> stepping through things the ifunc approach still seemed to indirect
>>> through some table or other so it wasn't actually resolving to
>>> a direct function call anyway.
>
>> No reason, I suppose.
>>
>> It's particularly helpful for libraries, where we don't really want the
>> overhead of the initialization when it's not used.
>
> Ah, I see.
>
>> But (1) we don't have many of these and (2) we really don't care *that* much
>> about startup time.
>>
>> So a simple function pointer initialized by a constructor has the same
>> effect.
>

 The cutils does not have any initialization function that can init
function/constructor pointer
for zero_check function.

Also creating separate function with most of repeated code for prefetch does
not look good. So suggest to put check for prefetch outside the for loop and
code for loop with and without prefetch

I profiled and found that a single check inside the loop is adding 100ms delay
for 8GB RAM migration. So moving check outside the loop is enough.

Ex:

   if (need_prefetch()) {

       prefetch_vector(p, 0);

        for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
             i < len / sizeof(VECTYPE);
             i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {

            prefetch_vector_loop(p, i);

            VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
            VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
            VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
            VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
           VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
           VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
            if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
                break;
            }
        }

} else {

        for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
             i < len / sizeof(VECTYPE);
             i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {

            VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
            VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
            VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
            VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
           VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
           VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
            if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
                break;
            }
        }
}

Also,  If you want to make prefetch common for all arm64 platforms,
Then thunder cache line is 128 bytes so the prefetch is performed
at 128 byte index. If the platform has 64 byte cache line, then this
prefetch will fill only 64 byte line instead of 128 bytes required for the loop.

> That seems like it would be a worthwhile change since
> (a) I think it's easier to understand than ifunc magic
> (b) it means we don't unnecessarily restrict ourselves to a libc
> with ifunc support (musl libc doesn't do ifuncs, for instance)
>
> thanks
> -- PMM
Richard Henderson Aug. 19, 2016, 2:57 p.m. UTC | #11
On 08/19/2016 02:05 AM, Vijay Kilari wrote:
> On Thu, Aug 18, 2016 at 8:26 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
>> On 18 August 2016 at 15:46, Richard Henderson <rth@twiddle.net> wrote:
>>> On 08/18/2016 07:14 AM, Peter Maydell wrote:
>>>> While we're on the subject, can somebody explain to me why we
>>>> use ifuncs at all? I couldn't work out why it would be better than
>>>> just using a straightforward function pointer -- when I tried single
>>>> stepping through things the ifunc approach still seemed to indirect
>>>> through some table or other so it wasn't actually resolving to
>>>> a direct function call anyway.
>>
>>> No reason, I suppose.
>>>
>>> It's particularly helpful for libraries, where we don't really want the
>>> overhead of the initialization when it's not used.
>>
>> Ah, I see.
>>
>>> But (1) we don't have many of these and (2) we really don't care *that* much
>>> about startup time.
>>>
>>> So a simple function pointer initialized by a constructor has the same
>>> effect.
>>
>
>  The cutils does not have any initialization function that can init
> function/constructor pointer
> for zero_check function.

static void __attribute__((constructor)) init_buffer_find_nonzero(void)
{
    ...
}

> Also creating separate function with most of repeated code for prefetch does
> not look good.

Why do you say that?

> So suggest to put check for prefetch outside the for loop and
> code for loop with and without prefetch

You're duplicating the inner loop either way, so that can't be your objection 
to creating a separate function.

> I profiled and found that a single check inside the loop is adding 100ms delay
> for 8GB RAM migration.

That's about what I expected.

> Also,  If you want to make prefetch common for all arm64 platforms,
> Then thunder cache line is 128 bytes so the prefetch is performed
> at 128 byte index. If the platform has 64 byte cache line, then this
> prefetch will fill only 64 byte line instead of 128 bytes required for the loop.

Yes, I had thought of that.

It would make sense to create two versions, that prefetch for and iterate over, 
cacheline sizes of 64 and 128 (I don't know of any other common sizes).

Preferably, we should then use sysconf(_SC_LEVEL1_DCACHE_LINESIZE) within the 
init function above to choose the appropriate version.

But I see that glibc doesn't currently implement that for aarch64, so we do 
want to have a fallback.  I know that the "official" cache line data isn't 
(easily) available to userspace, but a close proxy is the size described by 
dczid_el0.  That seems much better than groveling through a file under /sys.


r~
diff mbox

Patch

diff --git a/include/qemu/aarch64-cpuid.h b/include/qemu/aarch64-cpuid.h
new file mode 100644
index 0000000..3c11057
--- /dev/null
+++ b/include/qemu/aarch64-cpuid.h
@@ -0,0 +1,9 @@ 
+#ifndef QEMU_AARCH64_CPUID_H
+#define QEMU_AARCH64_CPUID_H
+
+#if defined (__aarch64__)
+uint64_t get_aarch64_cpu_id(void);
+bool is_thunderx_pass2_cpu(void);
+#endif
+
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 96cb1e0..aa07bc3 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -35,3 +35,4 @@  util-obj-y += log.o
 util-obj-y += qdist.o
 util-obj-y += qht.o
 util-obj-y += range.o
+util-obj-y += aarch64-cpuid.o
diff --git a/util/aarch64-cpuid.c b/util/aarch64-cpuid.c
new file mode 100644
index 0000000..42af704
--- /dev/null
+++ b/util/aarch64-cpuid.c
@@ -0,0 +1,94 @@ 
+/*
+ * Dealing with arm cpu identification information.
+ *
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * Authors:
+ *  Vijaya Kumar K <Vijaya.Kumar@cavium.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1
+ * or later.  See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/cutils.h"
+#include "qemu/aarch64-cpuid.h"
+
+#if defined (__aarch64__)
+#define MIDR_IMPLEMENTER_SHIFT  24
+#define MIDR_IMPLEMENTER_MASK   (0xffULL << MIDR_IMPLEMENTER_SHIFT)
+#define MIDR_ARCHITECTURE_SHIFT 16
+#define MIDR_ARCHITECTURE_MASK  (0xf << MIDR_ARCHITECTURE_SHIFT)
+#define MIDR_PARTNUM_SHIFT      4
+#define MIDR_PARTNUM_MASK       (0xfff << MIDR_PARTNUM_SHIFT)
+
+#define MIDR_CPU_PART(imp, partnum) \
+        (((imp)                 << MIDR_IMPLEMENTER_SHIFT)  | \
+        (0xf                    << MIDR_ARCHITECTURE_SHIFT) | \
+        ((partnum)              << MIDR_PARTNUM_SHIFT))
+
+#define ARM_CPU_IMP_CAVIUM        0x43
+#define CAVIUM_CPU_PART_THUNDERX  0x0A1
+
+#define MIDR_THUNDERX_PASS2  \
+               MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
+#define CPU_MODEL_MASK  (MIDR_IMPLEMENTER_MASK | MIDR_ARCHITECTURE_MASK | \
+                         MIDR_PARTNUM_MASK)
+
+static uint64_t qemu_read_aarch64_midr_el1(void)
+{
+#ifdef CONFIG_LINUX
+    const char *file = "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1";
+    char *buf;
+    uint64_t midr = 0;
+
+#define BUF_SIZE 32
+    buf = g_malloc0(BUF_SIZE);
+    if (!buf) {
+        return 0;
+    }
+
+    if (!g_file_get_contents(file, &buf, 0, NULL)) {
+        goto out;
+    }
+
+    if (qemu_strtoull(buf, NULL, 0, &midr) < 0) {
+        goto out;
+    }
+
+out:
+    g_free(buf);
+
+    return midr;
+#else
+    return 0;
+#endif
+}
+
+static bool is_thunderx_cpu;
+static uint64_t aarch64_midr_val;
+uint64_t get_aarch64_cpu_id(void)
+{
+#ifdef CONFIG_LINUX
+    static bool cpu_info_read;
+
+    if (unlikely(!cpu_info_read)) {
+        aarch64_midr_val = qemu_read_aarch64_midr_el1();
+        aarch64_midr_val &= CPU_MODEL_MASK;
+        cpu_info_read = 1;
+        if (aarch64_midr_val == MIDR_THUNDERX_PASS2) {
+            is_thunderx_cpu = 1;
+        }
+    }
+    return aarch64_midr_val;
+#else
+    return 0;
+#endif
+}
+
+bool is_thunderx_pass2_cpu(void)
+{
+   return is_thunderx_cpu;
+}
+#endif