Patchwork Introduce qemu_madvise()

login
register
mail settings
Submitter Andreas Färber
Date Sept. 11, 2010, 5:05 p.m.
Message ID <1284224755-11299-1-git-send-email-andreas.faerber@web.de>
Download mbox | patch
Permalink /patch/64526/
State New
Headers show

Comments

Andreas Färber - Sept. 11, 2010, 5:05 p.m.
From: Andreas Färber <afaerber@opensolaris.org>

vl.c has a Sun-specific hack to supply a prototype for madvise(),
but the call site has apparently moved to arch_init.c.
The underlying issue is that madvise() is not a POSIX function,
therefore Solaris' _POSIX_C_SOURCE suppresses the prototype.

Haiku doesn't implement madvise() at all.
OpenBSD however doesn't implement posix_madvise().

Check for posix_madvise() in configure and supply qemu_madvise() as wrapper.
Use qemu_madvise() in arch_init.c's ram_load().

http://www.opengroup.org/onlinepubs/9699919799/functions/posix_madvise.html

Remaining madvise() users:
exec.c: limited to __linux__ and/or MADV_MERGEABLE (no POSIX equivalent)
kvm-all.c: limited to MADV_DONTFORK (no POSIX equivalent),
           otherwise runtime error if !kvm_has_sync_mmu()
hw/virtio-balloon.c: limited to __linux__

v1 -> v2:
* Don't rely on posix_madvise() availability, add qemu_madvise().
  Suggested by Blue Swirl.

Signed-off-by: Andreas Färber <afaerber@opensolaris.org>
Cc: Blue Swirl <blauwirbel@gmail.com>
---
 arch_init.c |    2 +-
 configure   |   11 +++++++++++
 osdep.c     |   26 ++++++++++++++++++++++++++
 osdep.h     |    4 ++++
 vl.c        |    3 ---
 5 files changed, 42 insertions(+), 4 deletions(-)
Alexander Graf - Sept. 11, 2010, 9:37 p.m.
On 11.09.2010, at 19:05, Andreas Färber wrote:

> From: Andreas Färber <afaerber@opensolaris.org>
> 
> vl.c has a Sun-specific hack to supply a prototype for madvise(),
> but the call site has apparently moved to arch_init.c.
> The underlying issue is that madvise() is not a POSIX function,
> therefore Solaris' _POSIX_C_SOURCE suppresses the prototype.
> 
> Haiku doesn't implement madvise() at all.
> OpenBSD however doesn't implement posix_madvise().
> 
> Check for posix_madvise() in configure and supply qemu_madvise() as wrapper.
> Use qemu_madvise() in arch_init.c's ram_load().
> 
> http://www.opengroup.org/onlinepubs/9699919799/functions/posix_madvise.html
> 
> Remaining madvise() users:
> exec.c: limited to __linux__ and/or MADV_MERGEABLE (no POSIX equivalent)
> kvm-all.c: limited to MADV_DONTFORK (no POSIX equivalent),
>           otherwise runtime error if !kvm_has_sync_mmu()
> hw/virtio-balloon.c: limited to __linux__
> 
> v1 -> v2:
> * Don't rely on posix_madvise() availability, add qemu_madvise().
>  Suggested by Blue Swirl.
> 
> Signed-off-by: Andreas Färber <afaerber@opensolaris.org>
> Cc: Blue Swirl <blauwirbel@gmail.com>
> ---
> arch_init.c |    2 +-
> configure   |   11 +++++++++++
> osdep.c     |   26 ++++++++++++++++++++++++++
> osdep.h     |    4 ++++
> vl.c        |    3 ---
> 5 files changed, 42 insertions(+), 4 deletions(-)
> 
> diff --git a/arch_init.c b/arch_init.c
> index e468c0c..a910033 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -396,7 +396,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
> #ifndef _WIN32
>             if (ch == 0 &&
>                 (!kvm_enabled() || kvm_has_sync_mmu())) {
> -                madvise(host, TARGET_PAGE_SIZE, MADV_DONTNEED);
> +                qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);

Is this the only occurence in the whole code base? This patch should convert all users, right?

>             }
> #endif
>         } else if (flags & RAM_SAVE_FLAG_PAGE) {
> diff --git a/configure b/configure
> index 4061cb7..5e6f3e1 100755
> --- a/configure
> +++ b/configure
> @@ -2069,6 +2069,17 @@ if compile_prog "" "" ; then
> fi
> 
> ##########################################
> +# check if we have posix_madvise
> +
> +cat > $TMPC << EOF
> +#include <sys/mman.h>
> +int main(void) { posix_madvise(NULL, 0, POSIX_MADV_DONTNEED); return 0; }
> +EOF
> +if compile_prog "" "" ; then
> +    QEMU_CFLAGS="-DCONFIG_POSIX_MADVISE ${QEMU_CFLAGS}"
> +fi
> +
> +##########################################
> # check if trace backend exists
> 
> sh "$source_path/tracetool" "--$trace_backend" --check-backend > /dev/null 2> /dev/null
> diff --git a/osdep.c b/osdep.c
> index 30426ff..8c09597 100644
> --- a/osdep.c
> +++ b/osdep.c
> @@ -28,6 +28,7 @@
> #include <errno.h>
> #include <unistd.h>
> #include <fcntl.h>
> +#include <sys/mman.h>
> 
> /* Needed early for CONFIG_BSD etc. */
> #include "config-host.h"
> @@ -139,6 +140,31 @@ void qemu_vfree(void *ptr)
> 
> #endif
> 
> +int qemu_madvise(void *addr, size_t len, int advice)
> +{
> +#ifdef CONFIG_POSIX_MADVISE
> +    switch (advice) {
> +        case QEMU_MADV_DONTNEED:
> +            advice = POSIX_MADV_DONTNEED;
> +            break;
> +        default:
> +            fprintf(stderr, "Advice %d unhandled.\n", advice);

Advise :) It should probably also be 'madvise' here. Plus, I'd recommend %x as that makes the bits slightly more obvious.

> +            abort();
> +    }
> +    return posix_madvise(addr, len, advice);
> +#else
> +    switch (advice) {
> +        case QEMU_MADV_DONTNEED:
> +            advice = MADV_DONTNEED;
> +            break;
> +        default:
> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
> +            abort();
> +    }
> +    return madvise(addr, len, advice);

So what do you do on haiku where there's no madvise?

> +#endif
> +}
> +
> int qemu_create_pidfile(const char *filename)
> {
>     char buffer[128];
> diff --git a/osdep.h b/osdep.h
> index 1cdc7e2..9f0da46 100644
> --- a/osdep.h
> +++ b/osdep.h
> @@ -90,6 +90,10 @@ void *qemu_memalign(size_t alignment, size_t size);
> void *qemu_vmalloc(size_t size);
> void qemu_vfree(void *ptr);
> 
> +#define QEMU_MADV_DONTNEED 1

(1 << 0)

There are probably more to follow. I'm also not sure if it wouldn't make sense to just directly map qemu_madvise and real madvise bits. Something like

#ifdef MADV_DONTNEED
#define QEMU_MADV_DONTNEED   (1 << 0)
#else
#define QEMU_MADV_DONTNEED   0
#endif

Unless of course a flag is mandatory - but I don't think any of the madvise bits are.

> +
> +int qemu_madvise(void *addr, size_t len, int advice);
> +
> int qemu_create_pidfile(const char *filename);
> 
> #ifdef _WIN32
> diff --git a/vl.c b/vl.c
> index 3f45aa9..d352d18 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -80,9 +80,6 @@
> #include <net/if.h>
> #include <syslog.h>
> #include <stropts.h>
> -/* See MySQL bug #7156 (http://bugs.mysql.com/bug.php?id=7156) for
> -   discussion about Solaris header problems */
> -extern int madvise(caddr_t, size_t, int);

Does Solaris have posix_madvise? Otherwise it would still require this header hack, no?


Alex
Andreas Färber - Sept. 11, 2010, 10:39 p.m.
Am 11.09.2010 um 23:37 schrieb Alexander Graf:

> On 11.09.2010, at 19:05, Andreas Färber wrote:
>
>> Remaining madvise() users:
>> exec.c: limited to __linux__ and/or MADV_MERGEABLE (no POSIX  
>> equivalent)
>> kvm-all.c: limited to MADV_DONTFORK (no POSIX equivalent),
>>          otherwise runtime error if !kvm_has_sync_mmu()
>> hw/virtio-balloon.c: limited to __linux__

>> diff --git a/arch_init.c b/arch_init.c
>> index e468c0c..a910033 100644
>> --- a/arch_init.c
>> +++ b/arch_init.c
>> @@ -396,7 +396,7 @@ int ram_load(QEMUFile *f, void *opaque, int  
>> version_id)
>> #ifndef _WIN32
>>            if (ch == 0 &&
>>                (!kvm_enabled() || kvm_has_sync_mmu())) {
>> -                madvise(host, TARGET_PAGE_SIZE, MADV_DONTNEED);
>> +                qemu_madvise(host, TARGET_PAGE_SIZE,  
>> QEMU_MADV_DONTNEED);
>
> Is this the only occurence in the whole code base? This patch should  
> convert all users, right?

It's the only relevant occurrence, cf. description above.

We could in theory create QEMU_MADV_MERGEABLE and QEMU_MADV_DONTFORK  
and convert the callers, too, but what's the point when only madvise  
supports it?
Using the current #ifdef mechanism we can detect/handle it at compile- 
time; inside qemu_madvise it would be deferred to runtime. Or do you  
have a suggestion how to handle that scenario other than returning an  
error?

>> diff --git a/osdep.c b/osdep.c
>> index 30426ff..8c09597 100644
>> --- a/osdep.c
>> +++ b/osdep.c
>> @@ -139,6 +140,31 @@ void qemu_vfree(void *ptr)
>>
>> #endif
>>
>> +int qemu_madvise(void *addr, size_t len, int advice)
>> +{
>> +#ifdef CONFIG_POSIX_MADVISE
>> +    switch (advice) {
>> +        case QEMU_MADV_DONTNEED:
>> +            advice = POSIX_MADV_DONTNEED;
>> +            break;
>> +        default:
>> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
>
> Advise :)

I'd advise against that advice. ;) (*hint hint*)

> It should probably also be 'madvise' here. Plus, I'd recommend %x as  
> that makes the bits slightly more obvious.

You mean, "qemu_madvise: Advice %x ..."? Or "Advice %x for  
posix_madvise ..."?

>> +            abort();
>> +    }
>> +    return posix_madvise(addr, len, advice);
>> +#else
>> +    switch (advice) {
>> +        case QEMU_MADV_DONTNEED:
>> +            advice = MADV_DONTNEED;
>> +            break;
>> +        default:
>> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
>> +            abort();
>> +    }
>> +    return madvise(addr, len, advice);
>
> So what do you do on haiku where there's no madvise?

It should detect posix_madvise and not run into this code path, just  
like OpenSolaris.
I was hoping for Michael Lotz to resurface though and haven't retried  
myself yet.

Platforms that have neither posix_madvise nor madvise are equally  
broken before and after.

>
>> +#endif
>> +}
>> +
>> int qemu_create_pidfile(const char *filename)
>> {
>>    char buffer[128];
>> diff --git a/osdep.h b/osdep.h
>> index 1cdc7e2..9f0da46 100644
>> --- a/osdep.h
>> +++ b/osdep.h
>> @@ -90,6 +90,10 @@ void *qemu_memalign(size_t alignment, size_t  
>> size);
>> void *qemu_vmalloc(size_t size);
>> void qemu_vfree(void *ptr);
>>
>> +#define QEMU_MADV_DONTNEED 1
>
> (1 << 0)
>
> There are probably more to follow. I'm also not sure if it wouldn't  
> make sense to just directly map qemu_madvise and real madvise bits.  
> Something like
>
> #ifdef MADV_DONTNEED
> #define QEMU_MADV_DONTNEED   (1 << 0)
> #else
> #define QEMU_MADV_DONTNEED   0
> #endif
>
> Unless of course a flag is mandatory - but I don't think any of the  
> madvise bits are.

I don't follow. Something like the following?

#ifdef CONFIG_POSIX_MADVISE
#define QEMU_MADV_DONTNEED POSIX_MADV_DONTNEED
#define qemu_madvise posix_madvise
#else
#ifdef MADV_DONTNEED
#define QEMU_MADV_DONTNEED MADV_DONTNEED
#endif
#define qemu_madvise madvise
#endif

>> +
>> +int qemu_madvise(void *addr, size_t len, int advice);
>> +
>> int qemu_create_pidfile(const char *filename);
>>
>> #ifdef _WIN32
>> diff --git a/vl.c b/vl.c
>> index 3f45aa9..d352d18 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -80,9 +80,6 @@
>> #include <net/if.h>
>> #include <syslog.h>
>> #include <stropts.h>
>> -/* See MySQL bug #7156 (http://bugs.mysql.com/bug.php?id=7156) for
>> -   discussion about Solaris header problems */
>> -extern int madvise(caddr_t, size_t, int);
>
> Does Solaris have posix_madvise? Otherwise it would still require  
> this header hack, no?

I tested on OpenSolaris 2009.06. Haven't tested on Solaris 10 yet,  
don't have access to older ones.
We could wrap it in #ifndef CONFIG_POSIX_MADVISE to be fool-safe.

Thanks for the review,

Andreas
Alexander Graf - Sept. 11, 2010, 10:47 p.m.
On 12.09.2010, at 00:39, Andreas Färber wrote:

> Am 11.09.2010 um 23:37 schrieb Alexander Graf:
> 
>> On 11.09.2010, at 19:05, Andreas Färber wrote:
>> 
>>> Remaining madvise() users:
>>> exec.c: limited to __linux__ and/or MADV_MERGEABLE (no POSIX equivalent)
>>> kvm-all.c: limited to MADV_DONTFORK (no POSIX equivalent),
>>>         otherwise runtime error if !kvm_has_sync_mmu()
>>> hw/virtio-balloon.c: limited to __linux__
> 
>>> diff --git a/arch_init.c b/arch_init.c
>>> index e468c0c..a910033 100644
>>> --- a/arch_init.c
>>> +++ b/arch_init.c
>>> @@ -396,7 +396,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
>>> #ifndef _WIN32
>>>           if (ch == 0 &&
>>>               (!kvm_enabled() || kvm_has_sync_mmu())) {
>>> -                madvise(host, TARGET_PAGE_SIZE, MADV_DONTNEED);
>>> +                qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
>> 
>> Is this the only occurence in the whole code base? This patch should convert all users, right?
> 
> It's the only relevant occurrence, cf. description above.
> 
> We could in theory create QEMU_MADV_MERGEABLE and QEMU_MADV_DONTFORK and convert the callers, too, but what's the point when only madvise supports it?
> Using the current #ifdef mechanism we can detect/handle it at compile-time; inside qemu_madvise it would be deferred to runtime. Or do you have a suggestion how to handle that scenario other than returning an error?

Hrm. I'm not sure. Do we have to fail madvise at all?

> 
>>> diff --git a/osdep.c b/osdep.c
>>> index 30426ff..8c09597 100644
>>> --- a/osdep.c
>>> +++ b/osdep.c
>>> @@ -139,6 +140,31 @@ void qemu_vfree(void *ptr)
>>> 
>>> #endif
>>> 
>>> +int qemu_madvise(void *addr, size_t len, int advice)
>>> +{
>>> +#ifdef CONFIG_POSIX_MADVISE
>>> +    switch (advice) {
>>> +        case QEMU_MADV_DONTNEED:
>>> +            advice = POSIX_MADV_DONTNEED;
>>> +            break;
>>> +        default:
>>> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
>> 
>> Advise :)
> 
> I'd advise against that advice. ;) (*hint hint*)
> 
>> It should probably also be 'madvise' here. Plus, I'd recommend %x as that makes the bits slightly more obvious.
> 
> You mean, "qemu_madvise: Advice %x ..."? Or "Advice %x for posix_madvise ..."?

I'd remove the "Advice" part: "qemu_madvise: Flag %x unknown".

> 
>>> +            abort();
>>> +    }
>>> +    return posix_madvise(addr, len, advice);
>>> +#else
>>> +    switch (advice) {
>>> +        case QEMU_MADV_DONTNEED:
>>> +            advice = MADV_DONTNEED;
>>> +            break;
>>> +        default:
>>> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
>>> +            abort();
>>> +    }
>>> +    return madvise(addr, len, advice);
>> 
>> So what do you do on haiku where there's no madvise?
> 
> It should detect posix_madvise and not run into this code path, just like OpenSolaris.
> I was hoping for Michael Lotz to resurface though and haven't retried myself yet.

Oh, so OpenSolaris and Haiku have posix_madvise? Nice.

> 
> Platforms that have neither posix_madvise nor madvise are equally broken before and after.

Sure :).

> 
>> 
>>> +#endif
>>> +}
>>> +
>>> int qemu_create_pidfile(const char *filename)
>>> {
>>>   char buffer[128];
>>> diff --git a/osdep.h b/osdep.h
>>> index 1cdc7e2..9f0da46 100644
>>> --- a/osdep.h
>>> +++ b/osdep.h
>>> @@ -90,6 +90,10 @@ void *qemu_memalign(size_t alignment, size_t size);
>>> void *qemu_vmalloc(size_t size);
>>> void qemu_vfree(void *ptr);
>>> 
>>> +#define QEMU_MADV_DONTNEED 1
>> 
>> (1 << 0)
>> 
>> There are probably more to follow. I'm also not sure if it wouldn't make sense to just directly map qemu_madvise and real madvise bits. Something like
>> 
>> #ifdef MADV_DONTNEED
>> #define QEMU_MADV_DONTNEED   (1 << 0)
>> #else
>> #define QEMU_MADV_DONTNEED   0
>> #endif
>> 
>> Unless of course a flag is mandatory - but I don't think any of the madvise bits are.
> 
> I don't follow. Something like the following?
> 
> #ifdef CONFIG_POSIX_MADVISE
> #define QEMU_MADV_DONTNEED POSIX_MADV_DONTNEED
> #define qemu_madvise posix_madvise
> #else
> #ifdef MADV_DONTNEED
> #define QEMU_MADV_DONTNEED MADV_DONTNEED
> #endif
> #define qemu_madvise madvise
> #endif

This could work, though in general preprocessor magic is ... too magic. I was thinking:

#ifdef CONFIG_POSIX_MADVISE
#define QEMU_MADV_DONTNEED POSIX_MADV_DONTNEED
#else
#define QEMU_MADV_DONTNEED MADV_DONTNEED
#endif

and keep the qemu_madvise implementation in C. But then there's no need to convert between QEMU_MADV and real MADV bits.

> 
>>> +
>>> +int qemu_madvise(void *addr, size_t len, int advice);
>>> +
>>> int qemu_create_pidfile(const char *filename);
>>> 
>>> #ifdef _WIN32
>>> diff --git a/vl.c b/vl.c
>>> index 3f45aa9..d352d18 100644
>>> --- a/vl.c
>>> +++ b/vl.c
>>> @@ -80,9 +80,6 @@
>>> #include <net/if.h>
>>> #include <syslog.h>
>>> #include <stropts.h>
>>> -/* See MySQL bug #7156 (http://bugs.mysql.com/bug.php?id=7156) for
>>> -   discussion about Solaris header problems */
>>> -extern int madvise(caddr_t, size_t, int);
>> 
>> Does Solaris have posix_madvise? Otherwise it would still require this header hack, no?
> 
> I tested on OpenSolaris 2009.06. Haven't tested on Solaris 10 yet, don't have access to older ones.
> We could wrap it in #ifndef CONFIG_POSIX_MADVISE to be fool-safe.

*shrug* I was just afraid this was there for a reason. But maybe the author of those lines simply didn't know about posix_madvise.

> Thanks for the review,

You're welcome :)


Alex
Blue Swirl - Sept. 12, 2010, 7:20 a.m.
On Sat, Sep 11, 2010 at 5:05 PM, Andreas Färber <andreas.faerber@web.de> wrote:
> From: Andreas Färber <afaerber@opensolaris.org>
>
> vl.c has a Sun-specific hack to supply a prototype for madvise(),
> but the call site has apparently moved to arch_init.c.
> The underlying issue is that madvise() is not a POSIX function,
> therefore Solaris' _POSIX_C_SOURCE suppresses the prototype.
>
> Haiku doesn't implement madvise() at all.
> OpenBSD however doesn't implement posix_madvise().
>
> Check for posix_madvise() in configure and supply qemu_madvise() as wrapper.
> Use qemu_madvise() in arch_init.c's ram_load().
>
> http://www.opengroup.org/onlinepubs/9699919799/functions/posix_madvise.html
>
> Remaining madvise() users:
> exec.c: limited to __linux__ and/or MADV_MERGEABLE (no POSIX equivalent)
> kvm-all.c: limited to MADV_DONTFORK (no POSIX equivalent),
>           otherwise runtime error if !kvm_has_sync_mmu()
> hw/virtio-balloon.c: limited to __linux__

For consistency, I'd convert all users. If an OS doesn't support a
flag, we can return something like -ENOTSUP which may be checked by
the caller if it cares.

>
> v1 -> v2:
> * Don't rely on posix_madvise() availability, add qemu_madvise().
>  Suggested by Blue Swirl.
>
> Signed-off-by: Andreas Färber <afaerber@opensolaris.org>
> Cc: Blue Swirl <blauwirbel@gmail.com>
> ---
>  arch_init.c |    2 +-
>  configure   |   11 +++++++++++
>  osdep.c     |   26 ++++++++++++++++++++++++++
>  osdep.h     |    4 ++++
>  vl.c        |    3 ---
>  5 files changed, 42 insertions(+), 4 deletions(-)
>
> diff --git a/arch_init.c b/arch_init.c
> index e468c0c..a910033 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -396,7 +396,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
>  #ifndef _WIN32
>             if (ch == 0 &&
>                 (!kvm_enabled() || kvm_has_sync_mmu())) {
> -                madvise(host, TARGET_PAGE_SIZE, MADV_DONTNEED);
> +                qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
>             }
>  #endif
>         } else if (flags & RAM_SAVE_FLAG_PAGE) {
> diff --git a/configure b/configure
> index 4061cb7..5e6f3e1 100755
> --- a/configure
> +++ b/configure
> @@ -2069,6 +2069,17 @@ if compile_prog "" "" ; then
>  fi
>
>  ##########################################
> +# check if we have posix_madvise
> +
> +cat > $TMPC << EOF
> +#include <sys/mman.h>
> +int main(void) { posix_madvise(NULL, 0, POSIX_MADV_DONTNEED); return 0; }
> +EOF
> +if compile_prog "" "" ; then
> +    QEMU_CFLAGS="-DCONFIG_POSIX_MADVISE ${QEMU_CFLAGS}"

Please take a look around configure:2444 how to add new config flags.
I'd also add a similar check for madvise() if posix_madvise() check
fails.

> +fi
> +
> +##########################################
>  # check if trace backend exists
>
>  sh "$source_path/tracetool" "--$trace_backend" --check-backend > /dev/null 2> /dev/null
> diff --git a/osdep.c b/osdep.c
> index 30426ff..8c09597 100644
> --- a/osdep.c
> +++ b/osdep.c
> @@ -28,6 +28,7 @@
>  #include <errno.h>
>  #include <unistd.h>
>  #include <fcntl.h>
> +#include <sys/mman.h>
>
>  /* Needed early for CONFIG_BSD etc. */
>  #include "config-host.h"
> @@ -139,6 +140,31 @@ void qemu_vfree(void *ptr)
>
>  #endif
>
> +int qemu_madvise(void *addr, size_t len, int advice)
> +{
> +#ifdef CONFIG_POSIX_MADVISE
> +    switch (advice) {
> +        case QEMU_MADV_DONTNEED:
> +            advice = POSIX_MADV_DONTNEED;
> +            break;
> +        default:
> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
> +            abort();

This should be an assert, it's an internal error. It's also common to
both cases.

> +    }
> +    return posix_madvise(addr, len, advice);
> +#else

#elif defined(CONFIG_MADVISE)

> +    switch (advice) {
> +        case QEMU_MADV_DONTNEED:
> +            advice = MADV_DONTNEED;
> +            break;

case QEMU_MADV_DONTFORK:
#ifndef MADV_DONTFORK
return -ENOTSUP;
#else
advice = MADV_DONTFORK;
break;
#endif

Same goes for MADV_MERGEABLE.

> +        default:
> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
> +            abort();
> +    }
> +    return madvise(addr, len, advice);

#else
return -ENOTSUP;
Andreas Färber - Sept. 12, 2010, 8:50 a.m.
Am 12.09.2010 um 09:20 schrieb Blue Swirl:

> On Sat, Sep 11, 2010 at 5:05 PM, Andreas Färber <andreas.faerber@web.de 
> > wrote:
>> Use qemu_madvise() in arch_init.c's ram_load().
>>
>> http://www.opengroup.org/onlinepubs/9699919799/functions/posix_madvise.html
>>
>> Remaining madvise() users:
>> exec.c: limited to __linux__ and/or MADV_MERGEABLE (no POSIX  
>> equivalent)
>> kvm-all.c: limited to MADV_DONTFORK (no POSIX equivalent),
>>           otherwise runtime error if !kvm_has_sync_mmu()
>> hw/virtio-balloon.c: limited to __linux__
>
> For consistency, I'd convert all users. If an OS doesn't support a
> flag, we can return something like -ENOTSUP which may be checked by
> the caller if it cares.

Will do.

>> diff --git a/configure b/configure
>> index 4061cb7..5e6f3e1 100755
>> --- a/configure
>> +++ b/configure
>> @@ -2069,6 +2069,17 @@ if compile_prog "" "" ; then
>>  fi
>>
>>  ##########################################
>> +# check if we have posix_madvise
>> +
>> +cat > $TMPC << EOF
>> +#include <sys/mman.h>
>> +int main(void) { posix_madvise(NULL, 0, POSIX_MADV_DONTNEED);  
>> return 0; }
>> +EOF
>> +if compile_prog "" "" ; then
>> +    QEMU_CFLAGS="-DCONFIG_POSIX_MADVISE ${QEMU_CFLAGS}"
>
> Please take a look around configure:2444 how to add new config flags.

I did. It's just not obvious that they find their way from the  
Makefile to a C header, unlike autoconf.

> I'd also add a similar check for madvise() if posix_madvise() check
> fails.

Fine with me.

>> diff --git a/osdep.c b/osdep.c
>> index 30426ff..8c09597 100644
>> --- a/osdep.c
>> +++ b/osdep.c
>> @@ -28,6 +28,7 @@
>>  #include <errno.h>
>>  #include <unistd.h>
>>  #include <fcntl.h>
>> +#include <sys/mman.h>
>>
>>  /* Needed early for CONFIG_BSD etc. */
>>  #include "config-host.h"
>> @@ -139,6 +140,31 @@ void qemu_vfree(void *ptr)
>>
>>  #endif
>>
>> +int qemu_madvise(void *addr, size_t len, int advice)
>> +{
>> +#ifdef CONFIG_POSIX_MADVISE
>> +    switch (advice) {
>> +        case QEMU_MADV_DONTNEED:
>> +            advice = POSIX_MADV_DONTNEED;
>> +            break;
>> +        default:
>> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
>> +            abort();
>
> This should be an assert, it's an internal error. It's also common to
> both cases.
>
>> +    }
>> +    return posix_madvise(addr, len, advice);
>> +#else
>
> #elif defined(CONFIG_MADVISE)
>
>> +    switch (advice) {
>> +        case QEMU_MADV_DONTNEED:
>> +            advice = MADV_DONTNEED;
>> +            break;
>
> case QEMU_MADV_DONTFORK:
> #ifndef MADV_DONTFORK
> return -ENOTSUP;
> #else
> advice = MADV_DONTFORK;
> break;
> #endif
>
> Same goes for MADV_MERGEABLE.

So you disagree with or didn't yet read Alex' suggestion of  
eliminating this mapping code in qemu_madvise() altogether?
Doing that in a sensible way would allow code to do:

#ifdef QEMU_MADV_MERGEABLE
...

as before at compile-time. The only qemu_madvise() error condition  
would then be the #else below.

>> +        default:
>> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
>> +            abort();
>> +    }
>> +    return madvise(addr, len, advice);
>
> #else
> return -ENOTSUP;

Thanks,

Andreas
Blue Swirl - Sept. 12, 2010, 9:02 a.m.
On Sun, Sep 12, 2010 at 8:50 AM, Andreas Färber <andreas.faerber@web.de> wrote:
> Am 12.09.2010 um 09:20 schrieb Blue Swirl:
>
>> On Sat, Sep 11, 2010 at 5:05 PM, Andreas Färber <andreas.faerber@web.de>
>> wrote:
>>>
>>> Use qemu_madvise() in arch_init.c's ram_load().
>>>
>>>
>>> http://www.opengroup.org/onlinepubs/9699919799/functions/posix_madvise.html
>>>
>>> Remaining madvise() users:
>>> exec.c: limited to __linux__ and/or MADV_MERGEABLE (no POSIX equivalent)
>>> kvm-all.c: limited to MADV_DONTFORK (no POSIX equivalent),
>>>          otherwise runtime error if !kvm_has_sync_mmu()
>>> hw/virtio-balloon.c: limited to __linux__
>>
>> For consistency, I'd convert all users. If an OS doesn't support a
>> flag, we can return something like -ENOTSUP which may be checked by
>> the caller if it cares.
>
> Will do.
>
>>> diff --git a/configure b/configure
>>> index 4061cb7..5e6f3e1 100755
>>> --- a/configure
>>> +++ b/configure
>>> @@ -2069,6 +2069,17 @@ if compile_prog "" "" ; then
>>>  fi
>>>
>>>  ##########################################
>>> +# check if we have posix_madvise
>>> +
>>> +cat > $TMPC << EOF
>>> +#include <sys/mman.h>
>>> +int main(void) { posix_madvise(NULL, 0, POSIX_MADV_DONTNEED); return 0;
>>> }
>>> +EOF
>>> +if compile_prog "" "" ; then
>>> +    QEMU_CFLAGS="-DCONFIG_POSIX_MADVISE ${QEMU_CFLAGS}"
>>
>> Please take a look around configure:2444 how to add new config flags.
>
> I did. It's just not obvious that they find their way from the Makefile to a
> C header, unlike autoconf.
>
>> I'd also add a similar check for madvise() if posix_madvise() check
>> fails.
>
> Fine with me.
>
>>> diff --git a/osdep.c b/osdep.c
>>> index 30426ff..8c09597 100644
>>> --- a/osdep.c
>>> +++ b/osdep.c
>>> @@ -28,6 +28,7 @@
>>>  #include <errno.h>
>>>  #include <unistd.h>
>>>  #include <fcntl.h>
>>> +#include <sys/mman.h>
>>>
>>>  /* Needed early for CONFIG_BSD etc. */
>>>  #include "config-host.h"
>>> @@ -139,6 +140,31 @@ void qemu_vfree(void *ptr)
>>>
>>>  #endif
>>>
>>> +int qemu_madvise(void *addr, size_t len, int advice)
>>> +{
>>> +#ifdef CONFIG_POSIX_MADVISE
>>> +    switch (advice) {
>>> +        case QEMU_MADV_DONTNEED:
>>> +            advice = POSIX_MADV_DONTNEED;
>>> +            break;
>>> +        default:
>>> +            fprintf(stderr, "Advice %d unhandled.\n", advice);
>>> +            abort();
>>
>> This should be an assert, it's an internal error. It's also common to
>> both cases.
>>
>>> +    }
>>> +    return posix_madvise(addr, len, advice);
>>> +#else
>>
>> #elif defined(CONFIG_MADVISE)
>>
>>> +    switch (advice) {
>>> +        case QEMU_MADV_DONTNEED:
>>> +            advice = MADV_DONTNEED;
>>> +            break;
>>
>> case QEMU_MADV_DONTFORK:
>> #ifndef MADV_DONTFORK
>> return -ENOTSUP;
>> #else
>> advice = MADV_DONTFORK;
>> break;
>> #endif
>>
>> Same goes for MADV_MERGEABLE.
>
> So you disagree with or didn't yet read Alex' suggestion of eliminating this
> mapping code in qemu_madvise() altogether?
> Doing that in a sensible way would allow code to do:
>
> #ifdef QEMU_MADV_MERGEABLE
> ...
>
> as before at compile-time. The only qemu_madvise() error condition would
> then be the #else below.

That's one way too, if nobody cares about madvise() return values for
MADV_MERGEABLE. In any case I'd like to eliminate the #ifdefs in other
places than osdep.[ch].

Patch

diff --git a/arch_init.c b/arch_init.c
index e468c0c..a910033 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -396,7 +396,7 @@  int ram_load(QEMUFile *f, void *opaque, int version_id)
 #ifndef _WIN32
             if (ch == 0 &&
                 (!kvm_enabled() || kvm_has_sync_mmu())) {
-                madvise(host, TARGET_PAGE_SIZE, MADV_DONTNEED);
+                qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
             }
 #endif
         } else if (flags & RAM_SAVE_FLAG_PAGE) {
diff --git a/configure b/configure
index 4061cb7..5e6f3e1 100755
--- a/configure
+++ b/configure
@@ -2069,6 +2069,17 @@  if compile_prog "" "" ; then
 fi
 
 ##########################################
+# check if we have posix_madvise
+
+cat > $TMPC << EOF
+#include <sys/mman.h>
+int main(void) { posix_madvise(NULL, 0, POSIX_MADV_DONTNEED); return 0; }
+EOF
+if compile_prog "" "" ; then
+    QEMU_CFLAGS="-DCONFIG_POSIX_MADVISE ${QEMU_CFLAGS}"
+fi
+
+##########################################
 # check if trace backend exists
 
 sh "$source_path/tracetool" "--$trace_backend" --check-backend > /dev/null 2> /dev/null
diff --git a/osdep.c b/osdep.c
index 30426ff..8c09597 100644
--- a/osdep.c
+++ b/osdep.c
@@ -28,6 +28,7 @@ 
 #include <errno.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <sys/mman.h>
 
 /* Needed early for CONFIG_BSD etc. */
 #include "config-host.h"
@@ -139,6 +140,31 @@  void qemu_vfree(void *ptr)
 
 #endif
 
+int qemu_madvise(void *addr, size_t len, int advice)
+{
+#ifdef CONFIG_POSIX_MADVISE
+    switch (advice) {
+        case QEMU_MADV_DONTNEED:
+            advice = POSIX_MADV_DONTNEED;
+            break;
+        default:
+            fprintf(stderr, "Advice %d unhandled.\n", advice);
+            abort();
+    }
+    return posix_madvise(addr, len, advice);
+#else
+    switch (advice) {
+        case QEMU_MADV_DONTNEED:
+            advice = MADV_DONTNEED;
+            break;
+        default:
+            fprintf(stderr, "Advice %d unhandled.\n", advice);
+            abort();
+    }
+    return madvise(addr, len, advice);
+#endif
+}
+
 int qemu_create_pidfile(const char *filename)
 {
     char buffer[128];
diff --git a/osdep.h b/osdep.h
index 1cdc7e2..9f0da46 100644
--- a/osdep.h
+++ b/osdep.h
@@ -90,6 +90,10 @@  void *qemu_memalign(size_t alignment, size_t size);
 void *qemu_vmalloc(size_t size);
 void qemu_vfree(void *ptr);
 
+#define QEMU_MADV_DONTNEED 1
+
+int qemu_madvise(void *addr, size_t len, int advice);
+
 int qemu_create_pidfile(const char *filename);
 
 #ifdef _WIN32
diff --git a/vl.c b/vl.c
index 3f45aa9..d352d18 100644
--- a/vl.c
+++ b/vl.c
@@ -80,9 +80,6 @@ 
 #include <net/if.h>
 #include <syslog.h>
 #include <stropts.h>
-/* See MySQL bug #7156 (http://bugs.mysql.com/bug.php?id=7156) for
-   discussion about Solaris header problems */
-extern int madvise(caddr_t, size_t, int);
 #endif
 #endif
 #endif