Patchwork [1/2] trace: Add simple tracing support

login
register
mail settings
Submitter Stefan Hajnoczi
Date May 21, 2010, 9:42 a.m.
Message ID <1274434947-2863-2-git-send-email-stefanha@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/53138/
State New
Headers show

Comments

Stefan Hajnoczi - May 21, 2010, 9:42 a.m.
Trace events should be defined in trace.h.  Events are written to
/tmp/trace.log and can be formatted using trace.py.  Remember to add
events to trace.py for pretty-printing.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 Makefile.objs |    2 +-
 trace.c       |   64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 trace.h       |    9 ++++++++
 trace.py      |   30 ++++++++++++++++++++++++++
 4 files changed, 104 insertions(+), 1 deletions(-)
 create mode 100644 trace.c
 create mode 100644 trace.h
 create mode 100755 trace.py
Stefan Hajnoczi - May 21, 2010, 9:49 a.m.
I should have used the "[RFC]" tag to make it clear that I'm not
proposing these patches for merge, sorry.

Stefan
Jan Kiszka - May 21, 2010, 11:13 a.m.
Stefan Hajnoczi wrote:
> Trace events should be defined in trace.h.  Events are written to
> /tmp/trace.log and can be formatted using trace.py.  Remember to add
> events to trace.py for pretty-printing.

When already writing to a file, why not reusing QEMU's logging
infrastructure ("log <foo>" / "-d foo")? Shouldn't make a huge
performance difference if the data is saved in clear-text.

Also, having support for ftrace's user space markers would be a very
nice option (only an option as it's Linux-specific), see
http://lwn.net/Articles/366796. This allows to correlate kernel events
(KVM as well as others) with what goes on in QEMU. It simply enables
integration with the whole kernel tracing infrastructure, e.g.
KernelShark (http://people.redhat.com/srostedt/kernelshark/HTML).

Jan
Anthony Liguori - May 21, 2010, 12:37 p.m.
On 05/21/2010 04:42 AM, Stefan Hajnoczi wrote:
> Trace events should be defined in trace.h.  Events are written to
> /tmp/trace.log and can be formatted using trace.py.  Remember to add
> events to trace.py for pretty-printing.
>
> Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com>
> ---
>   Makefile.objs |    2 +-
>   trace.c       |   64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   trace.h       |    9 ++++++++
>   trace.py      |   30 ++++++++++++++++++++++++++
>   4 files changed, 104 insertions(+), 1 deletions(-)
>   create mode 100644 trace.c
>   create mode 100644 trace.h
>   create mode 100755 trace.py
>
> diff --git a/Makefile.objs b/Makefile.objs
> index acbaf22..307e989 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -8,7 +8,7 @@ qobject-obj-y += qerror.o
>   # block-obj-y is code used by both qemu system emulation and qemu-img
>
>   block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
> -block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
> +block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o trace.o
>   block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
>   block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>
> diff --git a/trace.c b/trace.c
> new file mode 100644
> index 0000000..2fec4d3
> --- /dev/null
> +++ b/trace.c
> @@ -0,0 +1,64 @@
> +#include<stdlib.h>
> +#include<stdio.h>
> +#include "trace.h"
> +
> +typedef struct {
> +    unsigned long event;
> +    unsigned long x1;
> +    unsigned long x2;
> +    unsigned long x3;
> +    unsigned long x4;
> +    unsigned long x5;
> +} TraceRecord;
> +
> +enum {
> +    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
> +};
> +
> +static TraceRecord trace_buf[TRACE_BUF_LEN];
> +static unsigned int trace_idx;
> +static FILE *trace_fp;
> +
> +static void trace(TraceEvent event, unsigned long x1,
> +                  unsigned long x2, unsigned long x3,
> +                  unsigned long x4, unsigned long x5) {
> +    TraceRecord *rec =&trace_buf[trace_idx];
> +    rec->event = event;
> +    rec->x1 = x1;
> +    rec->x2 = x2;
> +    rec->x3 = x3;
> +    rec->x4 = x4;
> +    rec->x5 = x5;
> +
> +    if (++trace_idx == TRACE_BUF_LEN) {
> +        trace_idx = 0;
> +
> +        if (!trace_fp) {
> +            trace_fp = fopen("/tmp/trace.log", "w");
> +        }
> +        if (trace_fp) {
> +            size_t result = fwrite(trace_buf, sizeof trace_buf, 1, trace_fp);
> +            result = result;
> +        }
> +    }
> +}
>    

It is probably worth while to read trace points via the monitor or 
through some other mechanism.  My concern would be that writing even 64k 
out to disk would introduce enough performance overhead mainly because 
it runs lock-step with the guest's VCPU.

Maybe it's worth adding a thread that syncs the ring to disk if we want 
to write to disk?

> +void trace1(TraceEvent event, unsigned long x1) {
> +    trace(event, x1, 0, 0, 0, 0);
> +}
> +
> +void trace2(TraceEvent event, unsigned long x1, unsigned long x2) {
> +    trace(event, x1, x2, 0, 0, 0);
> +}
> +
> +void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3) {
> +    trace(event, x1, x2, x3, 0, 0);
> +}
> +
> +void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4) {
> +    trace(event, x1, x2, x3, x4, 0);
> +}
> +
> +void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5) {
> +    trace(event, x1, x2, x3, x4, x5);
> +}
> diff --git a/trace.h b/trace.h
> new file mode 100644
> index 0000000..144aa1e
> --- /dev/null
> +++ b/trace.h
> @@ -0,0 +1,9 @@
> +typedef enum {
> +    TRACE_MAX
> +} TraceEvent;
> +
> +void trace1(TraceEvent event, unsigned long x1);
> +void trace2(TraceEvent event, unsigned long x1, unsigned long x2);
> +void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3);
> +void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4);
> +void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5);
>    

Looks good.  I think we definitely need something like this.

Regards,

Anthony Liguori

> diff --git a/trace.py b/trace.py
> new file mode 100755
> index 0000000..f38ab6b
> --- /dev/null
> +++ b/trace.py
> @@ -0,0 +1,30 @@
> +#!/usr/bin/env python
> +import sys
> +import struct
> +
> +trace_fmt = 'LLLLLL'
> +trace_len = struct.calcsize(trace_fmt)
> +
> +events = {
> +}
> +
> +def read_record(fobj):
> +    s = fobj.read(trace_len)
> +    if len(s) != trace_len:
> +        return None
> +    return struct.unpack(trace_fmt, s)
> +
> +def format_record(rec):
> +    event = events[rec[0]]
> +    fields = [event[0]]
> +    for i in xrange(1, len(event)):
> +        fields.append('%s=0x%x' % (event[i], rec[i]))
> +    return ' '.join(fields)
> +
> +f = open(sys.argv[1], 'rb')
> +while True:
> +    rec = read_record(f)
> +    if rec is None:
> +        break
> +
> +    print format_record(rec)
>
Stefan Hajnoczi - May 21, 2010, 1:10 p.m.
On Fri, May 21, 2010 at 12:13 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> Stefan Hajnoczi wrote:
>> Trace events should be defined in trace.h.  Events are written to
>> /tmp/trace.log and can be formatted using trace.py.  Remember to add
>> events to trace.py for pretty-printing.
>
> When already writing to a file, why not reusing QEMU's logging
> infrastructure ("log <foo>" / "-d foo")? Shouldn't make a huge
> performance difference if the data is saved in clear-text.

> Also, having support for ftrace's user space markers would be a very
> nice option (only an option as it's Linux-specific), see
> http://lwn.net/Articles/366796.

Thanks for the links.

I think using the platform's tracing facility has many advantages.
The main one being that we can focus on QEMU/KVM development rather
than re-implementing tracing infrastructure :).

It may be possible to have SystemTap, DTrace, or nop static trace
event code.  A platform with no tracing support can only use the nop
backend, which results in a build without static trace events.
Platforms with tracing support can build with the appropriate backend
or nop.  The backend tracing facility is abstracted and most of QEMU
doesn't need to know which one is being used.

I hadn't seen trace markers.  However, I suspect they aren't ideal for
static trace events because logging an event requires a write system
call.  They look useful for annotating kernel tracing information, but
less for high frequency/low overhead userspace tracing.

Stefan
Jan Kiszka - May 21, 2010, 1:22 p.m.
Stefan Hajnoczi wrote:
> On Fri, May 21, 2010 at 12:13 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> Stefan Hajnoczi wrote:
>>> Trace events should be defined in trace.h.  Events are written to
>>> /tmp/trace.log and can be formatted using trace.py.  Remember to add
>>> events to trace.py for pretty-printing.
>> When already writing to a file, why not reusing QEMU's logging
>> infrastructure ("log <foo>" / "-d foo")? Shouldn't make a huge
>> performance difference if the data is saved in clear-text.
> 
>> Also, having support for ftrace's user space markers would be a very
>> nice option (only an option as it's Linux-specific), see
>> http://lwn.net/Articles/366796.
> 
> Thanks for the links.
> 
> I think using the platform's tracing facility has many advantages.
> The main one being that we can focus on QEMU/KVM development rather
> than re-implementing tracing infrastructure :).

Indeed. :)

> 
> It may be possible to have SystemTap, DTrace, or nop static trace
> event code.  A platform with no tracing support can only use the nop
> backend, which results in a build without static trace events.
> Platforms with tracing support can build with the appropriate backend
> or nop.  The backend tracing facility is abstracted and most of QEMU
> doesn't need to know which one is being used.

That would be ideal.

> 
> I hadn't seen trace markers.  However, I suspect they aren't ideal for
> static trace events because logging an event requires a write system
> call.  They look useful for annotating kernel tracing information, but
> less for high frequency/low overhead userspace tracing.

You never know for sure until you tried :). There are surely lots of
scenarios where this overhead does not matter.

Moreover, I'm sure that something of LTTng's high-frequency/low-overhead
tracing capabilities will make it (in whatever form) into mainline
sooner or later. So we need that smart infrastructure to make use of it
once it's available (actually, LTTng is already available, just still
requires "some" kernel patching).

Jan
Jan Kiszka - May 21, 2010, 1:46 p.m.
Anthony Liguori wrote:
> On 05/21/2010 04:42 AM, Stefan Hajnoczi wrote:
>> Trace events should be defined in trace.h.  Events are written to
>> /tmp/trace.log and can be formatted using trace.py.  Remember to add
>> events to trace.py for pretty-printing.
>>
>> Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com>
>> ---
>>   Makefile.objs |    2 +-
>>   trace.c       |   64
>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   trace.h       |    9 ++++++++
>>   trace.py      |   30 ++++++++++++++++++++++++++
>>   4 files changed, 104 insertions(+), 1 deletions(-)
>>   create mode 100644 trace.c
>>   create mode 100644 trace.h
>>   create mode 100755 trace.py
>>
>> diff --git a/Makefile.objs b/Makefile.objs
>> index acbaf22..307e989 100644
>> --- a/Makefile.objs
>> +++ b/Makefile.objs
>> @@ -8,7 +8,7 @@ qobject-obj-y += qerror.o
>>   # block-obj-y is code used by both qemu system emulation and qemu-img
>>
>>   block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o
>> module.o
>> -block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
>> +block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o trace.o
>>   block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
>>   block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>
>> diff --git a/trace.c b/trace.c
>> new file mode 100644
>> index 0000000..2fec4d3
>> --- /dev/null
>> +++ b/trace.c
>> @@ -0,0 +1,64 @@
>> +#include<stdlib.h>
>> +#include<stdio.h>
>> +#include "trace.h"
>> +
>> +typedef struct {
>> +    unsigned long event;
>> +    unsigned long x1;
>> +    unsigned long x2;
>> +    unsigned long x3;
>> +    unsigned long x4;
>> +    unsigned long x5;
>> +} TraceRecord;
>> +
>> +enum {
>> +    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
>> +};
>> +
>> +static TraceRecord trace_buf[TRACE_BUF_LEN];
>> +static unsigned int trace_idx;
>> +static FILE *trace_fp;
>> +
>> +static void trace(TraceEvent event, unsigned long x1,
>> +                  unsigned long x2, unsigned long x3,
>> +                  unsigned long x4, unsigned long x5) {
>> +    TraceRecord *rec =&trace_buf[trace_idx];
>> +    rec->event = event;
>> +    rec->x1 = x1;
>> +    rec->x2 = x2;
>> +    rec->x3 = x3;
>> +    rec->x4 = x4;
>> +    rec->x5 = x5;
>> +
>> +    if (++trace_idx == TRACE_BUF_LEN) {
>> +        trace_idx = 0;
>> +
>> +        if (!trace_fp) {
>> +            trace_fp = fopen("/tmp/trace.log", "w");
>> +        }
>> +        if (trace_fp) {
>> +            size_t result = fwrite(trace_buf, sizeof trace_buf, 1,
>> trace_fp);
>> +            result = result;
>> +        }
>> +    }
>> +}
>>    
> 
> It is probably worth while to read trace points via the monitor or
> through some other mechanism.  My concern would be that writing even 64k
> out to disk would introduce enough performance overhead mainly because
> it runs lock-step with the guest's VCPU.
> 
> Maybe it's worth adding a thread that syncs the ring to disk if we want
> to write to disk?

That's not what QEMU should worry about. If somehow possible, let's push
this into the hands of a (user space) tracing framework, ideally one
that is already designed for such requirements. E.g. there exists quite
useful work in the context of LTTng (user space RCU for application
tracing).

We may need simple stubs for the case that no such framework is (yet)
available. But effort should focus on a QEMU infrastructure to add
useful tracepoints to the code. Specifically when tracing over KVM, you
usually need information about kernel states as well, so you depend on
an integrated approach, not Yet Another Log File.

Jan
Anthony Liguori - May 21, 2010, 2:03 p.m.
On 05/21/2010 08:46 AM, Jan Kiszka wrote:
> Anthony Liguori wrote:
>    
>> On 05/21/2010 04:42 AM, Stefan Hajnoczi wrote:
>>      
>>> Trace events should be defined in trace.h.  Events are written to
>>> /tmp/trace.log and can be formatted using trace.py.  Remember to add
>>> events to trace.py for pretty-printing.
>>>
>>> Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com>
>>> ---
>>>    Makefile.objs |    2 +-
>>>    trace.c       |   64
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>    trace.h       |    9 ++++++++
>>>    trace.py      |   30 ++++++++++++++++++++++++++
>>>    4 files changed, 104 insertions(+), 1 deletions(-)
>>>    create mode 100644 trace.c
>>>    create mode 100644 trace.h
>>>    create mode 100755 trace.py
>>>
>>> diff --git a/Makefile.objs b/Makefile.objs
>>> index acbaf22..307e989 100644
>>> --- a/Makefile.objs
>>> +++ b/Makefile.objs
>>> @@ -8,7 +8,7 @@ qobject-obj-y += qerror.o
>>>    # block-obj-y is code used by both qemu system emulation and qemu-img
>>>
>>>    block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o
>>> module.o
>>> -block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
>>> +block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o trace.o
>>>    block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
>>>    block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>>
>>> diff --git a/trace.c b/trace.c
>>> new file mode 100644
>>> index 0000000..2fec4d3
>>> --- /dev/null
>>> +++ b/trace.c
>>> @@ -0,0 +1,64 @@
>>> +#include<stdlib.h>
>>> +#include<stdio.h>
>>> +#include "trace.h"
>>> +
>>> +typedef struct {
>>> +    unsigned long event;
>>> +    unsigned long x1;
>>> +    unsigned long x2;
>>> +    unsigned long x3;
>>> +    unsigned long x4;
>>> +    unsigned long x5;
>>> +} TraceRecord;
>>> +
>>> +enum {
>>> +    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
>>> +};
>>> +
>>> +static TraceRecord trace_buf[TRACE_BUF_LEN];
>>> +static unsigned int trace_idx;
>>> +static FILE *trace_fp;
>>> +
>>> +static void trace(TraceEvent event, unsigned long x1,
>>> +                  unsigned long x2, unsigned long x3,
>>> +                  unsigned long x4, unsigned long x5) {
>>> +    TraceRecord *rec =&trace_buf[trace_idx];
>>> +    rec->event = event;
>>> +    rec->x1 = x1;
>>> +    rec->x2 = x2;
>>> +    rec->x3 = x3;
>>> +    rec->x4 = x4;
>>> +    rec->x5 = x5;
>>> +
>>> +    if (++trace_idx == TRACE_BUF_LEN) {
>>> +        trace_idx = 0;
>>> +
>>> +        if (!trace_fp) {
>>> +            trace_fp = fopen("/tmp/trace.log", "w");
>>> +        }
>>> +        if (trace_fp) {
>>> +            size_t result = fwrite(trace_buf, sizeof trace_buf, 1,
>>> trace_fp);
>>> +            result = result;
>>> +        }
>>> +    }
>>> +}
>>>
>>>        
>> It is probably worth while to read trace points via the monitor or
>> through some other mechanism.  My concern would be that writing even 64k
>> out to disk would introduce enough performance overhead mainly because
>> it runs lock-step with the guest's VCPU.
>>
>> Maybe it's worth adding a thread that syncs the ring to disk if we want
>> to write to disk?
>>      
> That's not what QEMU should worry about. If somehow possible, let's push
> this into the hands of a (user space) tracing framework, ideally one
> that is already designed for such requirements. E.g. there exists quite
> useful work in the context of LTTng (user space RCU for application
> tracing).
>    

 From what I understand, none of the current kernel approaches to 
userspace tracing have much momentum at the moment.

> We may need simple stubs for the case that no such framework is (yet)
> available. But effort should focus on a QEMU infrastructure to add
> useful tracepoints to the code. Specifically when tracing over KVM, you
> usually need information about kernel states as well, so you depend on
> an integrated approach, not Yet Another Log File.
>    

I think the simple code that Stefan pasted gives us 95% of what we need.

Regards,

Anthony Liguori

> Jan
>
>
Jan Kiszka - May 21, 2010, 4:52 p.m.
Anthony Liguori wrote:
> On 05/21/2010 08:46 AM, Jan Kiszka wrote:
>> Anthony Liguori wrote:
>>    
>>> On 05/21/2010 04:42 AM, Stefan Hajnoczi wrote:
>>>      
>>>> Trace events should be defined in trace.h.  Events are written to
>>>> /tmp/trace.log and can be formatted using trace.py.  Remember to add
>>>> events to trace.py for pretty-printing.
>>>>
>>>> Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com>
>>>> ---
>>>>    Makefile.objs |    2 +-
>>>>    trace.c       |   64
>>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    trace.h       |    9 ++++++++
>>>>    trace.py      |   30 ++++++++++++++++++++++++++
>>>>    4 files changed, 104 insertions(+), 1 deletions(-)
>>>>    create mode 100644 trace.c
>>>>    create mode 100644 trace.h
>>>>    create mode 100755 trace.py
>>>>
>>>> diff --git a/Makefile.objs b/Makefile.objs
>>>> index acbaf22..307e989 100644
>>>> --- a/Makefile.objs
>>>> +++ b/Makefile.objs
>>>> @@ -8,7 +8,7 @@ qobject-obj-y += qerror.o
>>>>    # block-obj-y is code used by both qemu system emulation and qemu-img
>>>>
>>>>    block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o
>>>> module.o
>>>> -block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
>>>> +block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o trace.o
>>>>    block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
>>>>    block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>>>
>>>> diff --git a/trace.c b/trace.c
>>>> new file mode 100644
>>>> index 0000000..2fec4d3
>>>> --- /dev/null
>>>> +++ b/trace.c
>>>> @@ -0,0 +1,64 @@
>>>> +#include<stdlib.h>
>>>> +#include<stdio.h>
>>>> +#include "trace.h"
>>>> +
>>>> +typedef struct {
>>>> +    unsigned long event;
>>>> +    unsigned long x1;
>>>> +    unsigned long x2;
>>>> +    unsigned long x3;
>>>> +    unsigned long x4;
>>>> +    unsigned long x5;
>>>> +} TraceRecord;
>>>> +
>>>> +enum {
>>>> +    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
>>>> +};
>>>> +
>>>> +static TraceRecord trace_buf[TRACE_BUF_LEN];
>>>> +static unsigned int trace_idx;
>>>> +static FILE *trace_fp;
>>>> +
>>>> +static void trace(TraceEvent event, unsigned long x1,
>>>> +                  unsigned long x2, unsigned long x3,
>>>> +                  unsigned long x4, unsigned long x5) {
>>>> +    TraceRecord *rec =&trace_buf[trace_idx];
>>>> +    rec->event = event;
>>>> +    rec->x1 = x1;
>>>> +    rec->x2 = x2;
>>>> +    rec->x3 = x3;
>>>> +    rec->x4 = x4;
>>>> +    rec->x5 = x5;
>>>> +
>>>> +    if (++trace_idx == TRACE_BUF_LEN) {
>>>> +        trace_idx = 0;
>>>> +
>>>> +        if (!trace_fp) {
>>>> +            trace_fp = fopen("/tmp/trace.log", "w");
>>>> +        }
>>>> +        if (trace_fp) {
>>>> +            size_t result = fwrite(trace_buf, sizeof trace_buf, 1,
>>>> trace_fp);
>>>> +            result = result;
>>>> +        }
>>>> +    }
>>>> +}
>>>>
>>>>        
>>> It is probably worth while to read trace points via the monitor or
>>> through some other mechanism.  My concern would be that writing even 64k
>>> out to disk would introduce enough performance overhead mainly because
>>> it runs lock-step with the guest's VCPU.
>>>
>>> Maybe it's worth adding a thread that syncs the ring to disk if we want
>>> to write to disk?
>>>      
>> That's not what QEMU should worry about. If somehow possible, let's push
>> this into the hands of a (user space) tracing framework, ideally one
>> that is already designed for such requirements. E.g. there exists quite
>> useful work in the context of LTTng (user space RCU for application
>> tracing).
>>    
> 
>  From what I understand, none of the current kernel approaches to 
> userspace tracing have much momentum at the moment.
> 
>> We may need simple stubs for the case that no such framework is (yet)
>> available. But effort should focus on a QEMU infrastructure to add
>> useful tracepoints to the code. Specifically when tracing over KVM, you
>> usually need information about kernel states as well, so you depend on
>> an integrated approach, not Yet Another Log File.
>>    
> 
> I think the simple code that Stefan pasted gives us 95% of what we need.

IMHO not 95%, but it is a start.

I would just like to avoid that too much efforts are spent on
re-inventing smart trace buffers, trace daemons, or trace visualization
tools. Then better pick up some semi-perfect approach (e.g. [1], it
unfortunately still seems to lack kernel integration) and drive it
according to our needs.

Jan

[1] http://lttng.org/ust
Stefan Hajnoczi - May 21, 2010, 8:49 p.m.
On Fri, May 21, 2010 at 5:52 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> I would just like to avoid that too much efforts are spent on
> re-inventing smart trace buffers, trace daemons, or trace visualization
> tools. Then better pick up some semi-perfect approach (e.g. [1], it
> unfortunately still seems to lack kernel integration) and drive it
> according to our needs.

I agree we have to consider existing solutions.  The killer is the
usability: what dependencies are required to build with tracing?  Is a
patched kernel or module required?  How easy is it to add static trace
events during debugging?

If there are too many dependencies, especially to unpackaged software,
many people will stop right there and not bother.  A patched kernel or
module isn't acceptable since the hassle of reconfiguring a system for
tracing becomes too great (or in some cases changing the kernel is not
possible/allowed).

Adding new static trace events should be easy, too.  Ideally it
doesn't require adding information about the trace event in multiple
places (header files, C files, etc).  It also shouldn't require
learning about the tracing system, adding a trace event should be
self-explanatory so anyone can easily add one for debugging.

A lot of opinions there, but what I'm saying is that friction must be
low.  If the tracing system is a pain to use, then no-one will use it.

http://lttng.org/files/ust/manual/ust.html

LTTng Userspace Tracer looks interesting - no kernel support required
AFAICT.  Toggling trace events in a running process supported.
Similar to kernel tracepoint.h and existing report/visualization tool.

x86 (32- and 64-bit) only.  Like you say, no correlation with kernel trace data.

I'll try to give LTTng UST a spin by converting my trace events to use
UST.  This seems closest to an existing tracing system we can drop in.

http://sourceware.org/systemtap/wiki/AddingUserSpaceProbingToApps

Requires kernel support - not sure if enough of utrace is in mainline
for this to work out-of-the-box across distros.

Unclear how exactly SystemTap userspace probing would work out.  Does
anyone have experience or want to try this?

Stefan
Anthony Liguori - May 21, 2010, 9:06 p.m.
On 05/21/2010 11:52 AM, Jan Kiszka wrote:
> Anthony Liguori wrote:
>    
>> On 05/21/2010 08:46 AM, Jan Kiszka wrote:
>>      
>>> Anthony Liguori wrote:
>>>
>>>        
>>>> On 05/21/2010 04:42 AM, Stefan Hajnoczi wrote:
>>>>
>>>>          
>>>>> Trace events should be defined in trace.h.  Events are written to
>>>>> /tmp/trace.log and can be formatted using trace.py.  Remember to add
>>>>> events to trace.py for pretty-printing.
>>>>>
>>>>> Signed-off-by: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com>
>>>>> ---
>>>>>     Makefile.objs |    2 +-
>>>>>     trace.c       |   64
>>>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>     trace.h       |    9 ++++++++
>>>>>     trace.py      |   30 ++++++++++++++++++++++++++
>>>>>     4 files changed, 104 insertions(+), 1 deletions(-)
>>>>>     create mode 100644 trace.c
>>>>>     create mode 100644 trace.h
>>>>>     create mode 100755 trace.py
>>>>>
>>>>> diff --git a/Makefile.objs b/Makefile.objs
>>>>> index acbaf22..307e989 100644
>>>>> --- a/Makefile.objs
>>>>> +++ b/Makefile.objs
>>>>> @@ -8,7 +8,7 @@ qobject-obj-y += qerror.o
>>>>>     # block-obj-y is code used by both qemu system emulation and qemu-img
>>>>>
>>>>>     block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o
>>>>> module.o
>>>>> -block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
>>>>> +block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o trace.o
>>>>>     block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
>>>>>     block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>>>>
>>>>> diff --git a/trace.c b/trace.c
>>>>> new file mode 100644
>>>>> index 0000000..2fec4d3
>>>>> --- /dev/null
>>>>> +++ b/trace.c
>>>>> @@ -0,0 +1,64 @@
>>>>> +#include<stdlib.h>
>>>>> +#include<stdio.h>
>>>>> +#include "trace.h"
>>>>> +
>>>>> +typedef struct {
>>>>> +    unsigned long event;
>>>>> +    unsigned long x1;
>>>>> +    unsigned long x2;
>>>>> +    unsigned long x3;
>>>>> +    unsigned long x4;
>>>>> +    unsigned long x5;
>>>>> +} TraceRecord;
>>>>> +
>>>>> +enum {
>>>>> +    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
>>>>> +};
>>>>> +
>>>>> +static TraceRecord trace_buf[TRACE_BUF_LEN];
>>>>> +static unsigned int trace_idx;
>>>>> +static FILE *trace_fp;
>>>>> +
>>>>> +static void trace(TraceEvent event, unsigned long x1,
>>>>> +                  unsigned long x2, unsigned long x3,
>>>>> +                  unsigned long x4, unsigned long x5) {
>>>>> +    TraceRecord *rec =&trace_buf[trace_idx];
>>>>> +    rec->event = event;
>>>>> +    rec->x1 = x1;
>>>>> +    rec->x2 = x2;
>>>>> +    rec->x3 = x3;
>>>>> +    rec->x4 = x4;
>>>>> +    rec->x5 = x5;
>>>>> +
>>>>> +    if (++trace_idx == TRACE_BUF_LEN) {
>>>>> +        trace_idx = 0;
>>>>> +
>>>>> +        if (!trace_fp) {
>>>>> +            trace_fp = fopen("/tmp/trace.log", "w");
>>>>> +        }
>>>>> +        if (trace_fp) {
>>>>> +            size_t result = fwrite(trace_buf, sizeof trace_buf, 1,
>>>>> trace_fp);
>>>>> +            result = result;
>>>>> +        }
>>>>> +    }
>>>>> +}
>>>>>
>>>>>
>>>>>            
>>>> It is probably worth while to read trace points via the monitor or
>>>> through some other mechanism.  My concern would be that writing even 64k
>>>> out to disk would introduce enough performance overhead mainly because
>>>> it runs lock-step with the guest's VCPU.
>>>>
>>>> Maybe it's worth adding a thread that syncs the ring to disk if we want
>>>> to write to disk?
>>>>
>>>>          
>>> That's not what QEMU should worry about. If somehow possible, let's push
>>> this into the hands of a (user space) tracing framework, ideally one
>>> that is already designed for such requirements. E.g. there exists quite
>>> useful work in the context of LTTng (user space RCU for application
>>> tracing).
>>>
>>>        
>>    From what I understand, none of the current kernel approaches to
>> userspace tracing have much momentum at the moment.
>>
>>      
>>> We may need simple stubs for the case that no such framework is (yet)
>>> available. But effort should focus on a QEMU infrastructure to add
>>> useful tracepoints to the code. Specifically when tracing over KVM, you
>>> usually need information about kernel states as well, so you depend on
>>> an integrated approach, not Yet Another Log File.
>>>
>>>        
>> I think the simple code that Stefan pasted gives us 95% of what we need.
>>      
> IMHO not 95%, but it is a start.
>    

I'm not opposed to using a framework, but I'd rather have an equivalent 
to kvm_stat tomorrow than wait 3 years for LTTng to not get merged.

So let's have a dirt-simple tracing mechanism and focus on adding useful 
trace points.  Then when we have a framework we can use, we can just 
convert the tracepoints to the new framework.

Regards,

Anthony Liguori

> I would just like to avoid that too much efforts are spent on
> re-inventing smart trace buffers, trace daemons, or trace visualization
> tools. Then better pick up some semi-perfect approach (e.g. [1], it
> unfortunately still seems to lack kernel integration) and drive it
> according to our needs.
>
> Jan
>
> [1] http://lttng.org/ust
>
>
Christoph Hellwig - May 21, 2010, 9:26 p.m.
On Fri, May 21, 2010 at 09:49:56PM +0100, Stefan Hajnoczi wrote:
> http://sourceware.org/systemtap/wiki/AddingUserSpaceProbingToApps
> 
> Requires kernel support - not sure if enough of utrace is in mainline
> for this to work out-of-the-box across distros.

Nothing of utrace is in mainline, nevermind the whole systemtap code
which is intentionally keep out of the kernel tree.  Using this means
that for every probe in userspace code you need to keep the configured
source tree of the currently running kernel around, which is completely
unusable for typical developer setups.
Jan Kiszka - May 21, 2010, 9:37 p.m.
Stefan Hajnoczi wrote:
> On Fri, May 21, 2010 at 5:52 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> I would just like to avoid that too much efforts are spent on
>> re-inventing smart trace buffers, trace daemons, or trace visualization
>> tools. Then better pick up some semi-perfect approach (e.g. [1], it
>> unfortunately still seems to lack kernel integration) and drive it
>> according to our needs.
> 
> I agree we have to consider existing solutions.  The killer is the
> usability: what dependencies are required to build with tracing?  Is a
> patched kernel or module required?  How easy is it to add static trace
> events during debugging?
> 
> If there are too many dependencies, especially to unpackaged software,
> many people will stop right there and not bother.  A patched kernel or
> module isn't acceptable since the hassle of reconfiguring a system for
> tracing becomes too great (or in some cases changing the kernel is not
> possible/allowed).
> 
> Adding new static trace events should be easy, too.  Ideally it
> doesn't require adding information about the trace event in multiple
> places (header files, C files, etc).  It also shouldn't require
> learning about the tracing system, adding a trace event should be
> self-explanatory so anyone can easily add one for debugging.
> 
> A lot of opinions there, but what I'm saying is that friction must be
> low.  If the tracing system is a pain to use, then no-one will use it.

No question.

I mentioned LTTng as it is most promising /wrt performance (both when
enabled and disabled). But LTTng was so far not best in class when it
came to usability.

> 
> http://lttng.org/files/ust/manual/ust.html
> 
> LTTng Userspace Tracer looks interesting - no kernel support required
> AFAICT.  Toggling trace events in a running process supported.
> Similar to kernel tracepoint.h and existing report/visualization tool.
> 
> x86 (32- and 64-bit) only.

Sure? I thought there might be an arch dependency due to urcu but it has
generic support as well now.

>  Like you say, no correlation with kernel trace data.

It would be good if we could still hook into trancepoints and stream
them out differently. That would allow for add-hoc tracing when
performance does not matter that much (trace to file, trace to kernel).
But we would still benefit from enabling tracepoints during runtime and
keeping them built in.

Jan
Jan Kiszka - May 21, 2010, 9:41 p.m.
Anthony Liguori wrote:
> I'm not opposed to using a framework, but I'd rather have an equivalent
> to kvm_stat tomorrow than wait 3 years for LTTng to not get merged.
> 
> So let's have a dirt-simple tracing mechanism and focus on adding useful
> trace points.  Then when we have a framework we can use, we can just
> convert the tracepoints to the new framework.

That could mean serializing the tracepoints to strings and dumping them
to our log file - no concerns.

Jan
Anthony Liguori - May 21, 2010, 9:58 p.m.
On 05/21/2010 04:41 PM, Jan Kiszka wrote:
> Anthony Liguori wrote:
>    
>> I'm not opposed to using a framework, but I'd rather have an equivalent
>> to kvm_stat tomorrow than wait 3 years for LTTng to not get merged.
>>
>> So let's have a dirt-simple tracing mechanism and focus on adding useful
>> trace points.  Then when we have a framework we can use, we can just
>> convert the tracepoints to the new framework.
>>      
> That could mean serializing the tracepoints to strings and dumping them
> to our log file - no concerns.
>    

Which I really don't mind.

Regards,

Anthony Liguori

> Jan
>
>

Patch

diff --git a/Makefile.objs b/Makefile.objs
index acbaf22..307e989 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -8,7 +8,7 @@  qobject-obj-y += qerror.o
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
-block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
+block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o trace.o
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
diff --git a/trace.c b/trace.c
new file mode 100644
index 0000000..2fec4d3
--- /dev/null
+++ b/trace.c
@@ -0,0 +1,64 @@ 
+#include <stdlib.h>
+#include <stdio.h>
+#include "trace.h"
+
+typedef struct {
+    unsigned long event;
+    unsigned long x1;
+    unsigned long x2;
+    unsigned long x3;
+    unsigned long x4;
+    unsigned long x5;
+} TraceRecord;
+
+enum {
+    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
+};
+
+static TraceRecord trace_buf[TRACE_BUF_LEN];
+static unsigned int trace_idx;
+static FILE *trace_fp;
+
+static void trace(TraceEvent event, unsigned long x1,
+                  unsigned long x2, unsigned long x3,
+                  unsigned long x4, unsigned long x5) {
+    TraceRecord *rec = &trace_buf[trace_idx];
+    rec->event = event;
+    rec->x1 = x1;
+    rec->x2 = x2;
+    rec->x3 = x3;
+    rec->x4 = x4;
+    rec->x5 = x5;
+
+    if (++trace_idx == TRACE_BUF_LEN) {
+        trace_idx = 0;
+
+        if (!trace_fp) {
+            trace_fp = fopen("/tmp/trace.log", "w");
+        }
+        if (trace_fp) {
+            size_t result = fwrite(trace_buf, sizeof trace_buf, 1, trace_fp);
+            result = result;
+        }
+    }
+}
+
+void trace1(TraceEvent event, unsigned long x1) {
+    trace(event, x1, 0, 0, 0, 0);
+}
+
+void trace2(TraceEvent event, unsigned long x1, unsigned long x2) {
+    trace(event, x1, x2, 0, 0, 0);
+}
+
+void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3) {
+    trace(event, x1, x2, x3, 0, 0);
+}
+
+void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4) {
+    trace(event, x1, x2, x3, x4, 0);
+}
+
+void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5) {
+    trace(event, x1, x2, x3, x4, x5);
+}
diff --git a/trace.h b/trace.h
new file mode 100644
index 0000000..144aa1e
--- /dev/null
+++ b/trace.h
@@ -0,0 +1,9 @@ 
+typedef enum {
+    TRACE_MAX
+} TraceEvent;
+
+void trace1(TraceEvent event, unsigned long x1);
+void trace2(TraceEvent event, unsigned long x1, unsigned long x2);
+void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3);
+void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4);
+void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5);
diff --git a/trace.py b/trace.py
new file mode 100755
index 0000000..f38ab6b
--- /dev/null
+++ b/trace.py
@@ -0,0 +1,30 @@ 
+#!/usr/bin/env python
+import sys
+import struct
+
+trace_fmt = 'LLLLLL'
+trace_len = struct.calcsize(trace_fmt)
+
+events = {
+}
+
+def read_record(fobj):
+    s = fobj.read(trace_len)
+    if len(s) != trace_len:
+        return None
+    return struct.unpack(trace_fmt, s)
+
+def format_record(rec):
+    event = events[rec[0]]
+    fields = [event[0]]
+    for i in xrange(1, len(event)):
+        fields.append('%s=0x%x' % (event[i], rec[i]))
+    return ' '.join(fields)
+
+f = open(sys.argv[1], 'rb')
+while True:
+    rec = read_record(f)
+    if rec is None:
+        break
+
+    print format_record(rec)