diff mbox

[RFC,v3,2/3] simpletrace-v2: Handle variable number/size of elements per trace record.

Message ID 1326193179-19677-3-git-send-email-harsh@linux.vnet.ibm.com
State New
Headers show

Commit Message

Harsh Prateek Bora Jan. 10, 2012, 10:59 a.m. UTC
Advantages over existing simpletrace backend:
- More than 6 elements (virtually unlimited) arguments can be traced.
- This allows tracing strings (variable size element) as well.

Signed-off-by: Harsh Prateek Bora <harsh@linux.vnet.ibm.com>
---
 monitor.c      |    2 +-
 trace/simple.c |  178 ++++++++++++++++++++------------------------------------
 trace/simple.h |   31 ++++++++--
 3 files changed, 88 insertions(+), 123 deletions(-)

Comments

Stefan Hajnoczi Jan. 10, 2012, 4:41 p.m. UTC | #1
On Tue, Jan 10, 2012 at 10:59 AM, Harsh Prateek Bora
<harsh@linux.vnet.ibm.com> wrote:
> Advantages over existing simpletrace backend:
> - More than 6 elements (virtually unlimited) arguments can be traced.
> - This allows to trace strings (variable size element) as well.
>
> Signed-off-by: Harsh Prateek Bora <harsh@linux.vnet.ibm.com>
> ---
>  monitor.c      |    2 +-
>  trace/simple.c |  178 ++++++++++++++++++++------------------------------------
>  trace/simple.h |   31 ++++++++--
>  3 files changed, 88 insertions(+), 123 deletions(-)
>
> diff --git a/monitor.c b/monitor.c
> index ffda0fe..b6f85d1 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -945,7 +945,7 @@ static void do_info_cpu_stats(Monitor *mon)
>  #if defined(CONFIG_TRACE_SIMPLE)
>  static void do_info_trace(Monitor *mon)
>  {
> -    st_print_trace((FILE *)mon, &monitor_fprintf);
> +    /* FIXME: st_print_trace((FILE *)mon, &monitor_fprintf); */

This command is only available from the human monitor.  It's not very
useful because it historically hasn't been able to pretty-print events
or show them in the right order (we use a ringbuffer but it prints
them out from index 0).

Therefore, I don't think we're under any obligation to keep this
command around.  No one has complained about its limitations - I
think this is a sign that no one has used it.  I'd be okay with a
patch that removes it.

>  }
>  #endif
>
> diff --git a/trace/simple.c b/trace/simple.c
> index b639dda..65cafe2 100644
> --- a/trace/simple.c
> +++ b/trace/simple.c
> @@ -27,7 +27,7 @@
>  #define HEADER_MAGIC 0xf2b177cb0aa429b4ULL
>
>  /** Trace file version number, bump if format changes */
> -#define HEADER_VERSION 0
> +#define HEADER_VERSION 2
>
>  /** Records were dropped event ID */
>  #define DROPPED_EVENT_ID (~(uint64_t)0 - 1)
> @@ -35,23 +35,6 @@
>  /** Trace record is valid */
>  #define TRACE_RECORD_VALID ((uint64_t)1 << 63)
>
> -/** Trace buffer entry */
> -typedef struct {
> -    uint64_t event;
> -    uint64_t timestamp_ns;
> -    uint64_t x1;
> -    uint64_t x2;
> -    uint64_t x3;
> -    uint64_t x4;
> -    uint64_t x5;
> -    uint64_t x6;
> -} TraceRecord;
> -
> -enum {
> -    TRACE_BUF_LEN = 4096,
> -    TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
> -};
> -
>  /*
>  * Trace records are written out by a dedicated thread.  The thread waits for
>  * records to become available, writes them out, and then waits again.
> @@ -62,11 +45,12 @@ static GCond *trace_empty_cond;
>  static bool trace_available;
>  static bool trace_writeout_enabled;
>
> -static TraceRecord trace_buf[TRACE_BUF_LEN];
> +uint8_t trace_buf[TRACE_BUF_LEN];
>  static unsigned int trace_idx;
>  static FILE *trace_fp;
>  static char *trace_file_name = NULL;
>
> +

Spurious newline.

>  /**
>  * Read a trace record from the trace buffer
>  *
> @@ -75,16 +59,19 @@ static char *trace_file_name = NULL;
>  *
>  * Returns false if the record is not valid.
>  */
> -static bool get_trace_record(unsigned int idx, TraceRecord *record)
> +static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
>  {
> -    if (!(trace_buf[idx].event & TRACE_RECORD_VALID)) {
> +    TraceRecord *record = (TraceRecord *) &trace_buf[idx];
> +    if (!(record->event & TRACE_RECORD_VALID)) {
>         return false;
>     }
>
>     __sync_synchronize(); /* read memory barrier before accessing record */
>
> -    *record = trace_buf[idx];
> -    record->event &= ~TRACE_RECORD_VALID;
> +    *recordptr = g_malloc(record->length);
> +    /* make a copy of record to avoid being overwritten */
> +    memcpy(*recordptr, record, record->length);
> +    (*recordptr)->event &= ~TRACE_RECORD_VALID;
>     return true;
>  }
>
> @@ -106,6 +93,47 @@ static void flush_trace_file(bool wait)
>     g_static_mutex_unlock(&trace_lock);
>  }
>
> +unsigned int trace_alloc_record(TraceEventID event, uint32_t datasize)
> +{
> +    unsigned int idx, rec_off;
> +    uint32_t rec_len = ST_V2_REC_HDR_LEN + datasize;
> +    uint64_t timestamp_ns = get_clock();
> +
> +    idx = g_atomic_int_exchange_and_add((gint *)&trace_idx, rec_len) % TRACE_BUF_LEN;
> +    rec_off = idx;
> +    write_to_buffer(rec_off, (uint8_t*)&event, sizeof(event));
> +    rec_off += sizeof(event);
> +    write_to_buffer(rec_off, (uint8_t*)&timestamp_ns, sizeof(timestamp_ns));
> +    rec_off += sizeof(timestamp_ns);
> +    write_to_buffer(rec_off, (uint8_t*)&rec_len, sizeof(rec_len));
> +    rec_off += sizeof(rec_len);
> +    return idx;
> +}
> +
> +void write_to_buffer(unsigned int idx, uint8_t *dataptr, uint32_t size)
> +{
> +    uint32_t x = 0;
> +    while (x < size)
> +    {
> +        if (idx >= TRACE_BUF_LEN) {
> +            idx = idx % TRACE_BUF_LEN;
> +        }
> +        trace_buf[idx++] = dataptr[x++];
> +    }
> +}
> +
> +void trace_mark_record_complete(unsigned int idx)
> +{
> +    TraceRecord *record = (TraceRecord*) &trace_buf[idx];
> +
> +    __sync_synchronize(); /* write barrier before marking as valid */
> +    record->event |= TRACE_RECORD_VALID;
> +
> +    if (idx > TRACE_BUF_FLUSH_THRESHOLD) {
> +        flush_trace_file(false);
> +    }

This looks suspicious.  We want to kick the writeout thread when the
buffer reaches the threshold but not repeatedly after that.  Also
because it's a ring buffer idx could "start" above the threshold.

Perhaps something like:
if (idx > TRACE_BUF_FLUSH_THRESHOLD && old_idx <= TRACE_BUF_FLUSH_THRESHOLD) {
    flush_trace_file(false);
}

> +}
> +
>  static void wait_for_trace_records_available(void)
>  {
>     g_static_mutex_lock(&trace_lock);
> @@ -120,7 +148,7 @@ static void wait_for_trace_records_available(void)
>
>  static gpointer writeout_thread(gpointer opaque)
>  {
> -    TraceRecord record;
> +    TraceRecord record, *recordptr;
>     unsigned int writeout_idx = 0;
>     unsigned int num_available, idx;
>     size_t unused __attribute__ ((unused));
> @@ -130,19 +158,20 @@ static gpointer writeout_thread(gpointer opaque)
>
>         num_available = trace_idx - writeout_idx;
>         if (num_available > TRACE_BUF_LEN) {
> -            record = (TraceRecord){
> -                .event = DROPPED_EVENT_ID,
> -                .x1 = num_available,
> -            };
> -            unused = fwrite(&record, sizeof(record), 1, trace_fp);
> +            record.event = DROPPED_EVENT_ID,
> +            record.length = num_available,

Please don't use .length for the number of bytes dropped because this
makes life harder for parsing scripts like simpletrace.py.  They would
need a special case to check for DROPPED_EVENT_ID and then treat
.length as an argument.  Instead the number of bytes should be an
argument.

Also, by removing the struct literal assignment you lose the automatic
zeroing of fields.  We'll write out junk data - it would be nicer to
clear it.

> +            unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
>             writeout_idx += num_available;
>         }
>
>         idx = writeout_idx % TRACE_BUF_LEN;
> -        while (get_trace_record(idx, &record)) {
> -            trace_buf[idx].event = 0; /* clear valid bit */
> -            unused = fwrite(&record, sizeof(record), 1, trace_fp);
> -            idx = ++writeout_idx % TRACE_BUF_LEN;
> +        while (get_trace_record(idx, &recordptr)) {
> +            unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
> +            writeout_idx += recordptr->length;
> +            g_free(recordptr);
> +            recordptr = (TraceRecord *) &trace_buf[idx];
> +            recordptr->event = 0;
> +            idx = writeout_idx % TRACE_BUF_LEN;
>         }

I'm wondering if it's worth using a different approach here.  Writing
out individual records has bothered me.

If we have a second buffer, as big as trace_buf[], then a function can
copy out all records and make them available in trace_buf again:

/**
 * Copy completed trace records out of the ring buffer
 *
 * @idx    offset into trace_buf[]
 * @buf    destination buffer
 * @len    size of destination buffer
 * @ret    the number of bytes consumed
 */
size_t consume_trace_records(unsigned int idx, void *buf, size_t len);

That means consume gobbles up as many records as it can:
 * Until it reaches an invalid record which has not been written yet
 * Until it reaches the end of trace_buf[], the caller can call again
with idx wrapped to 0

After copying into buf[] it clears the valid bit in trace_buf[].

Then the loop which calls consume_trace_records() does a single
fwrite(3) and increments idx/writeout_idx.

The advantage to this is that we write many records in one go and I
think it splits up the writeout steps a little nicer than what we've
previously done.

>
>         fflush(trace_fp);
> @@ -231,7 +196,7 @@ void st_set_trace_file_enabled(bool enable)
>         static const TraceRecord header = {
>             .event = HEADER_EVENT_ID,
>             .timestamp_ns = HEADER_MAGIC,
> -            .x1 = HEADER_VERSION,
> +            .length = HEADER_VERSION,

Hmm...this is kind of ugly (see comment about using .length above) but
in this case most parsers will have a special-case anyway to check the
magic number.  We need to use the .length field because historically
that's where the version is located.

> +/* Interfaces for simpletrace v2 */

Please drop this comment.  All trace/simple.[ch] code is for v2, this
makes it seems like we support both versions at the same time.

> +
> +/** Trace buffer entry */
> +typedef struct {
> +    uint64_t event; /*  TraceEventID */
> +    uint64_t timestamp_ns;
> +    uint32_t length;   /*   in bytes */
> +    uint32_t reserved; /*   unused */
> +    uint8_t arguments[]; /* arguments position affects ST_V2_REC_HDR_LEN */
> +} TraceRecord;

Does this need to be visible?  I think it can live in trace/simple.c

> +
> +#define ST_V2_REC_HDR_LEN 24

Does not need to be visible.  Also why calculate manually, instead of
a macro I would use sizeof(TraceRecord) (the array without elements is
not counted by sizeof).

> +
> +enum {
> +    TRACE_BUF_LEN = 4096 * 64,
> +    TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
> +};

Does not need to be visible.

> +
> +extern uint8_t trace_buf[TRACE_BUF_LEN];

Does not need to be visible.
Harsh Prateek Bora Jan. 18, 2012, 9:14 a.m. UTC | #2
On 01/10/2012 10:11 PM, Stefan Hajnoczi wrote:
> On Tue, Jan 10, 2012 at 10:59 AM, Harsh Prateek Bora
> <harsh@linux.vnet.ibm.com>  wrote:
>> Advantages over existing simpletrace backend:
>> - More than 6 elements (virtually unlimited) arguments can be traced.
>> - This allows to trace strings (variable size element) as well.
>>
>> Signed-off-by: Harsh Prateek Bora<harsh@linux.vnet.ibm.com>
>> ---
>>   monitor.c      |    2 +-
>>   trace/simple.c |  178 ++++++++++++++++++++------------------------------------
>>   trace/simple.h |   31 ++++++++--
>>   3 files changed, 88 insertions(+), 123 deletions(-)
>>
>> diff --git a/monitor.c b/monitor.c
>> index ffda0fe..b6f85d1 100644
>> --- a/monitor.c
>> +++ b/monitor.c
>> @@ -945,7 +945,7 @@ static void do_info_cpu_stats(Monitor *mon)
>>   #if defined(CONFIG_TRACE_SIMPLE)
>>   static void do_info_trace(Monitor *mon)
>>   {
>> -    st_print_trace((FILE *)mon,&monitor_fprintf);
>> +    /* FIXME: st_print_trace((FILE *)mon,&monitor_fprintf); */
>
> This command is only available from the human monitor.  It's not very
> useful because it historically hasn't been able to pretty-print events
> or show them in the right order (we use a ringbuffer but it prints
> them out from index 0).
>
> Therefore, I don't think we're under any obligation to keep this
> command around.  No one has complained about its limitations - I
> think this is a sign that no one has used it.  I'd be okay with a
> patch that removes it.
>
>>   }
>>   #endif
>>
>> diff --git a/trace/simple.c b/trace/simple.c
>> index b639dda..65cafe2 100644
>> --- a/trace/simple.c
>> +++ b/trace/simple.c
>> @@ -27,7 +27,7 @@
>>   #define HEADER_MAGIC 0xf2b177cb0aa429b4ULL
>>
>>   /** Trace file version number, bump if format changes */
>> -#define HEADER_VERSION 0
>> +#define HEADER_VERSION 2
>>
>>   /** Records were dropped event ID */
>>   #define DROPPED_EVENT_ID (~(uint64_t)0 - 1)
>> @@ -35,23 +35,6 @@
>>   /** Trace record is valid */
>>   #define TRACE_RECORD_VALID ((uint64_t)1<<  63)
>>
>> -/** Trace buffer entry */
>> -typedef struct {
>> -    uint64_t event;
>> -    uint64_t timestamp_ns;
>> -    uint64_t x1;
>> -    uint64_t x2;
>> -    uint64_t x3;
>> -    uint64_t x4;
>> -    uint64_t x5;
>> -    uint64_t x6;
>> -} TraceRecord;
>> -
>> -enum {
>> -    TRACE_BUF_LEN = 4096,
>> -    TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
>> -};
>> -
>>   /*
>>   * Trace records are written out by a dedicated thread.  The thread waits for
>>   * records to become available, writes them out, and then waits again.
>> @@ -62,11 +45,12 @@ static GCond *trace_empty_cond;
>>   static bool trace_available;
>>   static bool trace_writeout_enabled;
>>
>> -static TraceRecord trace_buf[TRACE_BUF_LEN];
>> +uint8_t trace_buf[TRACE_BUF_LEN];
>>   static unsigned int trace_idx;
>>   static FILE *trace_fp;
>>   static char *trace_file_name = NULL;
>>
>> +
>
> Spurious newline.
>
>>   /**
>>   * Read a trace record from the trace buffer
>>   *
>> @@ -75,16 +59,19 @@ static char *trace_file_name = NULL;
>>   *
>>   * Returns false if the record is not valid.
>>   */
>> -static bool get_trace_record(unsigned int idx, TraceRecord *record)
>> +static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
>>   {
>> -    if (!(trace_buf[idx].event&  TRACE_RECORD_VALID)) {
>> +    TraceRecord *record = (TraceRecord *)&trace_buf[idx];
>> +    if (!(record->event&  TRACE_RECORD_VALID)) {
>>          return false;
>>      }
>>
>>      __sync_synchronize(); /* read memory barrier before accessing record */
>>
>> -    *record = trace_buf[idx];
>> -    record->event&= ~TRACE_RECORD_VALID;
>> +    *recordptr = g_malloc(record->length);
>> +    /* make a copy of record to avoid being overwritten */
>> +    memcpy(*recordptr, record, record->length);
>> +    (*recordptr)->event&= ~TRACE_RECORD_VALID;
>>      return true;
>>   }
>>
>> @@ -106,6 +93,47 @@ static void flush_trace_file(bool wait)
>>      g_static_mutex_unlock(&trace_lock);
>>   }
>>
>> +unsigned int trace_alloc_record(TraceEventID event, uint32_t datasize)
>> +{
>> +    unsigned int idx, rec_off;
>> +    uint32_t rec_len = ST_V2_REC_HDR_LEN + datasize;
>> +    uint64_t timestamp_ns = get_clock();
>> +
>> +    idx = g_atomic_int_exchange_and_add((gint *)&trace_idx, rec_len) % TRACE_BUF_LEN;
>> +    rec_off = idx;
>> +    write_to_buffer(rec_off, (uint8_t*)&event, sizeof(event));
>> +    rec_off += sizeof(event);
>> +    write_to_buffer(rec_off, (uint8_t*)&timestamp_ns, sizeof(timestamp_ns));
>> +    rec_off += sizeof(timestamp_ns);
>> +    write_to_buffer(rec_off, (uint8_t*)&rec_len, sizeof(rec_len));
>> +    rec_off += sizeof(rec_len);
>> +    return idx;
>> +}
>> +
>> +void write_to_buffer(unsigned int idx, uint8_t *dataptr, uint32_t size)
>> +{
>> +    uint32_t x = 0;
>> +    while (x<  size)
>> +    {
>> +        if (idx>= TRACE_BUF_LEN) {
>> +            idx = idx % TRACE_BUF_LEN;
>> +        }
>> +        trace_buf[idx++] = dataptr[x++];
>> +    }
>> +}
>> +
>> +void trace_mark_record_complete(unsigned int idx)
>> +{
>> +    TraceRecord *record = (TraceRecord*)&trace_buf[idx];
>> +
>> +    __sync_synchronize(); /* write barrier before marking as valid */
>> +    record->event |= TRACE_RECORD_VALID;
>> +
>> +    if (idx>  TRACE_BUF_FLUSH_THRESHOLD) {
>> +        flush_trace_file(false);
>> +    }
>
> This looks suspicious.  We want to kick the writeout thread when the
> buffer reaches the threshold but not repeatedly after that.  Also
> because it's a ring buffer idx could "start" above the threshold.
>
> Perhaps something like:
> if (idx>  TRACE_BUF_FLUSH_THRESHOLD&&  old_idx<= TRACE_BUF_FLUSH_THRESHOLD) {
>      flush_trace_file(false);
> }
>
>> +}
>> +
>>   static void wait_for_trace_records_available(void)
>>   {
>>      g_static_mutex_lock(&trace_lock);
>> @@ -120,7 +148,7 @@ static void wait_for_trace_records_available(void)
>>
>>   static gpointer writeout_thread(gpointer opaque)
>>   {
>> -    TraceRecord record;
>> +    TraceRecord record, *recordptr;
>>      unsigned int writeout_idx = 0;
>>      unsigned int num_available, idx;
>>      size_t unused __attribute__ ((unused));
>> @@ -130,19 +158,20 @@ static gpointer writeout_thread(gpointer opaque)
>>
>>          num_available = trace_idx - writeout_idx;
>>          if (num_available>  TRACE_BUF_LEN) {
>> -            record = (TraceRecord){
>> -                .event = DROPPED_EVENT_ID,
>> -                .x1 = num_available,
>> -            };
>> -            unused = fwrite(&record, sizeof(record), 1, trace_fp);
>> +            record.event = DROPPED_EVENT_ID,
>> +            record.length = num_available,
>
> Please don't use .length for the number of bytes dropped because this
> makes life harder for parsing scripts like simpletrace.py.  They would
> need a special case to check for DROPPED_EVENT_ID and then treat
> .length as an argument.  Instead the number of bytes should be an
> argument.
>
> Also, by removing the struct literal assignment you lose the automatic
> zeroing of fields.  We'll write out junk data - it would be nicer to
> clear it.
>
>> +            unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
>>              writeout_idx += num_available;
>>          }
>>
>>          idx = writeout_idx % TRACE_BUF_LEN;
>> -        while (get_trace_record(idx,&record)) {
>> -            trace_buf[idx].event = 0; /* clear valid bit */
>> -            unused = fwrite(&record, sizeof(record), 1, trace_fp);
>> -            idx = ++writeout_idx % TRACE_BUF_LEN;
>> +        while (get_trace_record(idx,&recordptr)) {
>> +            unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
>> +            writeout_idx += recordptr->length;
>> +            g_free(recordptr);
>> +            recordptr = (TraceRecord *)&trace_buf[idx];
>> +            recordptr->event = 0;
>> +            idx = writeout_idx % TRACE_BUF_LEN;
>>          }
>
> I'm wondering if it's worth using a different approach here.  Writing
> out individual records has bothered me.
>
> If we have a second buffer, as big as trace_buf[], then a function can
> copy out all records and make them available in trace_buf again:
>
> /**
>   * Copy completed trace records out of the ring buffer
>   *
>   * @idx    offset into trace_buf[]
>   * @buf    destination buffer
>   * @len    size of destination buffer
>   * @ret    the number of bytes consumed
>   */
> size_t consume_trace_records(unsigned int idx, void *buf, size_t len);
>
> That means consume gobbles up as many records as it can:
>   * Until it reaches an invalid record which has not been written yet
>   * Until it reaches the end of trace_buf[], the caller can call again
> with idx wrapped to 0
>
> After copying into buf[] it clears the valid bit in trace_buf[].
>
> Then the loop which calls consume_trace_records() does a single
> fwrite(3) and increments idx/writeout_idx.
>
> The advantage to this is that we write many records in one go and I
> think it splits up the writeout steps a little nicer than what we've
> previously done.
>

I think this optimization can be introduced as a separate patch later.
Let me know if you think otherwise.

>>
>>          fflush(trace_fp);
>> @@ -231,7 +196,7 @@ void st_set_trace_file_enabled(bool enable)
>>          static const TraceRecord header = {
>>              .event = HEADER_EVENT_ID,
>>              .timestamp_ns = HEADER_MAGIC,
>> -            .x1 = HEADER_VERSION,
>> +            .length = HEADER_VERSION,
>
> Hmm...this is kind of ugly (see comment about using .length above) but
> in this case most parsers will have a special-case anyway to check the
> magic number.  We need to use the .length field because historically
> that's where the version is located.
>

So, lets keep the version here only, right ?

- Harsh

>> +/* Interfaces for simpletrace v2 */
>
> Please drop this comment.  All trace/simple.[ch] code is for v2, this
> makes it seems like we support both versions at the same time.
>
>> +
>> +/** Trace buffer entry */
>> +typedef struct {
>> +    uint64_t event; /*  TraceEventID */
>> +    uint64_t timestamp_ns;
>> +    uint32_t length;   /*   in bytes */
>> +    uint32_t reserved; /*   unused */
>> +    uint8_t arguments[]; /* arguments position affects ST_V2_REC_HDR_LEN */
>> +} TraceRecord;
>
> Does this need to be visible?  I think it can live in trace/simple.c
>
>> +
>> +#define ST_V2_REC_HDR_LEN 24
>
> Does not need to be visible.  Also why calculate manually, instead of
> a macro I would use sizeof(TraceRecord) (the array without elements is
> not counted by sizeof).
>
>> +
>> +enum {
>> +    TRACE_BUF_LEN = 4096 * 64,
>> +    TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
>> +};
>
> Does not need to be visible.
>
>> +
>> +extern uint8_t trace_buf[TRACE_BUF_LEN];
>
> Does not need to be visible.
>
Stefan Hajnoczi Jan. 18, 2012, 10:31 a.m. UTC | #3
On Wed, Jan 18, 2012 at 9:14 AM, Harsh Bora <harsh@linux.vnet.ibm.com> wrote:
> On 01/10/2012 10:11 PM, Stefan Hajnoczi wrote:
>>> +            unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
>>>             writeout_idx += num_available;
>>>         }
>>>
>>>         idx = writeout_idx % TRACE_BUF_LEN;
>>> -        while (get_trace_record(idx,&record)) {
>>> -            trace_buf[idx].event = 0; /* clear valid bit */
>>> -            unused = fwrite(&record, sizeof(record), 1, trace_fp);
>>> -            idx = ++writeout_idx % TRACE_BUF_LEN;
>>> +        while (get_trace_record(idx,&recordptr)) {
>>> +            unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
>>> +            writeout_idx += recordptr->length;
>>> +            g_free(recordptr);
>>> +            recordptr = (TraceRecord *)&trace_buf[idx];
>>>
>>> +            recordptr->event = 0;
>>> +            idx = writeout_idx % TRACE_BUF_LEN;
>>>         }
>>
>>
>> I'm wondering if it's worth using a different approach here.  Writing
>> out individual records has bothered me.
>>
>> If we have a second buffer, as big as trace_buf[], then a function can
>> copy out all records and make them available in trace_buf again:
>>
>> /**
>>  * Copy completed trace records out of the ring buffer
>>  *
>>  * @idx    offset into trace_buf[]
>>  * @buf    destination buffer
>>  * @len    size of destination buffer
>>  * @ret    the number of bytes consumed
>>  */
>> size_t consume_trace_records(unsigned int idx, void *buf, size_t len);
>>
>> That means consume gobbles up as many records as it can:
>>  * Until it reaches an invalid record which has not been written yet
>>  * Until it reaches the end of trace_buf[], the caller can call again
>> with idx wrapped to 0
>>
>> After copying into buf[] it clears the valid bit in trace_buf[].
>>
>> Then the loop which calls consume_trace_records() does a single
>> fwrite(3) and increments idx/writeout_idx.
>>
>> The advantage to this is that we write many records in one go and I
>> think it splits up the writeout steps a little nicer than what we've
>> previously done.
>>
>
> I think this optimization can be introduced as a separate patch later.
> Let me know if you think otherwise.

Yes, that could be done later.  However there is something incorrect
here.  Threads will continue to write trace records into the
ringbuffer while the write-out thread is doing I/O.  Think about what
happens when threads overtake the write-out index modulo ringbuffer
size.  Since records are variable-length the write-out thread's next
index could point into the middle of an overwritten record.  And that
means the ->length field is junk - we may crash if we use it.

>>>
>>>         fflush(trace_fp);
>>> @@ -231,7 +196,7 @@ void st_set_trace_file_enabled(bool enable)
>>>         static const TraceRecord header = {
>>>             .event = HEADER_EVENT_ID,
>>>             .timestamp_ns = HEADER_MAGIC,
>>> -            .x1 = HEADER_VERSION,
>>> +            .length = HEADER_VERSION,
>>
>>
>> Hmm...this is kind of ugly (see comment about using .length above) but
>> in this case most parsers will have a special-case anyway to check the
>> magic number.  We need to use the .length field because historically
>> that's where the version is located.
>>
>
> So, lets keep the version here only, right ?

Yes, it's necessary to do .length = HEADER_VERSION.

Stefan
Harsh Prateek Bora Jan. 18, 2012, 10:41 a.m. UTC | #4
On 01/18/2012 04:01 PM, Stefan Hajnoczi wrote:
> On Wed, Jan 18, 2012 at 9:14 AM, Harsh Bora<harsh@linux.vnet.ibm.com>  wrote:
>> On 01/10/2012 10:11 PM, Stefan Hajnoczi wrote:
>>>> +            unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
>>>>              writeout_idx += num_available;
>>>>          }
>>>>
>>>>          idx = writeout_idx % TRACE_BUF_LEN;
>>>> -        while (get_trace_record(idx,&record)) {
>>>> -            trace_buf[idx].event = 0; /* clear valid bit */
>>>> -            unused = fwrite(&record, sizeof(record), 1, trace_fp);
>>>> -            idx = ++writeout_idx % TRACE_BUF_LEN;
>>>> +        while (get_trace_record(idx,&recordptr)) {
>>>> +            unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
>>>> +            writeout_idx += recordptr->length;
>>>> +            g_free(recordptr);
>>>> +            recordptr = (TraceRecord *)&trace_buf[idx];
>>>>
>>>> +            recordptr->event = 0;
>>>> +            idx = writeout_idx % TRACE_BUF_LEN;
>>>>          }
>>>
>>>
>>> I'm wondering if it's worth using a different approach here.  Writing
>>> out individual records has bothered me.
>>>
>>> If we have a second buffer, as big as trace_buf[], then a function can
>>> copy out all records and make them available in trace_buf again:
>>>
>>> /**
>>>   * Copy completed trace records out of the ring buffer
>>>   *
>>>   * @idx    offset into trace_buf[]
>>>   * @buf    destination buffer
>>>   * @len    size of destination buffer
>>>   * @ret    the number of bytes consumed
>>>   */
>>> size_t consume_trace_records(unsigned int idx, void *buf, size_t len);
>>>
>>> That means consume gobbles up as many records as it can:
>>>   * Until it reaches an invalid record which has not been written yet
>>>   * Until it reaches the end of trace_buf[], the caller can call again
>>> with idx wrapped to 0
>>>
>>> After copying into buf[] it clears the valid bit in trace_buf[].
>>>
>>> Then the loop which calls consume_trace_records() does a single
>>> fwrite(3) and increments idx/writeout_idx.
>>>
>>> The advantage to this is that we write many records in one go and I
>>> think it splits up the writeout steps a little nicer than what we've
>>> previously done.
>>>
>>
>> I think this optimization can be introduced as a separate patch later.
>> Let me know if you think otherwise.
>
> Yes, that could be done later.  However there is something incorrect
> here.  Threads will continue to write trace records into the
> ringbuffer while the write-out thread is doing I/O.  Think about what
> happens when threads overtake the write-out index modulo ringbuffer
> size.  Since records are variable-length the write-out thread's next
> index could point into the middle of an overwritten record.  And that
> means the ->length field is junk - we may crash if we use it.

In case of overwritten records, the valid bit of event id will also be 
overwritten, and therefore we will not consider that record. Moreover, 
the writeout thread will further get to know that some events were 
dropped and will start with the latest trace_idx, right ?

However, to handle the extreme rare case of having an overwritten value 
such that its valid bit appears to be set, we can put a check for <
NR_TRACE_EVENTS. Even better to have a magic byte for events also ?

Harsh

>
>>>>
>>>>          fflush(trace_fp);
>>>> @@ -231,7 +196,7 @@ void st_set_trace_file_enabled(bool enable)
>>>>          static const TraceRecord header = {
>>>>              .event = HEADER_EVENT_ID,
>>>>              .timestamp_ns = HEADER_MAGIC,
>>>> -            .x1 = HEADER_VERSION,
>>>> +            .length = HEADER_VERSION,
>>>
>>>
>>> Hmm...this is kind of ugly (see comment about using .length above) but
>>> in this case most parsers will have a special-case anyway to check the
>>> magic number.  We need to use the .length field because historically
>>> that's where the version is located.
>>>
>>
>> So, lets keep the version here only, right ?
>
> Yes, it's necessary to do .length = HEADER_VERSION.
>
> Stefan
>
Harsh Prateek Bora Jan. 18, 2012, 10:52 a.m. UTC | #5
On 01/18/2012 04:11 PM, Harsh Bora wrote:
> On 01/18/2012 04:01 PM, Stefan Hajnoczi wrote:
>> On Wed, Jan 18, 2012 at 9:14 AM, Harsh Bora<harsh@linux.vnet.ibm.com>
>> wrote:
>>> On 01/10/2012 10:11 PM, Stefan Hajnoczi wrote:
>>>>> + unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
>>>>> writeout_idx += num_available;
>>>>> }
>>>>>
>>>>> idx = writeout_idx % TRACE_BUF_LEN;
>>>>> - while (get_trace_record(idx,&record)) {
>>>>> - trace_buf[idx].event = 0; /* clear valid bit */
>>>>> - unused = fwrite(&record, sizeof(record), 1, trace_fp);
>>>>> - idx = ++writeout_idx % TRACE_BUF_LEN;
>>>>> + while (get_trace_record(idx,&recordptr)) {
>>>>> + unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
>>>>> + writeout_idx += recordptr->length;
>>>>> + g_free(recordptr);
>>>>> + recordptr = (TraceRecord *)&trace_buf[idx];
>>>>>
>>>>> + recordptr->event = 0;
>>>>> + idx = writeout_idx % TRACE_BUF_LEN;
>>>>> }
>>>>
>>>>
>>>> I'm wondering if it's worth using a different approach here. Writing
>>>> out individual records has bothered me.
>>>>
>>>> If we have a second buffer, as big as trace_buf[], then a function can
>>>> copy out all records and make them available in trace_buf again:
>>>>
>>>> /**
>>>> * Copy completed trace records out of the ring buffer
>>>> *
>>>> * @idx offset into trace_buf[]
>>>> * @buf destination buffer
>>>> * @len size of destination buffer
>>>> * @ret the number of bytes consumed
>>>> */
>>>> size_t consume_trace_records(unsigned int idx, void *buf, size_t len);
>>>>
>>>> That means consume gobbles up as many records as it can:
>>>> * Until it reaches an invalid record which has not been written yet
>>>> * Until it reaches the end of trace_buf[], the caller can call again
>>>> with idx wrapped to 0
>>>>
>>>> After copying into buf[] it clears the valid bit in trace_buf[].
>>>>
>>>> Then the loop which calls consume_trace_records() does a single
>>>> fwrite(3) and increments idx/writeout_idx.
>>>>
>>>> The advantage to this is that we write many records in one go and I
>>>> think it splits up the writeout steps a little nicer than what we've
>>>> previously done.
>>>>
>>>
>>> I think this optimization can be introduced as a separate patch later.
>>> Let me know if you think otherwise.
>>
>> Yes, that could be done later. However there is something incorrect
>> here. Threads will continue to write trace records into the
>> ringbuffer while the write-out thread is doing I/O. Think about what
>> happens when threads overtake the write-out index modulo ringbuffer
>> size. Since records are variable-length the write-out thread's next
>> index could point into the middle of an overwritten record. And that
>> means the ->length field is junk - we may crash if we use it.
>
> In case of overwritten records, the valid bit of event id will also be
> overwritten, and therefore we will not consider that record. Moreover,
> the writeout thread will further get to know that some events were
> dropped and will start with the latest trace_idx, right ?
>
> However, to handle the extremely rare case of an overwritten value
> whose valid bit appears to be set, we can add a check for <
> NR_TRACE_EVENTS. Even better, should we have a magic byte for events too?
>
> Harsh

Also, I just realized that we need to put buffer boundary checks while 
copying a trace record:

 >> -static bool get_trace_record(unsigned int idx, TraceRecord *record)
 >> +static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
 >>   {
 >> -    if (!(trace_buf[idx].event & TRACE_RECORD_VALID)) {
 >> +    TraceRecord *record = (TraceRecord *)&trace_buf[idx];
 >> +    if (!(record->event & TRACE_RECORD_VALID)) {
 >>          return false;
 >>      }
 >>
 >>      __sync_synchronize(); /* read memory barrier before accessing 
record */
 >>
 >> -    *record = trace_buf[idx];
 >> -    record->event &= ~TRACE_RECORD_VALID;
 >> +    *recordptr = g_malloc(record->length);
 >> +    /* make a copy of record to avoid being overwritten */
 >> +    memcpy(*recordptr, record, record->length);

So, I need to use a wrapper that copies byte by byte, taking care of 
buffer-boundary wrap-around, and that will also save us from crashing 
(since we will never read past the buffer boundaries), right?

Harsh

 >> +    (*recordptr)->event &= ~TRACE_RECORD_VALID;
 >>      return true;
 >>   }



>
>>
>>>>>
>>>>> fflush(trace_fp);
>>>>> @@ -231,7 +196,7 @@ void st_set_trace_file_enabled(bool enable)
>>>>> static const TraceRecord header = {
>>>>> .event = HEADER_EVENT_ID,
>>>>> .timestamp_ns = HEADER_MAGIC,
>>>>> - .x1 = HEADER_VERSION,
>>>>> + .length = HEADER_VERSION,
>>>>
>>>>
>>>> Hmm...this is kind of ugly (see comment about using .length above) but
>>>> in this case most parsers will have a special-case anyway to check the
>>>> magic number. We need to use the .length field because historically
>>>> that's where the version is located.
>>>>
>>>
>>> So, lets keep the version here only, right ?
>>
>> Yes, it's necessary to do .length = HEADER_VERSION.
>>
>> Stefan
>>
>
>
Stefan Hajnoczi Jan. 18, 2012, 10:59 a.m. UTC | #6
On Wed, Jan 18, 2012 at 10:52 AM, Harsh Bora <harsh@linux.vnet.ibm.com> wrote:
> On 01/18/2012 04:11 PM, Harsh Bora wrote:
>>
>> On 01/18/2012 04:01 PM, Stefan Hajnoczi wrote:
>>>
>>> On Wed, Jan 18, 2012 at 9:14 AM, Harsh Bora<harsh@linux.vnet.ibm.com>
>>> wrote:
>>>>
>>>> On 01/10/2012 10:11 PM, Stefan Hajnoczi wrote:
>>>>>>
>>>>>> + unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
>>>>>> writeout_idx += num_available;
>>>>>> }
>>>>>>
>>>>>> idx = writeout_idx % TRACE_BUF_LEN;
>>>>>> - while (get_trace_record(idx,&record)) {
>>>>>> - trace_buf[idx].event = 0; /* clear valid bit */
>>>>>> - unused = fwrite(&record, sizeof(record), 1, trace_fp);
>>>>>> - idx = ++writeout_idx % TRACE_BUF_LEN;
>>>>>> + while (get_trace_record(idx,&recordptr)) {
>>>>>> + unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
>>>>>> + writeout_idx += recordptr->length;
>>>>>> + g_free(recordptr);
>>>>>> + recordptr = (TraceRecord *)&trace_buf[idx];
>>>>>>
>>>>>> + recordptr->event = 0;
>>>>>> + idx = writeout_idx % TRACE_BUF_LEN;
>>>>>> }
>>>>>
>>>>>
>>>>>
>>>>> I'm wondering if it's worth using a different approach here. Writing
>>>>> out individual records has bothered me.
>>>>>
>>>>> If we have a second buffer, as big as trace_buf[], then a function can
>>>>> copy out all records and make them available in trace_buf again:
>>>>>
>>>>> /**
>>>>> * Copy completed trace records out of the ring buffer
>>>>> *
>>>>> * @idx offset into trace_buf[]
>>>>> * @buf destination buffer
>>>>> * @len size of destination buffer
>>>>> * @ret the number of bytes consumed
>>>>> */
>>>>> size_t consume_trace_records(unsigned int idx, void *buf, size_t len);
>>>>>
>>>>> That means consume gobbles up as many records as it can:
>>>>> * Until it reaches an invalid record which has not been written yet
>>>>> * Until it reaches the end of trace_buf[], the caller can call again
>>>>> with idx wrapped to 0
>>>>>
>>>>> After copying into buf[] it clears the valid bit in trace_buf[].
>>>>>
>>>>> Then the loop which calls consume_trace_records() does a single
>>>>> fwrite(3) and increments idx/writeout_idx.
>>>>>
>>>>> The advantage to this is that we write many records in one go and I
>>>>> think it splits up the writeout steps a little nicer than what we've
>>>>> previously done.
>>>>>
>>>>
>>>> I think this optimization can be introduced as a separate patch later.
>>>> Let me know if you think otherwise.
>>>
>>>
>>> Yes, that could be done later. However there is something incorrect
>>> here. Threads will continue to write trace records into the
>>> ringbuffer while the write-out thread is doing I/O. Think about what
>>> happens when threads overtake the write-out index modulo ringbuffer
>>> size. Since records are variable-length the write-out thread's next
>>> index could point into the middle of an overwritten record. And that
>>> means the ->length field is junk - we may crash if we use it.
>>
>>
>> In case of overwritten records, the valid bit of event id will also be
>> overwritten, and therefore we will not consider that record. Moreover,
>> the writeout thread will further get to know that some events were
>> dropped and will start with the latest trace_idx, right ?
>>
>> However, to handle the extreme rare case of having an overwritten value
>> such that its valid bit appears to be set, we can put a check for <
>> NR_TRACE_EVENTS. Even better to have a magic byte for events also ?
>>
>> Harsh
>
>
> Also, I just realized that we need to put buffer boundary checks while
> copying a trace record:
>
>
>>> -static bool get_trace_record(unsigned int idx, TraceRecord *record)
>>> +static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
>>>   {
>>> -    if (!(trace_buf[idx].event&  TRACE_RECORD_VALID)) {
>>> +    TraceRecord *record = (TraceRecord *)&trace_buf[idx];
>>> +    if (!(record->event&  TRACE_RECORD_VALID)) {
>>>          return false;
>>>      }
>>>
>>>      __sync_synchronize(); /* read memory barrier before accessing record
>>> */
>>>
>>> -    *record = trace_buf[idx];
>>> -    record->event&= ~TRACE_RECORD_VALID;
>>> +    *recordptr = g_malloc(record->length);
>>> +    /* make a copy of record to avoid being overwritten */
>>> +    memcpy(*recordptr, record, record->length);
>
> So, I need to use a wrapper which should copy byte by byte taking care of
> boundary overlaps, and that will also save us from crashing (as we will
> never be crossing buffer boundaries), right ?

It won't be enough if length has a junk value like 2^32 - 1.  We don't
want to allocate/copy 4 GB :).

Stefan
Harsh Prateek Bora Jan. 18, 2012, 11:09 a.m. UTC | #7
On 01/18/2012 04:29 PM, Stefan Hajnoczi wrote:
> On Wed, Jan 18, 2012 at 10:52 AM, Harsh Bora<harsh@linux.vnet.ibm.com>  wrote:
>> On 01/18/2012 04:11 PM, Harsh Bora wrote:
>>>
>>> On 01/18/2012 04:01 PM, Stefan Hajnoczi wrote:
>>>>
>>>> On Wed, Jan 18, 2012 at 9:14 AM, Harsh Bora<harsh@linux.vnet.ibm.com>
>>>> wrote:
>>>>>
>>>>> On 01/10/2012 10:11 PM, Stefan Hajnoczi wrote:
>>>>>>>
>>>>>>> + unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
>>>>>>> writeout_idx += num_available;
>>>>>>> }
>>>>>>>
>>>>>>> idx = writeout_idx % TRACE_BUF_LEN;
>>>>>>> - while (get_trace_record(idx,&record)) {
>>>>>>> - trace_buf[idx].event = 0; /* clear valid bit */
>>>>>>> - unused = fwrite(&record, sizeof(record), 1, trace_fp);
>>>>>>> - idx = ++writeout_idx % TRACE_BUF_LEN;
>>>>>>> + while (get_trace_record(idx,&recordptr)) {
>>>>>>> + unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
>>>>>>> + writeout_idx += recordptr->length;
>>>>>>> + g_free(recordptr);
>>>>>>> + recordptr = (TraceRecord *)&trace_buf[idx];
>>>>>>>
>>>>>>> + recordptr->event = 0;
>>>>>>> + idx = writeout_idx % TRACE_BUF_LEN;
>>>>>>> }
>>>>>>
>>>>>>
>>>>>>
>>>>>> I'm wondering if it's worth using a different approach here. Writing
>>>>>> out individual records has bothered me.
>>>>>>
>>>>>> If we have a second buffer, as big as trace_buf[], then a function can
>>>>>> copy out all records and make them available in trace_buf again:
>>>>>>
>>>>>> /**
>>>>>> * Copy completed trace records out of the ring buffer
>>>>>> *
>>>>>> * @idx offset into trace_buf[]
>>>>>> * @buf destination buffer
>>>>>> * @len size of destination buffer
>>>>>> * @ret the number of bytes consumed
>>>>>> */
>>>>>> size_t consume_trace_records(unsigned int idx, void *buf, size_t len);
>>>>>>
>>>>>> That means consume gobbles up as many records as it can:
>>>>>> * Until it reaches an invalid record which has not been written yet
>>>>>> * Until it reaches the end of trace_buf[], the caller can call again
>>>>>> with idx wrapped to 0
>>>>>>
>>>>>> After copying into buf[] it clears the valid bit in trace_buf[].
>>>>>>
>>>>>> Then the loop which calls consume_trace_records() does a single
>>>>>> fwrite(3) and increments idx/writeout_idx.
>>>>>>
>>>>>> The advantage to this is that we write many records in one go and I
>>>>>> think it splits up the writeout steps a little nicer than what we've
>>>>>> previously done.
>>>>>>
>>>>>
>>>>> I think this optimization can be introduced as a separate patch later.
>>>>> Let me know if you think otherwise.
>>>>
>>>>
>>>> Yes, that could be done later. However there is something incorrect
>>>> here. Threads will continue to write trace records into the
>>>> ringbuffer while the write-out thread is doing I/O. Think about what
>>>> happens when threads overtake the write-out index modulo ringbuffer
>>>> size. Since records are variable-length the write-out thread's next
>>>> index could point into the middle of an overwritten record. And that
>>>> means the ->length field is junk - we may crash if we use it.
>>>
>>>
>>> In case of overwritten records, the valid bit of event id will also be
>>> overwritten, and therefore we will not consider that record. Moreover,
>>> the writeout thread will further get to know that some events were
>>> dropped and will start with the latest trace_idx, right ?
>>>
>>> However, to handle the extreme rare case of having an overwritten value
>>> such that its valid bit appears to be set, we can put a check for<
>>> NR_TRACE_EVENTS. Even better to have a magic byte for events also ?
>>>
>>> Harsh
>>
>>
>> Also, I just realized that we need to put buffer boundary checks while
>> copying a trace record:
>>
>>
>>>> -static bool get_trace_record(unsigned int idx, TraceRecord *record)
>>>> +static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
>>>>    {
>>>> -    if (!(trace_buf[idx].event&    TRACE_RECORD_VALID)) {
>>>> +    TraceRecord *record = (TraceRecord *)&trace_buf[idx];
>>>> +    if (!(record->event&    TRACE_RECORD_VALID)) {
>>>>           return false;
>>>>       }
>>>>
>>>>       __sync_synchronize(); /* read memory barrier before accessing record
>>>> */
>>>>
>>>> -    *record = trace_buf[idx];
>>>> -    record->event&= ~TRACE_RECORD_VALID;
>>>> +    *recordptr = g_malloc(record->length);
>>>> +    /* make a copy of record to avoid being overwritten */
>>>> +    memcpy(*recordptr, record, record->length);
>>
>> So, I need to use a wrapper which should copy byte by byte taking care of
>> boundary overlaps, and that will also save us from crashing (as we will
>> never be crossing buffer boundaries), right ?
>
> It won't be enough if length has a junk value like 2^32 - 1.  We don't
> want to allocate/copy 4 GB :).

So, what's the maximum limit we should impose on a record's length? 
Obviously less than the buffer length — perhaps TRACE_BUF_LEN / 2?

Hopefully a check against the maximum buffer length, combined with the 
byte-wise copy, will be enough?

Harsh

>
> Stefan
>
diff mbox

Patch

diff --git a/monitor.c b/monitor.c
index ffda0fe..b6f85d1 100644
--- a/monitor.c
+++ b/monitor.c
@@ -945,7 +945,7 @@  static void do_info_cpu_stats(Monitor *mon)
 #if defined(CONFIG_TRACE_SIMPLE)
 static void do_info_trace(Monitor *mon)
 {
-    st_print_trace((FILE *)mon, &monitor_fprintf);
+    /* FIXME: st_print_trace((FILE *)mon, &monitor_fprintf); */
 }
 #endif
 
diff --git a/trace/simple.c b/trace/simple.c
index b639dda..65cafe2 100644
--- a/trace/simple.c
+++ b/trace/simple.c
@@ -27,7 +27,7 @@ 
 #define HEADER_MAGIC 0xf2b177cb0aa429b4ULL
 
 /** Trace file version number, bump if format changes */
-#define HEADER_VERSION 0
+#define HEADER_VERSION 2
 
 /** Records were dropped event ID */
 #define DROPPED_EVENT_ID (~(uint64_t)0 - 1)
@@ -35,23 +35,6 @@ 
 /** Trace record is valid */
 #define TRACE_RECORD_VALID ((uint64_t)1 << 63)
 
-/** Trace buffer entry */
-typedef struct {
-    uint64_t event;
-    uint64_t timestamp_ns;
-    uint64_t x1;
-    uint64_t x2;
-    uint64_t x3;
-    uint64_t x4;
-    uint64_t x5;
-    uint64_t x6;
-} TraceRecord;
-
-enum {
-    TRACE_BUF_LEN = 4096,
-    TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
-};
-
 /*
  * Trace records are written out by a dedicated thread.  The thread waits for
  * records to become available, writes them out, and then waits again.
@@ -62,11 +45,12 @@  static GCond *trace_empty_cond;
 static bool trace_available;
 static bool trace_writeout_enabled;
 
-static TraceRecord trace_buf[TRACE_BUF_LEN];
+uint8_t trace_buf[TRACE_BUF_LEN];
 static unsigned int trace_idx;
 static FILE *trace_fp;
 static char *trace_file_name = NULL;
 
+
 /**
  * Read a trace record from the trace buffer
  *
@@ -75,16 +59,19 @@  static char *trace_file_name = NULL;
  *
  * Returns false if the record is not valid.
  */
-static bool get_trace_record(unsigned int idx, TraceRecord *record)
+static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
 {
-    if (!(trace_buf[idx].event & TRACE_RECORD_VALID)) {
+    TraceRecord *record = (TraceRecord *) &trace_buf[idx];
+    if (!(record->event & TRACE_RECORD_VALID)) {
         return false;
     }
 
     __sync_synchronize(); /* read memory barrier before accessing record */
 
-    *record = trace_buf[idx];
-    record->event &= ~TRACE_RECORD_VALID;
+    *recordptr = g_malloc(record->length);
+    /* make a copy of record to avoid being overwritten */
+    memcpy(*recordptr, record, record->length);
+    (*recordptr)->event &= ~TRACE_RECORD_VALID;
     return true;
 }
 
@@ -106,6 +93,47 @@  static void flush_trace_file(bool wait)
     g_static_mutex_unlock(&trace_lock);
 }
 
+unsigned int trace_alloc_record(TraceEventID event, uint32_t datasize)
+{
+    unsigned int idx, rec_off;
+    uint32_t rec_len = ST_V2_REC_HDR_LEN + datasize;
+    uint64_t timestamp_ns = get_clock();
+
+    idx = g_atomic_int_exchange_and_add((gint *)&trace_idx, rec_len) % TRACE_BUF_LEN;
+    rec_off = idx;
+    write_to_buffer(rec_off, (uint8_t*)&event, sizeof(event));
+    rec_off += sizeof(event);
+    write_to_buffer(rec_off, (uint8_t*)&timestamp_ns, sizeof(timestamp_ns));
+    rec_off += sizeof(timestamp_ns);
+    write_to_buffer(rec_off, (uint8_t*)&rec_len, sizeof(rec_len));
+    rec_off += sizeof(rec_len);
+    return idx;
+}
+
+void write_to_buffer(unsigned int idx, uint8_t *dataptr, uint32_t size)
+{
+    uint32_t x = 0;
+    while (x < size)
+    {
+        if (idx >= TRACE_BUF_LEN) {
+            idx = idx % TRACE_BUF_LEN;
+        }
+        trace_buf[idx++] = dataptr[x++];
+    }
+}
+
+void trace_mark_record_complete(unsigned int idx)
+{
+    TraceRecord *record = (TraceRecord*) &trace_buf[idx];
+
+    __sync_synchronize(); /* write barrier before marking as valid */
+    record->event |= TRACE_RECORD_VALID;
+
+    if (idx > TRACE_BUF_FLUSH_THRESHOLD) {
+        flush_trace_file(false);
+    }
+}
+
 static void wait_for_trace_records_available(void)
 {
     g_static_mutex_lock(&trace_lock);
@@ -120,7 +148,7 @@  static void wait_for_trace_records_available(void)
 
 static gpointer writeout_thread(gpointer opaque)
 {
-    TraceRecord record;
+    TraceRecord record, *recordptr;
     unsigned int writeout_idx = 0;
     unsigned int num_available, idx;
     size_t unused __attribute__ ((unused));
@@ -130,19 +158,20 @@  static gpointer writeout_thread(gpointer opaque)
 
         num_available = trace_idx - writeout_idx;
         if (num_available > TRACE_BUF_LEN) {
-            record = (TraceRecord){
-                .event = DROPPED_EVENT_ID,
-                .x1 = num_available,
-            };
-            unused = fwrite(&record, sizeof(record), 1, trace_fp);
+            record.event = DROPPED_EVENT_ID,
+            record.length = num_available,
+            unused = fwrite(&record, ST_V2_REC_HDR_LEN, 1, trace_fp);
             writeout_idx += num_available;
         }
 
         idx = writeout_idx % TRACE_BUF_LEN;
-        while (get_trace_record(idx, &record)) {
-            trace_buf[idx].event = 0; /* clear valid bit */
-            unused = fwrite(&record, sizeof(record), 1, trace_fp);
-            idx = ++writeout_idx % TRACE_BUF_LEN;
+        while (get_trace_record(idx, &recordptr)) {
+            unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
+            writeout_idx += recordptr->length;
+            g_free(recordptr);
+            recordptr = (TraceRecord *) &trace_buf[idx];
+            recordptr->event = 0;
+            idx = writeout_idx % TRACE_BUF_LEN;
         }
 
         fflush(trace_fp);
@@ -150,71 +179,7 @@  static gpointer writeout_thread(gpointer opaque)
     return NULL;
 }
 
-static void trace(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3,
-                  uint64_t x4, uint64_t x5, uint64_t x6)
-{
-    unsigned int idx;
-    uint64_t timestamp;
-
-    if (!trace_list[event].state) {
-        return;
-    }
-
-    timestamp = get_clock();
-
-    idx = g_atomic_int_exchange_and_add((gint *)&trace_idx, 1) % TRACE_BUF_LEN;
-    trace_buf[idx] = (TraceRecord){
-        .event = event,
-        .timestamp_ns = timestamp,
-        .x1 = x1,
-        .x2 = x2,
-        .x3 = x3,
-        .x4 = x4,
-        .x5 = x5,
-        .x6 = x6,
-    };
-    __sync_synchronize(); /* write barrier before marking as valid */
-    trace_buf[idx].event |= TRACE_RECORD_VALID;
-
-    if ((idx + 1) % TRACE_BUF_FLUSH_THRESHOLD == 0) {
-        flush_trace_file(false);
-    }
-}
-
-void trace0(TraceEventID event)
-{
-    trace(event, 0, 0, 0, 0, 0, 0);
-}
-
-void trace1(TraceEventID event, uint64_t x1)
-{
-    trace(event, x1, 0, 0, 0, 0, 0);
-}
-
-void trace2(TraceEventID event, uint64_t x1, uint64_t x2)
-{
-    trace(event, x1, x2, 0, 0, 0, 0);
-}
 
-void trace3(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3)
-{
-    trace(event, x1, x2, x3, 0, 0, 0);
-}
-
-void trace4(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4)
-{
-    trace(event, x1, x2, x3, x4, 0, 0);
-}
-
-void trace5(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t x5)
-{
-    trace(event, x1, x2, x3, x4, x5, 0);
-}
-
-void trace6(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t x5, uint64_t x6)
-{
-    trace(event, x1, x2, x3, x4, x5, x6);
-}
 
 void st_set_trace_file_enabled(bool enable)
 {
@@ -231,7 +196,7 @@  void st_set_trace_file_enabled(bool enable)
         static const TraceRecord header = {
             .event = HEADER_EVENT_ID,
             .timestamp_ns = HEADER_MAGIC,
-            .x1 = HEADER_VERSION,
+            .length = HEADER_VERSION,
         };
 
         trace_fp = fopen(trace_file_name, "wb");
@@ -288,23 +253,6 @@  void st_print_trace_file_status(FILE *stream, int (*stream_printf)(FILE *stream,
                   trace_file_name, trace_fp ? "on" : "off");
 }
 
-void st_print_trace(FILE *stream, int (*stream_printf)(FILE *stream, const char *fmt, ...))
-{
-    unsigned int i;
-
-    for (i = 0; i < TRACE_BUF_LEN; i++) {
-        TraceRecord record;
-
-        if (!get_trace_record(i, &record)) {
-            continue;
-        }
-        stream_printf(stream, "Event %" PRIu64 " : %" PRIx64 " %" PRIx64
-                      " %" PRIx64 " %" PRIx64 " %" PRIx64 " %" PRIx64 "\n",
-                      record.event, record.x1, record.x2,
-                      record.x3, record.x4, record.x5,
-                      record.x6);
-    }
-}
 
 void st_flush_trace_buffer(void)
 {
diff --git a/trace/simple.h b/trace/simple.h
index 466e75b..671cbeb 100644
--- a/trace/simple.h
+++ b/trace/simple.h
@@ -22,17 +22,34 @@  typedef struct {
     bool state;
 } TraceEvent;
 
-void trace0(TraceEventID event);
-void trace1(TraceEventID event, uint64_t x1);
-void trace2(TraceEventID event, uint64_t x1, uint64_t x2);
-void trace3(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3);
-void trace4(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4);
-void trace5(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t x5);
-void trace6(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t x5, uint64_t x6);
 void st_print_trace(FILE *stream, fprintf_function stream_printf);
 void st_print_trace_file_status(FILE *stream, fprintf_function stream_printf);
 void st_set_trace_file_enabled(bool enable);
 bool st_set_trace_file(const char *file);
 void st_flush_trace_buffer(void);
 
+/* Interfaces for simpletrace v2 */
+
+/** Trace buffer entry */
+typedef struct {
+    uint64_t event; /*  TraceEventID */
+    uint64_t timestamp_ns;
+    uint32_t length;   /*   in bytes */
+    uint32_t reserved; /*   unused */
+    uint8_t arguments[]; /* arguments position affects ST_V2_REC_HDR_LEN */
+} TraceRecord;
+
+#define ST_V2_REC_HDR_LEN 24
+
+enum {
+    TRACE_BUF_LEN = 4096 * 64,
+    TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
+};
+
+extern uint8_t trace_buf[TRACE_BUF_LEN];
+
+unsigned int trace_alloc_record(TraceEventID event, uint32_t datasize);
+void write_to_buffer(unsigned int idx, uint8_t *dataptr, uint32_t size);
+void trace_mark_record_complete(unsigned int idx);
+
 #endif /* TRACE_SIMPLE_H */