diff mbox

[net-next,v2] packet: tx timestamping on tpacket ring

Message ID 1366408317-16432-1-git-send-email-willemb@google.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Willem de Bruijn April 19, 2013, 9:51 p.m. UTC
v1->v2:
- move sock_tx_timestamp near other sk reads (warm cacheline)
- remove duplicate flush_dcache_page
- enable hardware timestamps reporting using the error queue (not ring)
- use new ktime_to_timespec_cond API

When transmit timestamping is enabled at the socket level, record a
timestamp on packets written to a PACKET_TX_RING. Tx timestamps are
always looped to the application over the socket error queue. Software
timestamps are also written back into the packet frame header in the
packet ring.

Reported-by: Paul Chavent <paul.chavent@onera.fr>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/core/skbuff.c      | 12 ++++++------
 net/packet/af_packet.c | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 6 deletions(-)

Comments

Daniel Borkmann April 20, 2013, 12:33 p.m. UTC | #1
On 04/19/2013 11:51 PM, Willem de Bruijn wrote:
> v1->v2:
> - move sock_tx_timestamp near other sk reads (warm cacheline)
> - remove duplicate flush_dcache_page
> - enable hardware timestamps reporting using the error queue (not ring)
> - use new ktime_to_timespec_cond API

Nitpick: probably this should rather go below the "---" line, so that this
will not show up in the commit message.

> When transmit timestamping is enabled at the socket level, record a
> timestamp on packets written to a PACKET_TX_RING. Tx timestamps are
> always looped to the application over the socket error queue. Software
> timestamps are also written back into the packet frame header in the
> packet ring.
>
> Reported-by: Paul Chavent <paul.chavent@onera.fr>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>   net/core/skbuff.c      | 12 ++++++------
>   net/packet/af_packet.c | 33 +++++++++++++++++++++++++++++++++
>   2 files changed, 39 insertions(+), 6 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 898cf5c..af9185d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3327,12 +3327,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>   	if (!sk)
>   		return;
>
> -	skb = skb_clone(orig_skb, GFP_ATOMIC);
> -	if (!skb)
> -		return;
> -
>   	if (hwtstamps) {
> -		*skb_hwtstamps(skb) =
> +		*skb_hwtstamps(orig_skb) =
>   			*hwtstamps;
>   	} else {
>   		/*
> @@ -3340,9 +3336,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>   		 * so keep the shared tx_flags and only
>   		 * store software time stamp
>   		 */
> -		skb->tstamp = ktime_get_real();
> +		orig_skb->tstamp = ktime_get_real();
>   	}
>
> +	skb = skb_clone(orig_skb, GFP_ATOMIC);
> +	if (!skb)
> +		return;
> +
>   	serr = SKB_EXT_ERR(skb);
>   	memset(serr, 0, sizeof(*serr));
>   	serr->ee.ee_errno = ENOMSG;
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 7e387ff..ec8ea27 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -339,6 +339,37 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
>   	}
>   }
>
> +static void __packet_set_timestamp(struct packet_sock *po, void *frame,
> +				   ktime_t tstamp)
> +{
> +	union tpacket_uhdr h;
> +	struct timespec ts;
> +
> +	if (!ktime_to_timespec_cond(tstamp, &ts) ||
> +	    !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE))
> +		return;

If we currently have the po->tp_tstamp member that was introduced for the RX_RING
part only, using it if possible for the TX_RING part as well would be good, at
least cleaner. Also the documentation [1] should receive a status update on this
feature, otherwise only Paul will use this feature and nobody else.

Not an expert in timestamping, but why can't we also allow hw timestamps but have
to use sw only? Would it be possible to reuse the tpacket_get_timestamp() function
that got recently introduced for this (while keeping sock_tx_timestamp() below on
TX path)?

Thanks.

  [1] Documentation/networking/packet_mmap.txt

> +	h.raw = frame;
> +	switch (po->tp_version) {
> +	case TPACKET_V1:
> +		h.h1->tp_sec = ts.tv_sec;
> +		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
> +		break;
> +	case TPACKET_V2:
> +		h.h2->tp_sec = ts.tv_sec;
> +		h.h2->tp_nsec = ts.tv_nsec;
> +		break;
> +	case TPACKET_V3:
> +	default:
> +		WARN(1, "TPACKET version not supported.\n");
> +		BUG();
> +	}
> +
> +	/* one flush is safe, as both fields always lie on the same cacheline */
> +	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
> +	smp_wmb();
> +}
> +
>   static void *packet_lookup_frame(struct packet_sock *po,
>   		struct packet_ring_buffer *rb,
>   		unsigned int position,
> @@ -1877,6 +1908,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
>   		ph = skb_shinfo(skb)->destructor_arg;
>   		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
>   		atomic_dec(&po->tx_ring.pending);
> +		__packet_set_timestamp(po, ph, skb->tstamp);
>   		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
>   	}
>
> @@ -1900,6 +1932,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>   	skb->dev = dev;
>   	skb->priority = po->sk.sk_priority;
>   	skb->mark = po->sk.sk_mark;
> +	sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
>   	skb_shinfo(skb)->destructor_arg = ph.raw;
>
>   	switch (po->tp_version) {
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Richard Cochran April 20, 2013, 4:43 p.m. UTC | #2
On Fri, Apr 19, 2013 at 05:51:57PM -0400, Willem de Bruijn wrote:

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 898cf5c..af9185d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3327,12 +3327,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>  	if (!sk)
>  		return;
>  
> -	skb = skb_clone(orig_skb, GFP_ATOMIC);
> -	if (!skb)
> -		return;
> -
>  	if (hwtstamps) {
> -		*skb_hwtstamps(skb) =
> +		*skb_hwtstamps(orig_skb) =
>  			*hwtstamps;

And how does *hwtstamps get into the clone?

>  	} else {
>  		/*
> @@ -3340,9 +3336,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>  		 * so keep the shared tx_flags and only
>  		 * store software time stamp
>  		 */
> -		skb->tstamp = ktime_get_real();
> +		orig_skb->tstamp = ktime_get_real();
>  	}
>  
> +	skb = skb_clone(orig_skb, GFP_ATOMIC);
> +	if (!skb)
> +		return;
> +
>  	serr = SKB_EXT_ERR(skb);
>  	memset(serr, 0, sizeof(*serr));
>  	serr->ee.ee_errno = ENOMSG;

Thanks,
Richard
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 21, 2013, 2:30 a.m. UTC | #3
On Sat, Apr 20, 2013 at 8:33 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 04/19/2013 11:51 PM, Willem de Bruijn wrote:
>>
>> v1->v2:
>> - move sock_tx_timestamp near other sk reads (warm cacheline)
>> - remove duplicate flush_dcache_page
>> - enable hardware timestamps reporting using the error queue (not ring)
>> - use new ktime_to_timespec_cond API
>
>
> Nitpick: probably this should rather go below the "---" line, so that this
> will not show up in the commit message.
>
>
>> When transmit timestamping is enabled at the socket level, record a
>> timestamp on packets written to a PACKET_TX_RING. Tx timestamps are
>> always looped to the application over the socket error queue. Software
>> timestamps are also written back into the packet frame header in the
>> packet ring.
>>
>> Reported-by: Paul Chavent <paul.chavent@onera.fr>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>> ---
>>   net/core/skbuff.c      | 12 ++++++------
>>   net/packet/af_packet.c | 33 +++++++++++++++++++++++++++++++++
>>   2 files changed, 39 insertions(+), 6 deletions(-)
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index 898cf5c..af9185d 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -3327,12 +3327,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>>         if (!sk)
>>                 return;
>>
>> -       skb = skb_clone(orig_skb, GFP_ATOMIC);
>> -       if (!skb)
>> -               return;
>> -
>>         if (hwtstamps) {
>> -               *skb_hwtstamps(skb) =
>> +               *skb_hwtstamps(orig_skb) =
>>                         *hwtstamps;
>>         } else {
>>                 /*
>> @@ -3340,9 +3336,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>>                  * so keep the shared tx_flags and only
>>                  * store software time stamp
>>                  */
>> -               skb->tstamp = ktime_get_real();
>> +               orig_skb->tstamp = ktime_get_real();
>>         }
>>
>> +       skb = skb_clone(orig_skb, GFP_ATOMIC);
>> +       if (!skb)
>> +               return;
>> +
>>         serr = SKB_EXT_ERR(skb);
>>         memset(serr, 0, sizeof(*serr));
>>         serr->ee.ee_errno = ENOMSG;
>> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
>> index 7e387ff..ec8ea27 100644
>> --- a/net/packet/af_packet.c
>> +++ b/net/packet/af_packet.c
>> @@ -339,6 +339,37 @@ static int __packet_get_status(struct packet_sock
>> *po, void *frame)
>>         }
>>   }
>>
>> +static void __packet_set_timestamp(struct packet_sock *po, void *frame,
>> +                                  ktime_t tstamp)
>> +{
>> +       union tpacket_uhdr h;
>> +       struct timespec ts;
>> +
>> +       if (!ktime_to_timespec_cond(tstamp, &ts) ||
>> +           !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE))
>> +               return;
>
>
> If we currently have the po->tp_tstamp member that was introduced for the
> RX_RING
> part only, using it if possible for the TX_RING part as well would be good,
> at
> least cleaner.

Then the caller has to set both the SOL_SOCKET and SOL_PACKET
options. I'm not convinced that that is an improvement over using generic
socket tx timestamping option as is. How would you use it specifically?
To determine whether or not to write the values into the ring, or more
extensively?

> Also the documentation [1] should receive a status update on
> this
> feature, otherwise only Paul will use this feature and nobody else.

Okay. I'll update the documentation. Speaking of which, it would be
great if someone could proofread the packet socket manpage update
patch (http://patchwork.ozlabs.org/patch/228709/)

> Not an expert in timestamping, but why can't we also allow hw timestamps but
> have
> to use sw only?

To avoid getting into the same situation on tx that Richard pointed out
about rx: the ring can only communicate one timestamp and cannot
communicate which one it is. Better to be consistent in that case, I
think. I'm open to discussing this, though.

> Would it be possible to reuse the tpacket_get_timestamp()
> function
> that got recently introduced for this (while keeping sock_tx_timestamp()
> below on
> TX path)?
>
> Thanks.
>
>  [1] Documentation/networking/packet_mmap.txt
>
>
>> +       h.raw = frame;
>> +       switch (po->tp_version) {
>> +       case TPACKET_V1:
>> +               h.h1->tp_sec = ts.tv_sec;
>> +               h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
>> +               break;
>> +       case TPACKET_V2:
>> +               h.h2->tp_sec = ts.tv_sec;
>> +               h.h2->tp_nsec = ts.tv_nsec;
>> +               break;
>> +       case TPACKET_V3:
>> +       default:
>> +               WARN(1, "TPACKET version not supported.\n");
>> +               BUG();
>> +       }
>> +
>> +       /* one flush is safe, as both fields always lie on the same
>> cacheline */
>> +       flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
>> +       smp_wmb();
>> +}
>> +
>>   static void *packet_lookup_frame(struct packet_sock *po,
>>                 struct packet_ring_buffer *rb,
>>                 unsigned int position,
>> @@ -1877,6 +1908,7 @@ static void tpacket_destruct_skb(struct sk_buff
>> *skb)
>>                 ph = skb_shinfo(skb)->destructor_arg;
>>                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
>>                 atomic_dec(&po->tx_ring.pending);
>> +               __packet_set_timestamp(po, ph, skb->tstamp);
>>                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
>>         }
>>
>> @@ -1900,6 +1932,7 @@ static int tpacket_fill_skb(struct packet_sock *po,
>> struct sk_buff *skb,
>>         skb->dev = dev;
>>         skb->priority = po->sk.sk_priority;
>>         skb->mark = po->sk.sk_mark;
>> +       sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
>>         skb_shinfo(skb)->destructor_arg = ph.raw;
>>
>>         switch (po->tp_version) {
>>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 21, 2013, 2:34 a.m. UTC | #4
On Sat, Apr 20, 2013 at 12:43 PM, Richard Cochran
<richardcochran@gmail.com> wrote:
> On Fri, Apr 19, 2013 at 05:51:57PM -0400, Willem de Bruijn wrote:
>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index 898cf5c..af9185d 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -3327,12 +3327,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>>       if (!sk)
>>               return;
>>
>> -     skb = skb_clone(orig_skb, GFP_ATOMIC);
>> -     if (!skb)
>> -             return;
>> -
>>       if (hwtstamps) {
>> -             *skb_hwtstamps(skb) =
>> +             *skb_hwtstamps(orig_skb) =
>>                       *hwtstamps;
>
> And how does *hwtstamps get into the clone?

The struct is part of skb_shared_info, which is stored immediately
after skb->end in the region shared by all clones.

>
>>       } else {
>>               /*
>> @@ -3340,9 +3336,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>>                * so keep the shared tx_flags and only
>>                * store software time stamp
>>                */
>> -             skb->tstamp = ktime_get_real();
>> +             orig_skb->tstamp = ktime_get_real();
>>       }
>>
>> +     skb = skb_clone(orig_skb, GFP_ATOMIC);
>> +     if (!skb)
>> +             return;
>> +
>>       serr = SKB_EXT_ERR(skb);
>>       memset(serr, 0, sizeof(*serr));
>>       serr->ee.ee_errno = ENOMSG;
>
> Thanks,
> Richard
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann April 21, 2013, 10:10 a.m. UTC | #5
On 04/21/2013 04:30 AM, Willem de Bruijn wrote:
> On Sat, Apr 20, 2013 at 8:33 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
[..]
>> On 04/19/2013 11:51 PM, Willem de Bruijn wrote:
>> If we currently have the po->tp_tstamp member that was introduced for the
>> RX_RING
>> part only, using it if possible for the TX_RING part as well would be good,
>> at
>> least cleaner.
>
> Then the caller has to set both the SOL_SOCKET and SOL_PACKET
> options. I'm not convinced that that is an improvement over using generic
> socket tx timestamping option as is. How would you use it specifically?
> To determine whether or not to write the values into the ring, or more
> extensively?

I would use it in combination with tpacket_get_timestamp(). There, we do not
check for SOCK_TIMESTAMPING_SOFTWARE, but just fall back for skb->tstamp if
not null. The tpacket_get_timestamp() would need a small change though, in
that sense that i) getnstimeofday() is moved out of the function and ii) the
function will return a bool, if the ts variable was filled. Thus, in the RX
path if the function returns false, you do getnstimeofday() as fallback, and
in the TX path nothing needs to be done in __packet_set_timestamp() if the skb
was not timestamped.

That said, for the software timestamps, the caller would not need to do more
as with your current patch, for the hardware part, it's then the same afaik
as in the RX_RING.

Just to give some pseudo code (instead of the bool, we could also use an enum
that tells which timestamp was found, but more on that further below):

static bool tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				  unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps) {
		if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
			return true;
		if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
			return true;
	}

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return true;

	return false;
}

In tpacket_rcv():
...
	if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp))
		getnstimeofday(ts);
...

In __packet_set_timestamp():
...
	if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp))
		return;
...

>> Also the documentation [1] should receive a status update on
>> this
>> feature, otherwise only Paul will use this feature and nobody else.
>
> Okay. I'll update the documentation. Speaking of which, it would be
> great if someone could proofread the packet socket manpage update
> patch (http://patchwork.ozlabs.org/patch/228709/)

Ok, I will have a look.

>> Not an expert in timestamping, but why can't we also allow hw timestamps but
>> have
>> to use sw only?
>
> To avoid getting into the same situation on tx that Richard pointed out
> about rx: the ring can only communicate one timestamp and cannot
> communicate which one it is. Better to be consistent in that case, I
> think. I'm open to discussing this, though.

Ok, I agree that it would be very useful to also state what timestamp is
being reported. Ideally, without introducing yet another tpacket version.

The only thing I can currently think of could be to use the tp_status bit
for that, then used by both RX_RING and TX_RING, e.g. ...

	TP_STATUS_TS_SYS_HARDWARE	(1 << 30)
	TP_STATUS_TS_RAW_HARDWARE	(1 << 31)

... although it would be a bit of waste to use 2 bits for that. If none of
them is set, it indicates that the timestamp is in software-only. This we
could do, if you don't have a better idea.

``Better to be consistent in that case'' would not be the current situation
with the patch, right? I mean, for RX_RING, we report one of those 3, for
TX_RING only the sw timestamp. So on one hand we might be consistent, but on
the other hand we're not. ;-) With the above proposal, we could tackle this
and stay consistent then. If you want, I could do that after your revised
patch gets accepted (with hw ts support). Let me know what you think of that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 21, 2013, 4:42 p.m. UTC | #6
On Sun, Apr 21, 2013 at 6:10 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 04/21/2013 04:30 AM, Willem de Bruijn wrote:
>>
>> On Sat, Apr 20, 2013 at 8:33 AM, Daniel Borkmann <dborkman@redhat.com>
>> wrote:
>
> [..]
>>>
>>> On 04/19/2013 11:51 PM, Willem de Bruijn wrote:
>>> If we currently have the po->tp_tstamp member that was introduced for the
>>> RX_RING
>>> part only, using it if possible for the TX_RING part as well would be
>>> good,
>>> at
>>> least cleaner.
>>
>>
>> Then the caller has to set both the SOL_SOCKET and SOL_PACKET
>> options. I'm not convinced that that is an improvement over using generic
>> socket tx timestamping option as is. How would you use it specifically?
>> To determine whether or not to write the values into the ring, or more
>> extensively?
>
>
> I would use it in combination with tpacket_get_timestamp(). There, we do not
> check for SOCK_TIMESTAMPING_SOFTWARE, but just fall back for skb->tstamp if
> not null. The tpacket_get_timestamp() would need a small change though, in
> that sense that i) getnstimeofday() is moved out of the function and ii) the
> function will return a bool, if the ts variable was filled. Thus, in the RX
> path if the function returns false, you do getnstimeofday() as fallback, and
> in the TX path nothing needs to be done in __packet_set_timestamp() if the
> skb
> was not timestamped.
>
> That said, for the software timestamps, the caller would not need to do more
> as with your current patch, for the hardware part, it's then the same afaik
> as in the RX_RING.
>
> Just to give some pseudo code (instead of the bool, we could also use an
> enum
> that tells which timestamp was found, but more on that further below):
>
> static bool tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
>                                   unsigned int flags)
> {
>         struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
>
>         if (shhwtstamps) {
>                 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
>                     ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
>                         return true;
>                 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
>                     ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
>                         return true;
>         }
>
>         if (ktime_to_timespec_cond(skb->tstamp, ts))
>                 return true;
>
>         return false;
> }
>
> In tpacket_rcv():
> ...
>         if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp))
>                 getnstimeofday(ts);
> ...
>
> In __packet_set_timestamp():
> ...
>         if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp))
>                 return;
> ...
>
>
>>> Also the documentation [1] should receive a status update on
>>> this
>>> feature, otherwise only Paul will use this feature and nobody else.
>>
>>
>> Okay. I'll update the documentation. Speaking of which, it would be
>> great if someone could proofread the packet socket manpage update
>> patch (http://patchwork.ozlabs.org/patch/228709/)
>
>
> Ok, I will have a look.

Saw your ack. Thanks, Daniel!
>
>>> Not an expert in timestamping, but why can't we also allow hw timestamps
>>> but
>>> have
>>> to use sw only?
>>
>>
>> To avoid getting into the same situation on tx that Richard pointed out
>> about rx: the ring can only communicate one timestamp and cannot
>> communicate which one it is. Better to be consistent in that case, I
>> think. I'm open to discussing this, though.
>
>
> Ok, I agree that it would be very useful to also state what timestamp is
> being reported. Ideally, without introducing yet another tpacket version.
>
> The only thing I can currently think of could be to use the tp_status bit
> for that, then used by both RX_RING and TX_RING, e.g. ...
>
>         TP_STATUS_TS_SYS_HARDWARE       (1 << 30)
>         TP_STATUS_TS_RAW_HARDWARE       (1 << 31)
>
> ... although it would be a bit of waste to use 2 bits for that. If none of
> them is set, it indicates that the timestamp is in software-only. This we
> could do, if you don't have a better idea.

I like that a lot. Visibility would certainly improve the state on
rx-ring side. One issue with interleaving various kinds of timestamps,
even when clearly labeled in tp_status, is that applications likely
cannot use timestamp series from different sources, as these sequences
are incomparable. Tx has the advantage that time sources can be chosen
per socket independent of all other sockets. The only time when
multiple sources come into play is when a socket has hw timestamps
selected, but the hardware failed to create a tstamp for some
device-specific reason. In that case, generating the sw timestamp may
or may not be helpful to the application. When at least it is clearly
labeled in tp_status, worst case the application will simply ignore
it. If the application only selects software timestamps, the mechanism
works just like what I proposed: always communicate the software
timestamp and do not modify tp_status. So, I'd say this is a clear
win. Since it is a strict superset, how about I send the current patch
as is (with extra documentation) and you submit the hw tstamp support
+ labels separately?

> ``Better to be consistent in that case'' would not be the current situation
> with the patch, right? I mean, for RX_RING, we report one of those 3, for
> TX_RING only the sw timestamp. So on one hand we might be consistent, but on
> the other hand we're not. ;-) With the above proposal, we could tackle this
> and stay consistent then. If you want, I could do that after your revised
> patch gets accepted (with hw ts support). Let me know what you think of
> that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann April 21, 2013, 6:14 p.m. UTC | #7
On 04/21/2013 06:42 PM, Willem de Bruijn wrote:
> win. Since it is a strict superset, how about I send the current patch
> as is (with extra documentation) and you submit the hw tstamp support
> + labels separately?

Ok, we could do that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Chavent April 22, 2013, 8:19 a.m. UTC | #8
On 04/21/2013 06:42 PM, Willem de Bruijn wrote:
> Tx has the advantage that time sources can be chosen
> per socket independent of all other sockets

Sorry if my question is trivial.
I understand that when we require rx timestamping, we need to ask to the 
device to timestamp all incoming packets since we don't know the path of 
the packet in advance.
For tx timestamping, you seems to say that the request to timestamp the 
packet is contained in the skbuff and is done on a per packet basis ?

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann April 22, 2013, 10:25 a.m. UTC | #9
On 04/22/2013 10:19 AM, Paul Chavent wrote:
> On 04/21/2013 06:42 PM, Willem de Bruijn wrote:
>> Tx has the advantage that time sources can be chosen
>> per socket independent of all other sockets
>
> Sorry if my question is trivial.
> I understand that when we require rx timestamping, we need to ask to the device to timestamp all incoming packets since we don't know the path of the packet in advance.
> For tx timestamping, you seems to say that the request to timestamp the packet is contained in the skbuff and is done on a per packet basis ?

It's all in: Documentation/networking/timestamping.txt
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 22, 2013, 2:23 p.m. UTC | #10
On Mon, Apr 22, 2013 at 4:19 AM, Paul Chavent <Paul.Chavent@onera.fr> wrote:
>
>
> On 04/21/2013 06:42 PM, Willem de Bruijn wrote:
>>
>> Tx has the advantage that time sources can be chosen
>> per socket independent of all other sockets
>
>
> Sorry if my question is trivial.
> I understand that when we require rx timestamping, we need to ask to the
> device to timestamp all incoming packets since we don't know the path of the
> packet in advance.
> For tx timestamping, you seems to say that the request to timestamp the
> packet is contained in the skbuff and is done on a per packet basis ?

Yes. More precisely, this is set using a socket option and applies to
all packets on that socket sent since the last update of the socket
option.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 898cf5c..af9185d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3327,12 +3327,8 @@  void skb_tstamp_tx(struct sk_buff *orig_skb,
 	if (!sk)
 		return;
 
-	skb = skb_clone(orig_skb, GFP_ATOMIC);
-	if (!skb)
-		return;
-
 	if (hwtstamps) {
-		*skb_hwtstamps(skb) =
+		*skb_hwtstamps(orig_skb) =
 			*hwtstamps;
 	} else {
 		/*
@@ -3340,9 +3336,13 @@  void skb_tstamp_tx(struct sk_buff *orig_skb,
 		 * so keep the shared tx_flags and only
 		 * store software time stamp
 		 */
-		skb->tstamp = ktime_get_real();
+		orig_skb->tstamp = ktime_get_real();
 	}
 
+	skb = skb_clone(orig_skb, GFP_ATOMIC);
+	if (!skb)
+		return;
+
 	serr = SKB_EXT_ERR(skb);
 	memset(serr, 0, sizeof(*serr));
 	serr->ee.ee_errno = ENOMSG;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 7e387ff..ec8ea27 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -339,6 +339,37 @@  static int __packet_get_status(struct packet_sock *po, void *frame)
 	}
 }
 
+static void __packet_set_timestamp(struct packet_sock *po, void *frame,
+				   ktime_t tstamp)
+{
+	union tpacket_uhdr h;
+	struct timespec ts;
+
+	if (!ktime_to_timespec_cond(tstamp, &ts) ||
+	    !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE))
+		return;
+
+	h.raw = frame;
+	switch (po->tp_version) {
+	case TPACKET_V1:
+		h.h1->tp_sec = ts.tv_sec;
+		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+		break;
+	case TPACKET_V2:
+		h.h2->tp_sec = ts.tv_sec;
+		h.h2->tp_nsec = ts.tv_nsec;
+		break;
+	case TPACKET_V3:
+	default:
+		WARN(1, "TPACKET version not supported.\n");
+		BUG();
+	}
+
+	/* one flush is safe, as both fields always lie on the same cacheline */
+	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
+	smp_wmb();
+}
+
 static void *packet_lookup_frame(struct packet_sock *po,
 		struct packet_ring_buffer *rb,
 		unsigned int position,
@@ -1877,6 +1908,7 @@  static void tpacket_destruct_skb(struct sk_buff *skb)
 		ph = skb_shinfo(skb)->destructor_arg;
 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
 		atomic_dec(&po->tx_ring.pending);
+		__packet_set_timestamp(po, ph, skb->tstamp);
 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
 	}
 
@@ -1900,6 +1932,7 @@  static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	skb->dev = dev;
 	skb->priority = po->sk.sk_priority;
 	skb->mark = po->sk.sk_mark;
+	sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
 	skb_shinfo(skb)->destructor_arg = ph.raw;
 
 	switch (po->tp_version) {