diff mbox

net-packet: tx timestamping on tpacket ring

Message ID 1365879412-9541-1-git-send-email-willemb@google.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Willem de Bruijn April 13, 2013, 6:56 p.m. UTC
When transmit timestamping is enabled at the socket level, have
writes to a PACKET_TX_RING record a timestamp for the generated
skbuffs. Tx timestamps are always looped to the application over
the socket error queue.

The patch also loops software timestamps back into the ring.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/core/skbuff.c      |  2 +-
 net/packet/af_packet.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

Comments

Daniel Borkmann April 13, 2013, 10:18 p.m. UTC | #1
On 04/13/2013 08:56 PM, Willem de Bruijn wrote:
> When transmit timestamping is enabled at the socket level, have
> writes to a PACKET_TX_RING record a timestamp for the generated
> skbuffs. Tx timestamps are always looped to the application over
> the socket error queue.

Nitpick: if so, then this should go to net-next (subject line).

> The patch also loops software timestamps back into the ring.
>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>   net/core/skbuff.c      |  2 +-
>   net/packet/af_packet.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 43 insertions(+), 1 deletion(-)
[...]
> +static void __packet_set_timestamp(struct packet_sock *po, void *frame,
> +				   ktime_t tstamp)
> +{
> +	struct tpacket_hdr *h1;
> +	struct tpacket2_hdr *h2;
> +	struct timespec ts;
> +
> +	if (!tstamp.tv64 || !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE))
> +		return;
> +
> +	ts = ktime_to_timespec(tstamp);
> +
> +	switch (po->tp_version) {
> +	case TPACKET_V1:
> +		h1 = frame;
> +		h1->tp_sec = ts.tv_sec;
> +		h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
> +
> +		flush_dcache_page(pgv_to_page(&h1->tp_sec));
> +		flush_dcache_page(pgv_to_page(&h1->tp_usec));

Hmm, not sure, but could we also flush the dcache only once?

> +		break;
> +	case TPACKET_V2:
> +		h2 = frame;
> +		h2->tp_sec = ts.tv_sec;
> +		h2->tp_nsec = ts.tv_nsec;
> +
> +		flush_dcache_page(pgv_to_page(&h2->tp_sec));
> +		flush_dcache_page(pgv_to_page(&h2->tp_nsec));
> +		break;
> +	case TPACKET_V3:
> +	default:
> +		WARN(1, "TPACKET version not supported.\n");
> +		BUG();
> +	}
> +
> +

Nitpick: one space too much.

> +	smp_wmb();
> +}
> +
>   static void *packet_lookup_frame(struct packet_sock *po,
>   		struct packet_ring_buffer *rb,
>   		unsigned int position,
> @@ -1900,6 +1939,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
>   		ph = skb_shinfo(skb)->destructor_arg;
>   		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
>   		atomic_dec(&po->tx_ring.pending);
> +		__packet_set_timestamp(po, ph, skb->tstamp);
>   		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
>   	}
>
> @@ -2119,6 +2159,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>   			}
>   		}
>
> +		sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
> +

Hmm, so in case nobody wants to use timestamping on TX (which might be the majority
of people), we have to go through those 3 additional if statements in sock_tx_timestamp()
each time? Shouldn't we rather make the TX_RING faster? ;-)

>   		skb->destructor = tpacket_destruct_skb;
>   		__packet_set_status(po, ph, TP_STATUS_SENDING);
>   		atomic_inc(&po->tx_ring.pending);
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 13, 2013, 10:47 p.m. UTC | #2
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sun, 14 Apr 2013 00:18:48 +0200

>> +		flush_dcache_page(pgv_to_page(&h1->tp_sec));
>> +		flush_dcache_page(pgv_to_page(&h1->tp_usec));
> 
> Hmm, not sure, but could we also flush the dcache only once?

Indeed, I truly hope that headers never straddle pages.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 14, 2013, midnight UTC | #3
On Sat, Apr 13, 2013 at 6:18 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 04/13/2013 08:56 PM, Willem de Bruijn wrote:
>>
>> When transmit timestamping is enabled at the socket level, have
>> writes to a PACKET_TX_RING record a timestamp for the generated
>> skbuffs. Tx timestamps are always looped to the application over
>> the socket error queue.
>
>
> Nitpick: if so, then this should go to net-next (subject line).

Thanks.
>
>> The patch also loops software timestamps back into the ring.
>>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>> ---
>>   net/core/skbuff.c      |  2 +-
>>   net/packet/af_packet.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 43 insertions(+), 1 deletion(-)
>
> [...]
>
>> +static void __packet_set_timestamp(struct packet_sock *po, void *frame,
>> +                                  ktime_t tstamp)
>> +{
>> +       struct tpacket_hdr *h1;
>> +       struct tpacket2_hdr *h2;
>> +       struct timespec ts;
>> +
>> +       if (!tstamp.tv64 || !sock_flag(&po->sk,
>> SOCK_TIMESTAMPING_SOFTWARE))
>> +               return;
>> +
>> +       ts = ktime_to_timespec(tstamp);
>> +
>> +       switch (po->tp_version) {
>> +       case TPACKET_V1:
>> +               h1 = frame;
>> +               h1->tp_sec = ts.tv_sec;
>> +               h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
>> +
>> +               flush_dcache_page(pgv_to_page(&h1->tp_sec));
>> +               flush_dcache_page(pgv_to_page(&h1->tp_usec));
>
>
> Hmm, not sure, but could we also flush the dcache only once?
>
>
>> +               break;
>> +       case TPACKET_V2:
>> +               h2 = frame;
>> +               h2->tp_sec = ts.tv_sec;
>> +               h2->tp_nsec = ts.tv_nsec;
>> +
>> +               flush_dcache_page(pgv_to_page(&h2->tp_sec));
>> +               flush_dcache_page(pgv_to_page(&h2->tp_nsec));
>> +               break;
>> +       case TPACKET_V3:
>> +       default:
>> +               WARN(1, "TPACKET version not supported.\n");
>> +               BUG();
>> +       }
>> +
>> +
>
>
> Nitpick: one space too much.
>
>
>> +       smp_wmb();
>> +}
>> +
>>   static void *packet_lookup_frame(struct packet_sock *po,
>>                 struct packet_ring_buffer *rb,
>>                 unsigned int position,
>> @@ -1900,6 +1939,7 @@ static void tpacket_destruct_skb(struct sk_buff
>> *skb)
>>                 ph = skb_shinfo(skb)->destructor_arg;
>>                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
>>                 atomic_dec(&po->tx_ring.pending);
>> +               __packet_set_timestamp(po, ph, skb->tstamp);
>>                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
>>         }
>>
>> @@ -2119,6 +2159,8 @@ static int tpacket_snd(struct packet_sock *po,
>> struct msghdr *msg)
>>                         }
>>                 }
>>
>> +               sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
>> +
>
>
> Hmm, so in case nobody wants to use timestamping on TX (which might be the
> majority
> of people), we have to go through those 3 additional if statements in
> sock_tx_timestamp()
> each time? Shouldn't we rather make the TX_RING faster? ;-)

A static key, similar to netstamp_needed, might make it cheaper, but
may be more complexity than warranted for a corner case. Simpler is
testing only the software timestamp, and in tpacket_fill_skb, where
the sk struct is already accessed. I don't feel strongly about merging
given the performance trade-off: just wanted to see if it worked.
Happy to resubmit either revision (or a better idea).

>
>>                 skb->destructor = tpacket_destruct_skb;
>>                 __packet_set_status(po, ph, TP_STATUS_SENDING);
>>                 atomic_inc(&po->tx_ring.pending);
>>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 14, 2013, 12:04 a.m. UTC | #4
On Sat, Apr 13, 2013 at 6:47 PM, David Miller <davem@davemloft.net> wrote:
> From: Daniel Borkmann <dborkman@redhat.com>
> Date: Sun, 14 Apr 2013 00:18:48 +0200
>
>>> +            flush_dcache_page(pgv_to_page(&h1->tp_sec));
>>> +            flush_dcache_page(pgv_to_page(&h1->tp_usec));
>>
>> Hmm, not sure, but could we also flush the dcache only once?
>
> Indeed, I truly hope that headers never straddle pages.

I should have checked the alignment restrictions on frames. Frames
must be a multiple of 16 B as well as larger than the header (obviously),
so this can indeed never happen.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 14, 2013, 12:16 a.m. UTC | #5
On Sat, Apr 13, 2013 at 8:04 PM, Willem de Bruijn <willemb@google.com> wrote:
> On Sat, Apr 13, 2013 at 6:47 PM, David Miller <davem@davemloft.net> wrote:
>> From: Daniel Borkmann <dborkman@redhat.com>
>> Date: Sun, 14 Apr 2013 00:18:48 +0200
>>
>>>> +            flush_dcache_page(pgv_to_page(&h1->tp_sec));
>>>> +            flush_dcache_page(pgv_to_page(&h1->tp_usec));
>>>
>>> Hmm, not sure, but could we also flush the dcache only once?
>>
>> Indeed, I truly hope that headers never straddle pages.
>
> I should have checked the alignment restrictions on frames. Frames
> must be a multiple of 16 B as well as larger than the header (obviously),
> so this can indeed never happen.

Actually, 48 B is a multiple of 16, so should be accepted, and 85
frames on a page leaves half a frame for the next. I'll check whether
this is right. Even if so, it would still not matter for these time
offsets, as they start at 16 or 20 B offset from the start of the
frame.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 14, 2013, 12:49 a.m. UTC | #6
On Sat, Apr 13, 2013 at 8:16 PM, Willem de Bruijn <willemb@google.com> wrote:
> On Sat, Apr 13, 2013 at 8:04 PM, Willem de Bruijn <willemb@google.com> wrote:
>> On Sat, Apr 13, 2013 at 6:47 PM, David Miller <davem@davemloft.net> wrote:
>>> From: Daniel Borkmann <dborkman@redhat.com>
>>> Date: Sun, 14 Apr 2013 00:18:48 +0200
>>>
>>>>> +            flush_dcache_page(pgv_to_page(&h1->tp_sec));
>>>>> +            flush_dcache_page(pgv_to_page(&h1->tp_usec));
>>>>
>>>> Hmm, not sure, but could we also flush the dcache only once?
>>>
>>> Indeed, I truly hope that headers never straddle pages.
>>
>> I should have checked the alignment restrictions on frames. Frames
>> must be a multiple of 16 B as well as larger than the header (obviously),
>> so this can indeed never happen.
>
> Actually, 48 B is a multiple of 16, so should be accepted, and 85
> frames on a page leaves half a frame for the next. I'll check whether
> this is right. Even if so, it would still not matter for these time
> offsets, as they start at 16 or 20 B offset from the start of the
> frame.

48 B is too small (Because less than TPACKET_HDRLEN), but it can
be triggered with 80 B frames. Daniel, thanks for submitting the
selftest: both the timestamp and this alignment question were now very
easy to test by just changing a few lines in your code.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann April 14, 2013, 5:16 a.m. UTC | #7
On 04/14/2013 02:49 AM, Willem de Bruijn wrote:
> On Sat, Apr 13, 2013 at 8:16 PM, Willem de Bruijn <willemb@google.com> wrote:
>> On Sat, Apr 13, 2013 at 8:04 PM, Willem de Bruijn <willemb@google.com> wrote:
>>> On Sat, Apr 13, 2013 at 6:47 PM, David Miller <davem@davemloft.net> wrote:
>>>> From: Daniel Borkmann <dborkman@redhat.com>
>>>> Date: Sun, 14 Apr 2013 00:18:48 +0200
>>>>
>>>>>> +            flush_dcache_page(pgv_to_page(&h1->tp_sec));
>>>>>> +            flush_dcache_page(pgv_to_page(&h1->tp_usec));
>>>>>
>>>>> Hmm, not sure, but could we also flush the dcache only once?
>>>>
>>>> Indeed, I truly hope that headers never straddle pages.
>>>
>>> I should have checked the alignment restrictions on frames. Frames
>>> must be a multiple of 16 B as well as larger than the header (obviously),
>>> so this can indeed never happen.
>>
>> Actually, 48 B is a multiple of 16, so should be accepted, and 85
>> frames on a page leaves half a frame for the next. I'll check whether
>> this is right. Even if so, it would still not matter for these time
>> offsets, as they start at 16 or 20 B offset from the start of the
>> frame.
>
> 48 B is too small (Because less than TPACKET_HDRLEN), but it can
> be triggered with 80 B frames. Daniel, thanks for submitting the
> selftest: both the timestamp and this alignment question were now very
> easy to test by just changing a few lines in your code.

Feel free to further extend it. :-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann April 14, 2013, 10:52 a.m. UTC | #8
On 04/14/2013 02:00 AM, Willem de Bruijn wrote:
> On Sat, Apr 13, 2013 at 6:18 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
>> On 04/13/2013 08:56 PM, Willem de Bruijn wrote:
>>>
>>> When transmit timestamping is enabled at the socket level, have
>>> writes to a PACKET_TX_RING record a timestamp for the generated
>>> skbuffs. Tx timestamps are always looped to the application over
>>> the socket error queue.
>>
>>
>> Nitpick: if so, then this should go to net-next (subject line).
>
> Thanks.
>>
>>> The patch also loops software timestamps back into the ring.

Also v2 should probably contain a ``Reported-by''.

>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>> ---
>>>    net/core/skbuff.c      |  2 +-
>>>    net/packet/af_packet.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>>>    2 files changed, 43 insertions(+), 1 deletion(-)
>>
>> [...]
>>
>>> +static void __packet_set_timestamp(struct packet_sock *po, void *frame,
>>> +                                  ktime_t tstamp)
>>> +{
>>> +       struct tpacket_hdr *h1;
>>> +       struct tpacket2_hdr *h2;
>>> +       struct timespec ts;
>>> +
>>> +       if (!tstamp.tv64 || !sock_flag(&po->sk,
>>> SOCK_TIMESTAMPING_SOFTWARE))
>>> +               return;

While going a bit more through the code, I'm wondering .. if we want to support
TX timestamps, could we also support SW _and_ HW timestamps e.g. similar as in
sock_recv_timestamp()? I'm asking, because we already allow setting the flags
for it via sock_tx_timestamp(). This might be good, if possible.

Paul, since you've reported / requested this, your use case would be to fill
the ring, trigger a sendto() and then loop through all the frames to check the
tx timestamps for your custom protocol mockup, then fill the returned frames
again, etc.?

>>> +       ts = ktime_to_timespec(tstamp);
>>> +
>>> +       switch (po->tp_version) {
>>> +       case TPACKET_V1:
>>> +               h1 = frame;
>>> +               h1->tp_sec = ts.tv_sec;
>>> +               h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
>>> +
>>> +               flush_dcache_page(pgv_to_page(&h1->tp_sec));
>>> +               flush_dcache_page(pgv_to_page(&h1->tp_usec));
>>
>>
>> Hmm, not sure, but could we also flush the dcache only once?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Richard Cochran April 14, 2013, 1:07 p.m. UTC | #9
On Sun, Apr 14, 2013 at 12:52:16PM +0200, Daniel Borkmann wrote:
> 
> While going a bit more through the code, I'm wondering .. if we want to support
> TX timestamps, could we also support SW _and_ HW timestamps e.g. similar as in
> sock_recv_timestamp()? I'm asking, because we already allow setting the flags
> for it via sock_tx_timestamp(). This might be good, if possible.

And while you are at it, you could also fix the receive code.

As it stand now, it is fairly useless, since there is no way for user
space to tell which kind of time stamp has been reported. In fact, the
code will silently intermingle hardware and software time stamps. That
is surely a mean trick to play on the users.

Thanks,
Richard
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Chavent April 15, 2013, 7:31 a.m. UTC | #10
Hi.

On 04/13/2013 08:56 PM, Willem de Bruijn wrote:
> [...]
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3311,7 +3311,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>   		 * so keep the shared tx_flags and only
>   		 * store software time stamp
>   		 */
> -		skb->tstamp = ktime_get_real();
> +		orig_skb->tstamp = skb->tstamp = ktime_get_real();
>   	}

You said that "the orig_skb is usually freed shortly after
skb_tstamp_tx is called".

So i suppose that if you had coded it, that's because this orig_skb is 
the one that is given to the skb destructor. So when you call 
__packet_set_timestamp, in tpacket_destruct_skb, you get this timestamp 
? Am i right ?


Why we couldn't call
*skb_hwtstamps(orig_skb) = *skb_hwtstamps(skb) = *hwtstamps;
in order to get the hardware timestamping too ?

Thank for your help.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Chavent April 15, 2013, 7:37 a.m. UTC | #11
On 04/14/2013 03:07 PM, Richard Cochran wrote:
> On Sun, Apr 14, 2013 at 12:52:16PM +0200, Daniel Borkmann wrote:
>>
>> While going a bit more through the code, I'm wondering .. if we want to support
>> TX timestamps, could we also support SW _and_ HW timestamps e.g. similar as in
>> sock_recv_timestamp()? I'm asking, because we already allow setting the flags
>> for it via sock_tx_timestamp(). This might be good, if possible.
>
> And while you are at it, you could also fix the receive code.
>
> As it stand now, it is fairly useless, since there is no way for user
> space to tell which kind of time stamp has been reported. In fact, the
> code will silently intermingle hardware and software time stamps. That
> is surely a mean trick to play on the users.

Isn't it the one that the user ask with setsockopt(fd, SOL_PACKET, 
PACKET_TIMESTAMP, &timestamping, sizeof(timestamping)) ?

However, i wonder why you added an other sockopt that do the same thing 
as SOL_SOCKET/SO_TIMESTAMPING sockopt ?

Paul.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight April 15, 2013, 9:45 a.m. UTC | #12
> > +	case TPACKET_V1:
> > +		h1 = frame;
> > +		h1->tp_sec = ts.tv_sec;
> > +		h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
> > +
> > +		flush_dcache_page(pgv_to_page(&h1->tp_sec));
> > +		flush_dcache_page(pgv_to_page(&h1->tp_usec));
> 
> Hmm, not sure, but could we also flush the dcache only once?

If it isn't a silly question, why is the dcache being flushed
here at all?

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Chavent April 15, 2013, 3:41 p.m. UTC | #13
On 04/14/2013 12:52 PM, Daniel Borkmann wrote:
> Paul, since you've reported / requested this, your use case would be to
> fill
> the ring, trigger a sendto() and then loop through all the frames to
> check the
> tx timestamps for your custom protocol mockup, then fill the returned
> frames
> again, etc.?

I've run a test that create (and setup) a tx ring buffer with 8 frames, 
then fill the frames payload, then call sendto.
I've checked the timestamp by two means :
  - check the timestamp in the tx ring after the status became 
TP_STATUS_AVAILABLE again,
  - by issuing 8 recvmsg on the ERRQUEUE.
Both timestamp are coherents and increment themselves.

The tests has been done with a UML kernel, and with software timestamps.

 From my point of view, it seems usable.

However, I've found one strange behavior. I must call recvmsg as many 
times as i have submitted a frame, or not at all. If i only pop the 
ERRQUEUE for one or two message for instance, the next call to sentdo 
fails with "No message of desired type" (errno 42).


Paul.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 15, 2013, 4:37 p.m. UTC | #14
On Mon, Apr 15, 2013 at 3:31 AM, Paul Chavent <Paul.Chavent@onera.fr> wrote:
> Hi.
>
>
> On 04/13/2013 08:56 PM, Willem de Bruijn wrote:
>>
>> [...]
>>
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -3311,7 +3311,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
>>                  * so keep the shared tx_flags and only
>>                  * store software time stamp
>>                  */
>> -               skb->tstamp = ktime_get_real();
>> +               orig_skb->tstamp = skb->tstamp = ktime_get_real();
>>         }
>
>
> You said that "the orig_skb is usually freed shortly after
> skb_tstamp_tx is called".
>
> So i suppose that if you had coded it, that's because this orig_skb is the
> one that is given to the skb destructor. So when you call
> __packet_set_timestamp, in tpacket_destruct_skb, you get this timestamp ? Am
> i right ?
>
>
> Why we couldn't call
> *skb_hwtstamps(orig_skb) = *skb_hwtstamps(skb) = *hwtstamps;
> in order to get the hardware timestamping too ?

Probably. That would also answer Daniel's question to the same
functionality. I'm a bit less familiar with the hardware path, so will
read up before just submitting it.

> Thank for your help.
>
> Paul.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Richard Cochran April 15, 2013, 4:56 p.m. UTC | #15
On Mon, Apr 15, 2013 at 09:37:30AM +0200, Paul Chavent wrote:
> On 04/14/2013 03:07 PM, Richard Cochran wrote:
> >As it stand now, it is fairly useless, since there is no way for user
> >space to tell which kind of time stamp has been reported. In fact, the
> >code will silently intermingle hardware and software time stamps. That
> >is surely a mean trick to play on the users.
> 
> Isn't it the one that the user ask with setsockopt(fd, SOL_PACKET,
> PACKET_TIMESTAMP, &timestamping, sizeof(timestamping)) ?

No, not necessarily. Look at the code in net/packet/af_packet.c.

		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);

What happens if RAW is requested, but no HW time stamp is available?

 
> However, i wonder why you added an other sockopt that do the same
> thing as SOL_SOCKET/SO_TIMESTAMPING sockopt ?

Not sure what you mean. I did not add the SOL_PACKET socket option.

Thanks,
Richard
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 15, 2013, 4:59 p.m. UTC | #16
On Mon, Apr 15, 2013 at 3:37 AM, Paul Chavent <Paul.Chavent@onera.fr> wrote:
>
>
> On 04/14/2013 03:07 PM, Richard Cochran wrote:
>>
>> On Sun, Apr 14, 2013 at 12:52:16PM +0200, Daniel Borkmann wrote:
>>>
>>>
>>> While going a bit more through the code, I'm wondering .. if we want to
>>> support
>>> TX timestamps, could we also support SW _and_ HW timestamps e.g. similar
>>> as in
>>> sock_recv_timestamp()? I'm asking, because we already allow setting the
>>> flags
>>> for it via sock_tx_timestamp(). This might be good, if possible.
>>
>>
>> And while you are at it, you could also fix the receive code.
>>
>> As it stand now, it is fairly useless, since there is no way for user
>> space to tell which kind of time stamp has been reported. In fact, the
>> code will silently intermingle hardware and software time stamps. That
>> is surely a mean trick to play on the users.

Interesting issue, but I'll leave that for a separate fix.
>
> Isn't it the one that the user ask with setsockopt(fd, SOL_PACKET,
> PACKET_TIMESTAMP, &timestamping, sizeof(timestamping)) ?

This option toggles whether, if an skbuff has a timestamp, it should
be written to the rx ring metadata. skbuffs can have multiple
timestamps, so it also determines which one is selected.

If I understand correctly, the problem that Richard refers to is that
when one socket requests rx timestamping, all rx skbuffs have to be
timestamped as it is not known early in the datapath to which socket
they will be enqueued. As a result, the timestamps in skbuffs depend
on the preferences of other active sockets. Correct me if I'm wrong.

If so, it may even vary during the lifetime of a packet socket, so a
getsockopt wouldn't help, either. Perhaps nothing short of a new field
in the frame header structure will. In which case it makes more sense
to support sending each type of timestamp independently, just like the
socker error queue mechanism. Adding yet another header format,
though?

> However, i wonder why you added an other sockopt that do the same thing as
> SOL_SOCKET/SO_TIMESTAMPING sockopt ?

Back to the patch. I'll revise with

- a small patch for the errqueue mechanism
  - add hw timestamp pass-through in skb_tstamp_tx, if correct
  - move sock_tx_timestamp where cache is warm
  - update it to use only a single conditional in the common case: no
flags enabled

- a follow-up for the ring
  - single flush_dcache_page, which is safe for current header layouts
on 32bit+ platforms
    (it is also a noop on many of these)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Willem de Bruijn April 15, 2013, 5:08 p.m. UTC | #17
On Mon, Apr 15, 2013 at 5:45 AM, David Laight <David.Laight@aculab.com> wrote:
>> > +   case TPACKET_V1:
>> > +           h1 = frame;
>> > +           h1->tp_sec = ts.tv_sec;
>> > +           h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
>> > +
>> > +           flush_dcache_page(pgv_to_page(&h1->tp_sec));
>> > +           flush_dcache_page(pgv_to_page(&h1->tp_usec));
>>
>> Hmm, not sure, but could we also flush the dcache only once?
>
> If it isn't a silly question, why is the dcache being flushed
> here at all?

I'm not an expert on this, but have a look at
Documentation/cachetlb.txt, specifically the bits on this function and
the discussion of aliasing: on virtually indexed cache architectures,
the same physical address may be cached in multiple cachelines at the
same time. If I understand correctly, updating the kernel logical
address does not necessarily invalidate the user virtual cacheline for
the same physical memory.

>         David
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 15, 2013, 5:31 p.m. UTC | #18
From: "David Laight" <David.Laight@ACULAB.COM>
Date: Mon, 15 Apr 2013 10:45:55 +0100

>> > +	case TPACKET_V1:
>> > +		h1 = frame;
>> > +		h1->tp_sec = ts.tv_sec;
>> > +		h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
>> > +
>> > +		flush_dcache_page(pgv_to_page(&h1->tp_sec));
>> > +		flush_dcache_page(pgv_to_page(&h1->tp_usec));
>> 
>> Hmm, not sure, but could we also flush the dcache only once?
> 
> If it isn't a silly question, why is the dcache being flushed
> here at all?

Anything that gets written into userspace mapped pages via the
kernel linear mapping of the pages must be D-cache flushed into
order to achieve coherency on cpus that have virtually indexed
caches.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Richard Cochran April 17, 2013, 10:22 a.m. UTC | #19
On Mon, Apr 15, 2013 at 12:59:26PM -0400, Willem de Bruijn wrote:
> 
> If I understand correctly, the problem that Richard refers to is that
> when one socket requests rx timestamping, all rx skbuffs have to be
> timestamped as it is not known early in the datapath to which socket
> they will be enqueued. As a result, the timestamps in skbuffs depend
> on the preferences of other active sockets. Correct me if I'm wrong.

No, that is not it.

Hardware time stamping needs to be globally enabled at the driver
level using the SIOCSHWTSTAMP [1] ioctl. The hardware has various
different abilities, usually one of:

- no time stamping
- some subset of PTP packets
- all packets

So depending on the mode, not all packets will receive a time
stamp. In addition, some of the hardware can only time stamp one
packet at a time. Furthermore, time stamps can get dropped in high
traffic situations.

The code in af_packet.c unconditionally delivers some sort of time
stamp, with an inexplicable fall back to gettimeofday. Thus, the user
space has no way of knowing what is being reported in the time stamp.

Thanks,
Richard

1. See Documentation/networking/timestamping.txt
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ba64614..44b9c2a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3311,7 +3311,7 @@  void skb_tstamp_tx(struct sk_buff *orig_skb,
 		 * so keep the shared tx_flags and only
 		 * store software time stamp
 		 */
-		skb->tstamp = ktime_get_real();
+		orig_skb->tstamp = skb->tstamp = ktime_get_real();
 	}
 
 	serr = SKB_EXT_ERR(skb);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8e4644f..dc5d224 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -341,6 +341,45 @@  static int __packet_get_status(struct packet_sock *po, void *frame)
 	}
 }
 
+static void __packet_set_timestamp(struct packet_sock *po, void *frame,
+				   ktime_t tstamp)
+{
+	struct tpacket_hdr *h1;
+	struct tpacket2_hdr *h2;
+	struct timespec ts;
+
+	if (!tstamp.tv64 || !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE))
+		return;
+
+	ts = ktime_to_timespec(tstamp);
+
+	switch (po->tp_version) {
+	case TPACKET_V1:
+		h1 = frame;
+		h1->tp_sec = ts.tv_sec;
+		h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+		flush_dcache_page(pgv_to_page(&h1->tp_sec));
+		flush_dcache_page(pgv_to_page(&h1->tp_usec));
+		break;
+	case TPACKET_V2:
+		h2 = frame;
+		h2->tp_sec = ts.tv_sec;
+		h2->tp_nsec = ts.tv_nsec;
+
+		flush_dcache_page(pgv_to_page(&h2->tp_sec));
+		flush_dcache_page(pgv_to_page(&h2->tp_nsec));
+		break;
+	case TPACKET_V3:
+	default:
+		WARN(1, "TPACKET version not supported.\n");
+		BUG();
+	}
+
+
+	smp_wmb();
+}
+
 static void *packet_lookup_frame(struct packet_sock *po,
 		struct packet_ring_buffer *rb,
 		unsigned int position,
@@ -1900,6 +1939,7 @@  static void tpacket_destruct_skb(struct sk_buff *skb)
 		ph = skb_shinfo(skb)->destructor_arg;
 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
 		atomic_dec(&po->tx_ring.pending);
+		__packet_set_timestamp(po, ph, skb->tstamp);
 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
 	}
 
@@ -2119,6 +2159,8 @@  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			}
 		}
 
+		sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
+
 		skb->destructor = tpacket_destruct_skb;
 		__packet_set_status(po, ph, TP_STATUS_SENDING);
 		atomic_inc(&po->tx_ring.pending);