Message ID | 1365879412-9541-1-git-send-email-willemb@google.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On 04/13/2013 08:56 PM, Willem de Bruijn wrote: > When transmit timestamping is enabled at the socket level, have > writes to a PACKET_TX_RING record a timestamp for the generated > skbuffs. Tx timestamps are always looped to the application over > the socket error queue. Nitpick: if so, then this should go to net-next (subject line). > The patch also loops software timestamps back into the ring. > > Signed-off-by: Willem de Bruijn <willemb@google.com> > --- > net/core/skbuff.c | 2 +- > net/packet/af_packet.c | 42 ++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 43 insertions(+), 1 deletion(-) [...] > +static void __packet_set_timestamp(struct packet_sock *po, void *frame, > + ktime_t tstamp) > +{ > + struct tpacket_hdr *h1; > + struct tpacket2_hdr *h2; > + struct timespec ts; > + > + if (!tstamp.tv64 || !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE)) > + return; > + > + ts = ktime_to_timespec(tstamp); > + > + switch (po->tp_version) { > + case TPACKET_V1: > + h1 = frame; > + h1->tp_sec = ts.tv_sec; > + h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; > + > + flush_dcache_page(pgv_to_page(&h1->tp_sec)); > + flush_dcache_page(pgv_to_page(&h1->tp_usec)); Hmm, not sure, but could we also flush the dcache only once? > + break; > + case TPACKET_V2: > + h2 = frame; > + h2->tp_sec = ts.tv_sec; > + h2->tp_nsec = ts.tv_nsec; > + > + flush_dcache_page(pgv_to_page(&h2->tp_sec)); > + flush_dcache_page(pgv_to_page(&h2->tp_nsec)); > + break; > + case TPACKET_V3: > + default: > + WARN(1, "TPACKET version not supported.\n"); > + BUG(); > + } > + > + Nitpick: one space too much. > + smp_wmb(); > +} > + > static void *packet_lookup_frame(struct packet_sock *po, > struct packet_ring_buffer *rb, > unsigned int position, > @@ -1900,6 +1939,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) > ph = skb_shinfo(skb)->destructor_arg; > BUG_ON(atomic_read(&po->tx_ring.pending) == 0); > atomic_dec(&po->tx_ring.pending); > + __packet_set_timestamp(po, ph, skb->tstamp); > __packet_set_status(po, ph, TP_STATUS_AVAILABLE); > } > > @@ -2119,6 +2159,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) > } > } > > + sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); > + Hmm, so in case nobody wants to use timestamping on TX (which might be the majority of people), we have to go through those 3 additional if statements in sock_tx_timestamp() each time? Shouldn't we rather make the TX_RING faster? ;-) > skb->destructor = tpacket_destruct_skb; > __packet_set_status(po, ph, TP_STATUS_SENDING); > atomic_inc(&po->tx_ring.pending); > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Daniel Borkmann <dborkman@redhat.com> Date: Sun, 14 Apr 2013 00:18:48 +0200 >> + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >> + flush_dcache_page(pgv_to_page(&h1->tp_usec)); > > Hmm, not sure, but could we also flush the dcache only once? Indeed, I truly hope that headers never straddle pages. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Apr 13, 2013 at 6:18 PM, Daniel Borkmann <dborkman@redhat.com> wrote: > On 04/13/2013 08:56 PM, Willem de Bruijn wrote: >> >> When transmit timestamping is enabled at the socket level, have >> writes to a PACKET_TX_RING record a timestamp for the generated >> skbuffs. Tx timestamps are always looped to the application over >> the socket error queue. > > > Nitpick: if so, then this should go to net-next (subject line). Thanks. > >> The patch also loops software timestamps back into the ring. >> >> Signed-off-by: Willem de Bruijn <willemb@google.com> >> --- >> net/core/skbuff.c | 2 +- >> net/packet/af_packet.c | 42 ++++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 43 insertions(+), 1 deletion(-) > > [...] > >> +static void __packet_set_timestamp(struct packet_sock *po, void *frame, >> + ktime_t tstamp) >> +{ >> + struct tpacket_hdr *h1; >> + struct tpacket2_hdr *h2; >> + struct timespec ts; >> + >> + if (!tstamp.tv64 || !sock_flag(&po->sk, >> SOCK_TIMESTAMPING_SOFTWARE)) >> + return; >> + >> + ts = ktime_to_timespec(tstamp); >> + >> + switch (po->tp_version) { >> + case TPACKET_V1: >> + h1 = frame; >> + h1->tp_sec = ts.tv_sec; >> + h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; >> + >> + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >> + flush_dcache_page(pgv_to_page(&h1->tp_usec)); > > > Hmm, not sure, but could we also flush the dcache only once? > > >> + break; >> + case TPACKET_V2: >> + h2 = frame; >> + h2->tp_sec = ts.tv_sec; >> + h2->tp_nsec = ts.tv_nsec; >> + >> + flush_dcache_page(pgv_to_page(&h2->tp_sec)); >> + flush_dcache_page(pgv_to_page(&h2->tp_nsec)); >> + break; >> + case TPACKET_V3: >> + default: >> + WARN(1, "TPACKET version not supported.\n"); >> + BUG(); >> + } >> + >> + > > > Nitpick: one space too much. > > >> + smp_wmb(); >> +} >> + >> static void *packet_lookup_frame(struct packet_sock *po, >> struct packet_ring_buffer *rb, >> unsigned int position, >> @@ -1900,6 +1939,7 @@ static void tpacket_destruct_skb(struct sk_buff >> *skb) >> ph = skb_shinfo(skb)->destructor_arg; >> BUG_ON(atomic_read(&po->tx_ring.pending) == 0); >> atomic_dec(&po->tx_ring.pending); >> + __packet_set_timestamp(po, ph, skb->tstamp); >> __packet_set_status(po, ph, TP_STATUS_AVAILABLE); >> } >> >> @@ -2119,6 +2159,8 @@ static int tpacket_snd(struct packet_sock *po, >> struct msghdr *msg) >> } >> } >> >> + sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); >> + > > > Hmm, so in case nobody wants to use timestamping on TX (which might be the > majority > of people), we have to go through those 3 additional if statements in > sock_tx_timestamp() > each time? Shouldn't we rather make the TX_RING faster? ;-) A static key, similar to netstamp_needed, might make it cheaper, but may be more complexity than warranted for a corner case. Simpler is testing only the software timestamp, and in tpacket_fill_skb, where the sk struct is already accessed. I don't feel strongly about merging given the performance trade-off: just wanted to see if it worked. Happy to resubmit either revision (or a better idea). > >> skb->destructor = tpacket_destruct_skb; >> __packet_set_status(po, ph, TP_STATUS_SENDING); >> atomic_inc(&po->tx_ring.pending); >> > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Apr 13, 2013 at 6:47 PM, David Miller <davem@davemloft.net> wrote: > From: Daniel Borkmann <dborkman@redhat.com> > Date: Sun, 14 Apr 2013 00:18:48 +0200 > >>> + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >>> + flush_dcache_page(pgv_to_page(&h1->tp_usec)); >> >> Hmm, not sure, but could we also flush the dcache only once? > > Indeed, I truly hope that headers never straddle pages. I should have checked the alignment restrictions on frames. Frames must be a multiple of 16 B as well as larger than the header (obviously), so this can indeed never happen. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Apr 13, 2013 at 8:04 PM, Willem de Bruijn <willemb@google.com> wrote: > On Sat, Apr 13, 2013 at 6:47 PM, David Miller <davem@davemloft.net> wrote: >> From: Daniel Borkmann <dborkman@redhat.com> >> Date: Sun, 14 Apr 2013 00:18:48 +0200 >> >>>> + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >>>> + flush_dcache_page(pgv_to_page(&h1->tp_usec)); >>> >>> Hmm, not sure, but could we also flush the dcache only once? >> >> Indeed, I truly hope that headers never straddle pages. > > I should have checked the alignment restrictions on frames. Frames > must be a multiple of 16 B as well as larger than the header (obviously), > so this can indeed never happen. Actually, 48 B is a multiple of 16, so should be accepted, and 85 frames on a page leaves half a frame for the next. I'll check whether this is right. Even if so, it would still not matter for these time offsets, as they start at 16 or 20 B offset from the start of the frame. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Apr 13, 2013 at 8:16 PM, Willem de Bruijn <willemb@google.com> wrote: > On Sat, Apr 13, 2013 at 8:04 PM, Willem de Bruijn <willemb@google.com> wrote: >> On Sat, Apr 13, 2013 at 6:47 PM, David Miller <davem@davemloft.net> wrote: >>> From: Daniel Borkmann <dborkman@redhat.com> >>> Date: Sun, 14 Apr 2013 00:18:48 +0200 >>> >>>>> + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >>>>> + flush_dcache_page(pgv_to_page(&h1->tp_usec)); >>>> >>>> Hmm, not sure, but could we also flush the dcache only once? >>> >>> Indeed, I truly hope that headers never straddle pages. >> >> I should have checked the alignment restrictions on frames. Frames >> must be a multiple of 16 B as well as larger than the header (obviously), >> so this can indeed never happen. > > Actually, 48 B is a multiple of 16, so should be accepted, and 85 > frames on a page leaves half a frame for the next. I'll check whether > this is right. Even if so, it would still not matter for these time > offsets, as they start at 16 or 20 B offset from the start of the > frame. 48 B is too small (Because less than TPACKET_HDRLEN), but it can be triggered with 80 B frames. Daniel, thanks for submitting the selftest: both the timestamp and this alignment question were now very easy to test by just changing a few lines in your code. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 04/14/2013 02:49 AM, Willem de Bruijn wrote: > On Sat, Apr 13, 2013 at 8:16 PM, Willem de Bruijn <willemb@google.com> wrote: >> On Sat, Apr 13, 2013 at 8:04 PM, Willem de Bruijn <willemb@google.com> wrote: >>> On Sat, Apr 13, 2013 at 6:47 PM, David Miller <davem@davemloft.net> wrote: >>>> From: Daniel Borkmann <dborkman@redhat.com> >>>> Date: Sun, 14 Apr 2013 00:18:48 +0200 >>>> >>>>>> + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >>>>>> + flush_dcache_page(pgv_to_page(&h1->tp_usec)); >>>>> >>>>> Hmm, not sure, but could we also flush the dcache only once? >>>> >>>> Indeed, I truly hope that headers never straddle pages. >>> >>> I should have checked the alignment restrictions on frames. Frames >>> must be a multiple of 16 B as well as larger than the header (obviously), >>> so this can indeed never happen. >> >> Actually, 48 B is a multiple of 16, so should be accepted, and 85 >> frames on a page leaves half a frame for the next. I'll check whether >> this is right. Even if so, it would still not matter for these time >> offsets, as they start at 16 or 20 B offset from the start of the >> frame. > > 48 B is too small (Because less than TPACKET_HDRLEN), but it can > be triggered with 80 B frames. Daniel, thanks for submitting the > selftest: both the timestamp and this alignment question were now very > easy to test by just changing a few lines in your code. Feel free to further extend it. :-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 04/14/2013 02:00 AM, Willem de Bruijn wrote: > On Sat, Apr 13, 2013 at 6:18 PM, Daniel Borkmann <dborkman@redhat.com> wrote: >> On 04/13/2013 08:56 PM, Willem de Bruijn wrote: >>> >>> When transmit timestamping is enabled at the socket level, have >>> writes to a PACKET_TX_RING record a timestamp for the generated >>> skbuffs. Tx timestamps are always looped to the application over >>> the socket error queue. >> >> >> Nitpick: if so, then this should go to net-next (subject line). > > Thanks. >> >>> The patch also loops software timestamps back into the ring. Also v2 should probably contain a ``Reported-by''. >>> Signed-off-by: Willem de Bruijn <willemb@google.com> >>> --- >>> net/core/skbuff.c | 2 +- >>> net/packet/af_packet.c | 42 ++++++++++++++++++++++++++++++++++++++++++ >>> 2 files changed, 43 insertions(+), 1 deletion(-) >> >> [...] >> >>> +static void __packet_set_timestamp(struct packet_sock *po, void *frame, >>> + ktime_t tstamp) >>> +{ >>> + struct tpacket_hdr *h1; >>> + struct tpacket2_hdr *h2; >>> + struct timespec ts; >>> + >>> + if (!tstamp.tv64 || !sock_flag(&po->sk, >>> SOCK_TIMESTAMPING_SOFTWARE)) >>> + return; While going a bit more through the code, I'm wondering .. if we want to support TX timestamps, could we also support SW _and_ HW timestamps e.g. similar as in sock_recv_timestamp()? I'm asking, because we already allow setting the flags for it via sock_tx_timestamp(). This might be good, if possible. Paul, since you've reported / requested this, your use case would be to fill the ring, trigger a sendto() and then loop through all the frames to check the tx timestamps for your custom protocol mockup, then fill the returned frames again, etc.? >>> + ts = ktime_to_timespec(tstamp); >>> + >>> + switch (po->tp_version) { >>> + case TPACKET_V1: >>> + h1 = frame; >>> + h1->tp_sec = ts.tv_sec; >>> + h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; >>> + >>> + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >>> + flush_dcache_page(pgv_to_page(&h1->tp_usec)); >> >> >> Hmm, not sure, but could we also flush the dcache only once? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Apr 14, 2013 at 12:52:16PM +0200, Daniel Borkmann wrote: > > While going a bit more through the code, I'm wondering .. if we want to support > TX timestamps, could we also support SW _and_ HW timestamps e.g. similar as in > sock_recv_timestamp()? I'm asking, because we already allow setting the flags > for it via sock_tx_timestamp(). This might be good, if possible. And while you are at it, you could also fix the receive code. As it stand now, it is fairly useless, since there is no way for user space to tell which kind of time stamp has been reported. In fact, the code will silently intermingle hardware and software time stamps. That is surely a mean trick to play on the users. Thanks, Richard -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi. On 04/13/2013 08:56 PM, Willem de Bruijn wrote: > [...] > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -3311,7 +3311,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, > * so keep the shared tx_flags and only > * store software time stamp > */ > - skb->tstamp = ktime_get_real(); > + orig_skb->tstamp = skb->tstamp = ktime_get_real(); > } You said that "the orig_skb is usually freed shortly after skb_tstamp_tx is called". So i suppose that if you had coded it, that's because this orig_skb is the one that is given to the skb destructor. So when you call __packet_set_timestamp, in tpacket_destruct_skb, you get this timestamp ? Am i right ? Why we couldn't call *skb_hwtstamps(orig_skb) = *skb_hwtstamps(skb) = *hwtstamps; in order to get the hardware timestamping too ? Thank for your help. Paul. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 04/14/2013 03:07 PM, Richard Cochran wrote: > On Sun, Apr 14, 2013 at 12:52:16PM +0200, Daniel Borkmann wrote: >> >> While going a bit more through the code, I'm wondering .. if we want to support >> TX timestamps, could we also support SW _and_ HW timestamps e.g. similar as in >> sock_recv_timestamp()? I'm asking, because we already allow setting the flags >> for it via sock_tx_timestamp(). This might be good, if possible. > > And while you are at it, you could also fix the receive code. > > As it stand now, it is fairly useless, since there is no way for user > space to tell which kind of time stamp has been reported. In fact, the > code will silently intermingle hardware and software time stamps. That > is surely a mean trick to play on the users. Isn't it the one that the user ask with setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, ×tamping, sizeof(timestamping)) ? However, i wonder why you added an other sockopt that do the same thing as SOL_SOCKET/SO_TIMESTAMPING sockopt ? Paul. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > + case TPACKET_V1: > > + h1 = frame; > > + h1->tp_sec = ts.tv_sec; > > + h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; > > + > > + flush_dcache_page(pgv_to_page(&h1->tp_sec)); > > + flush_dcache_page(pgv_to_page(&h1->tp_usec)); > > Hmm, not sure, but could we also flush the dcache only once? If it isn't a silly question, why is the dcache being flushed here at all? David -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 04/14/2013 12:52 PM, Daniel Borkmann wrote: > Paul, since you've reported / requested this, your use case would be to > fill > the ring, trigger a sendto() and then loop through all the frames to > check the > tx timestamps for your custom protocol mockup, then fill the returned > frames > again, etc.? I've run a test that create (and setup) a tx ring buffer with 8 frames, then fill the frames payload, then call sendto. I've checked the timestamp by two means : - check the timestamp in the tx ring after the status became TP_STATUS_AVAILABLE again, - by issuing 8 recvmsg on the ERRQUEUE. Both timestamp are coherents and increment themselves. The tests has been done with a UML kernel, and with software timestamps. From my point of view, it seems usable. However, I've found one strange behavior. I must call recvmsg as many times as i have submitted a frame, or not at all. If i only pop the ERRQUEUE for one or two message for instance, the next call to sentdo fails with "No message of desired type" (errno 42). Paul. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 15, 2013 at 3:31 AM, Paul Chavent <Paul.Chavent@onera.fr> wrote: > Hi. > > > On 04/13/2013 08:56 PM, Willem de Bruijn wrote: >> >> [...] >> >> --- a/net/core/skbuff.c >> +++ b/net/core/skbuff.c >> @@ -3311,7 +3311,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, >> * so keep the shared tx_flags and only >> * store software time stamp >> */ >> - skb->tstamp = ktime_get_real(); >> + orig_skb->tstamp = skb->tstamp = ktime_get_real(); >> } > > > You said that "the orig_skb is usually freed shortly after > skb_tstamp_tx is called". > > So i suppose that if you had coded it, that's because this orig_skb is the > one that is given to the skb destructor. So when you call > __packet_set_timestamp, in tpacket_destruct_skb, you get this timestamp ? Am > i right ? > > > Why we couldn't call > *skb_hwtstamps(orig_skb) = *skb_hwtstamps(skb) = *hwtstamps; > in order to get the hardware timestamping too ? Probably. That would also answer Daniel's question to the same functionality. I'm a bit less familiar with the hardware path, so will read up before just submitting it. > Thank for your help. > > Paul. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 15, 2013 at 09:37:30AM +0200, Paul Chavent wrote: > On 04/14/2013 03:07 PM, Richard Cochran wrote: > >As it stand now, it is fairly useless, since there is no way for user > >space to tell which kind of time stamp has been reported. In fact, the > >code will silently intermingle hardware and software time stamps. That > >is surely a mean trick to play on the users. > > Isn't it the one that the user ask with setsockopt(fd, SOL_PACKET, > PACKET_TIMESTAMP, ×tamping, sizeof(timestamping)) ? No, not necessarily. Look at the code in net/packet/af_packet.c. if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) && shhwtstamps->syststamp.tv64) ts = ktime_to_timespec(shhwtstamps->syststamp); else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) && shhwtstamps->hwtstamp.tv64) ts = ktime_to_timespec(shhwtstamps->hwtstamp); else if (skb->tstamp.tv64) ts = ktime_to_timespec(skb->tstamp); else getnstimeofday(&ts); What happens if RAW is requested, but no HW time stamp is available? > However, i wonder why you added an other sockopt that do the same > thing as SOL_SOCKET/SO_TIMESTAMPING sockopt ? Not sure what you mean. I did not add the SOL_PACKET socket option. Thanks, Richard -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 15, 2013 at 3:37 AM, Paul Chavent <Paul.Chavent@onera.fr> wrote: > > > On 04/14/2013 03:07 PM, Richard Cochran wrote: >> >> On Sun, Apr 14, 2013 at 12:52:16PM +0200, Daniel Borkmann wrote: >>> >>> >>> While going a bit more through the code, I'm wondering .. if we want to >>> support >>> TX timestamps, could we also support SW _and_ HW timestamps e.g. similar >>> as in >>> sock_recv_timestamp()? I'm asking, because we already allow setting the >>> flags >>> for it via sock_tx_timestamp(). This might be good, if possible. >> >> >> And while you are at it, you could also fix the receive code. >> >> As it stand now, it is fairly useless, since there is no way for user >> space to tell which kind of time stamp has been reported. In fact, the >> code will silently intermingle hardware and software time stamps. That >> is surely a mean trick to play on the users. Interesting issue, but I'll leave that for a separate fix. > > Isn't it the one that the user ask with setsockopt(fd, SOL_PACKET, > PACKET_TIMESTAMP, ×tamping, sizeof(timestamping)) ? This option toggles whether, if an skbuff has a timestamp, it should be written to the rx ring metadata. skbuffs can have multiple timestamps, so it also determines which one is selected. If I understand correctly, the problem that Richard refers to is that when one socket requests rx timestamping, all rx skbuffs have to be timestamped as it is not known early in the datapath to which socket they will be enqueued. As a result, the timestamps in skbuffs depend on the preferences of other active sockets. Correct me if I'm wrong. If so, it may even vary during the lifetime of a packet socket, so a getsockopt wouldn't help, either. Perhaps nothing short of a new field in the frame header structure will. In which case it makes more sense to support sending each type of timestamp independently, just like the socker error queue mechanism. Adding yet another header format, though? > However, i wonder why you added an other sockopt that do the same thing as > SOL_SOCKET/SO_TIMESTAMPING sockopt ? Back to the patch. I'll revise with - a small patch for the errqueue mechanism - add hw timestamp pass-through in skb_tstamp_tx, if correct - move sock_tx_timestamp where cache is warm - update it to use only a single conditional in the common case: no flags enabled - a follow-up for the ring - single flush_dcache_page, which is safe for current header layouts on 32bit+ platforms (it is also a noop on many of these) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 15, 2013 at 5:45 AM, David Laight <David.Laight@aculab.com> wrote: >> > + case TPACKET_V1: >> > + h1 = frame; >> > + h1->tp_sec = ts.tv_sec; >> > + h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; >> > + >> > + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >> > + flush_dcache_page(pgv_to_page(&h1->tp_usec)); >> >> Hmm, not sure, but could we also flush the dcache only once? > > If it isn't a silly question, why is the dcache being flushed > here at all? I'm not an expert on this, but have a look at Documentation/cachetlb.txt, specifically the bits on this function and the discussion of aliasing: on virtually indexed cache architectures, the same physical address may be cached in multiple cachelines at the same time. If I understand correctly, updating the kernel logical address does not necessarily invalidate the user virtual cacheline for the same physical memory. > David > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: "David Laight" <David.Laight@ACULAB.COM> Date: Mon, 15 Apr 2013 10:45:55 +0100 >> > + case TPACKET_V1: >> > + h1 = frame; >> > + h1->tp_sec = ts.tv_sec; >> > + h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; >> > + >> > + flush_dcache_page(pgv_to_page(&h1->tp_sec)); >> > + flush_dcache_page(pgv_to_page(&h1->tp_usec)); >> >> Hmm, not sure, but could we also flush the dcache only once? > > If it isn't a silly question, why is the dcache being flushed > here at all? Anything that gets written into userspace mapped pages via the kernel linear mapping of the pages must be D-cache flushed into order to achieve coherency on cpus that have virtually indexed caches. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 15, 2013 at 12:59:26PM -0400, Willem de Bruijn wrote: > > If I understand correctly, the problem that Richard refers to is that > when one socket requests rx timestamping, all rx skbuffs have to be > timestamped as it is not known early in the datapath to which socket > they will be enqueued. As a result, the timestamps in skbuffs depend > on the preferences of other active sockets. Correct me if I'm wrong. No, that is not it. Hardware time stamping needs to be globally enabled at the driver level using the SIOCSHWTSTAMP [1] ioctl. The hardware has various different abilities, usually one of: - no time stamping - some subset of PTP packets - all packets So depending on the mode, not all packets will receive a time stamp. In addition, some of the hardware can only time stamp one packet at a time. Furthermore, time stamps can get dropped in high traffic situations. The code in af_packet.c unconditionally delivers some sort of time stamp, with an inexplicable fall back to gettimeofday. Thus, the user space has no way of knowing what is being reported in the time stamp. Thanks, Richard 1. See Documentation/networking/timestamping.txt -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ba64614..44b9c2a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3311,7 +3311,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, * so keep the shared tx_flags and only * store software time stamp */ - skb->tstamp = ktime_get_real(); + orig_skb->tstamp = skb->tstamp = ktime_get_real(); } serr = SKB_EXT_ERR(skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8e4644f..dc5d224 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -341,6 +341,45 @@ static int __packet_get_status(struct packet_sock *po, void *frame) } } +static void __packet_set_timestamp(struct packet_sock *po, void *frame, + ktime_t tstamp) +{ + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + struct timespec ts; + + if (!tstamp.tv64 || !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE)) + return; + + ts = ktime_to_timespec(tstamp); + + switch (po->tp_version) { + case TPACKET_V1: + h1 = frame; + h1->tp_sec = ts.tv_sec; + h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; + + flush_dcache_page(pgv_to_page(&h1->tp_sec)); + flush_dcache_page(pgv_to_page(&h1->tp_usec)); + break; + case TPACKET_V2: + h2 = frame; + h2->tp_sec = ts.tv_sec; + h2->tp_nsec = ts.tv_nsec; + + flush_dcache_page(pgv_to_page(&h2->tp_sec)); + flush_dcache_page(pgv_to_page(&h2->tp_nsec)); + break; + case TPACKET_V3: + default: + WARN(1, "TPACKET version not supported.\n"); + BUG(); + } + + + smp_wmb(); +} + static void *packet_lookup_frame(struct packet_sock *po, struct packet_ring_buffer *rb, unsigned int position, @@ -1900,6 +1939,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) ph = skb_shinfo(skb)->destructor_arg; BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); + __packet_set_timestamp(po, ph, skb->tstamp); __packet_set_status(po, ph, TP_STATUS_AVAILABLE); } @@ -2119,6 +2159,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } + sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); + skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); atomic_inc(&po->tx_ring.pending);
When transmit timestamping is enabled at the socket level, have writes to a PACKET_TX_RING record a timestamp for the generated skbuffs. Tx timestamps are always looped to the application over the socket error queue. The patch also loops software timestamps back into the ring. Signed-off-by: Willem de Bruijn <willemb@google.com> --- net/core/skbuff.c | 2 +- net/packet/af_packet.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-)