diff mbox

[v1,net-next,3/6] net: add new control message for incoming HW-timestamped packets

Message ID 20170426145035.25846-4-mlichvar@redhat.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Miroslav Lichvar April 26, 2017, 2:50 p.m. UTC
Add SOF_TIMESTAMPING_OPT_PKTINFO option to request a new control message
for incoming packets with hardware timestamps. It contains the index of
the real interface which received the packet and the length of the
packet at layer 2.

The index is useful with bonding, bridges and other interfaces, where
IP_PKTINFO doesn't allow applications to determine which PHC made the
timestamp. With the L2 length (and link speed) it is possible to
transpose preamble timestamps to trailer timestamps, which are used in
the NTP protocol.

While this information could be provided by two new socket options
independently from timestamping, it doesn't look like it would be very
useful. With this option any performance impact is limited to hardware
timestamping.

As skb currently has no field for the original interface index, use
napi_id to look up the device and its index. This limits the option to
kernels with enabled CONFIG_NET_RX_BUSY_POLL and drivers using napi,
which should cover all current MAC drivers that support hardware
timestamping.

CC: Richard Cochran <richardcochran@gmail.com>
CC: Willem de Bruijn <willemb@google.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 Documentation/networking/timestamping.txt |  8 ++++++++
 include/linux/netdevice.h                 |  1 +
 include/uapi/asm-generic/socket.h         |  2 ++
 include/uapi/linux/net_tstamp.h           |  9 ++++++++-
 net/core/dev.c                            | 18 ++++++++++++++++++
 net/socket.c                              | 26 +++++++++++++++++++++++++-
 6 files changed, 62 insertions(+), 2 deletions(-)

Comments

Willem de Bruijn April 26, 2017, 11:34 p.m. UTC | #1
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1b3317c..0a78f7f 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -160,6 +160,7 @@ static int netif_rx_internal(struct sk_buff *skb);
>  static int call_netdevice_notifiers_info(unsigned long val,
>                                          struct net_device *dev,
>                                          struct netdev_notifier_info *info);
> +static struct napi_struct *napi_by_id(unsigned int napi_id);
>
>  /*
>   * The @dev_base_head list is protected by @dev_base_lock and the rtnl
> @@ -863,6 +864,23 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
>  }
>  EXPORT_SYMBOL(dev_get_by_index);
>
> +struct net_device *dev_get_by_napi_id(unsigned int napi_id)
> +{
> +       struct net_device *dev = NULL;
> +       struct napi_struct *napi;
> +
> +       rcu_read_lock();
> +
> +       napi = napi_by_id(napi_id);
> +       if (napi)
> +               dev = napi->dev;
> +
> +       rcu_read_unlock();
> +
> +       return dev;
> +}
> +EXPORT_SYMBOL(dev_get_by_napi_id);

Returning dev without holding a reference is not safe. You'll probably
have to call this with rcu_read_lock held instead.

Also, this generic napi function should probably be in a separate patch.

> diff --git a/net/socket.c b/net/socket.c
> index c2564eb..5ea5f29 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -662,6 +662,26 @@ static bool skb_is_err_queue(const struct sk_buff *skb)
>         return skb->pkt_type == PACKET_OUTGOING;
>  }
>
> +static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
> +{
> +#ifdef CONFIG_NET_RX_BUSY_POLL

Let's limit the ifdef scope to napi code. Perhaps we need an skb_get_napi_id
helper that returns 0 if the feature is not compiled in.

> +       struct scm_ts_pktinfo ts_pktinfo;
> +       struct net_device *orig_dev;
> +
> +       if (skb->napi_id < MIN_NAPI_ID || !skb_mac_header_was_set(skb))
> +               return;

This MIN_NAPI_ID check can be moved into dev_get_by_napi_id.

>  /*
>   * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
>   */
> @@ -699,8 +719,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
>                 empty = 0;
>         if (shhwtstamps &&
>             (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&

This information is also informative with software timestamps.

And getting the real iif is definitely useful outside timestamps. An
alternative approach is to add versioning to IP_PKTINFO with a new
setsockopt IP_PKTINFO_VERSION plus a new struct in_pktinfo_v2
that extends in_pktinfo. Just a thought.
kernel test robot April 27, 2017, 4:09 a.m. UTC | #2
Hi Miroslav,

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Miroslav-Lichvar/Extend-socket-timestamping-API/20170427-093243
config: xtensa-allmodconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 4.9.0
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   net/socket.c: In function 'put_ts_pktinfo':
>> net/socket.c:680:28: error: 'SCM_TIMESTAMPING_PKTINFO' undeclared (first use in this function)
     put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
                               ^
   net/socket.c:680:28: note: each undeclared identifier is reported only once for each function it appears in

vim +/SCM_TIMESTAMPING_PKTINFO +680 net/socket.c

   674		orig_dev = dev_get_by_napi_id(skb->napi_id);
   675		if (!orig_dev)
   676			return;
   677	
   678		ts_pktinfo.if_index = orig_dev->ifindex;
   679		ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb);
 > 680		put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
   681			 sizeof(ts_pktinfo), &ts_pktinfo);
   682	#endif
   683	}

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Miroslav Lichvar April 27, 2017, 10:15 a.m. UTC | #3
Thanks for the comments.

On Wed, Apr 26, 2017 at 07:34:49PM -0400, Willem de Bruijn wrote:
> > +struct net_device *dev_get_by_napi_id(unsigned int napi_id)
> > +{
> > +       struct net_device *dev = NULL;
> > +       struct napi_struct *napi;
> > +
> > +       rcu_read_lock();
> > +
> > +       napi = napi_by_id(napi_id);
> > +       if (napi)
> > +               dev = napi->dev;
> > +
> > +       rcu_read_unlock();
> > +
> > +       return dev;
> > +}
> > +EXPORT_SYMBOL(dev_get_by_napi_id);
> 
> Returning dev without holding a reference is not safe. You'll probably
> have to call this with rcu_read_lock held instead.

How about changing the function to simply return the index instead of
the device (e.g. dev_get_ifindex_by_napi_id())? Would that be too
specific?

> >  /*
> >   * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
> >   */
> > @@ -699,8 +719,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
> >                 empty = 0;
> >         if (shhwtstamps &&
> >             (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
> 
> This information is also informative with software timestamps.

But is it useful and worth the cost? If I have two interfaces and only
one has HW timestamping, or just one interface which can timestamp
incoming packets at a limited rate, I would prefer to not waste CPU
cycles preparing and processing useless data.

> And getting the real iif is definitely useful outside timestamps.

Do you have an example? We have asked that in the original thread,
but no one suggested anything. For AF_PACKET there is PACKET_ORIGDEV.
When I was searching the Internet on how to get the index with INET
sockets, it looked like I was the only one who had this problem :).

> An
> alternative approach is to add versioning to IP_PKTINFO with a new
> setsockopt IP_PKTINFO_VERSION plus a new struct in_pktinfo_v2
> that extends in_pktinfo. Just a thought.

The struct would contain both the original and last interface index,
and the length as well? And similarly with in6_pktinfo?

If there is an agreement that the information would useful also for
other things than timestamping, I can try that. If not, I think it
would be better to keep it tied to HW timestamping.
Willem de Bruijn April 27, 2017, 11:38 a.m. UTC | #4
On Thu, Apr 27, 2017 at 6:15 AM, Miroslav Lichvar <mlichvar@redhat.com> wrote:
> Thanks for the comments.
>
> On Wed, Apr 26, 2017 at 07:34:49PM -0400, Willem de Bruijn wrote:
>> > +struct net_device *dev_get_by_napi_id(unsigned int napi_id)
>> > +{
>> > +       struct net_device *dev = NULL;
>> > +       struct napi_struct *napi;
>> > +
>> > +       rcu_read_lock();
>> > +
>> > +       napi = napi_by_id(napi_id);
>> > +       if (napi)
>> > +               dev = napi->dev;
>> > +
>> > +       rcu_read_unlock();
>> > +
>> > +       return dev;
>> > +}
>> > +EXPORT_SYMBOL(dev_get_by_napi_id);
>>
>> Returning dev without holding a reference is not safe. You'll probably
>> have to call this with rcu_read_lock held instead.
>
> How about changing the function to simply return the index instead of
> the device (e.g. dev_get_ifindex_by_napi_id())? Would that be too
> specific?

I suspect so. If you can achieve the same by calling this function
within an rcu read_side critical section, I would do that.

>> >  /*
>> >   * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
>> >   */
>> > @@ -699,8 +719,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
>> >                 empty = 0;
>> >         if (shhwtstamps &&
>> >             (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
>>
>> This information is also informative with software timestamps.
>
> But is it useful and worth the cost? If I have two interfaces and only
> one has HW timestamping, or just one interface which can timestamp
> incoming packets at a limited rate, I would prefer to not waste CPU
> cycles preparing and processing useless data.
>
>> And getting the real iif is definitely useful outside timestamps.
>
> Do you have an example? We have asked that in the original thread,
> but no one suggested anything. For AF_PACKET there is PACKET_ORIGDEV.
> When I was searching the Internet on how to get the index with INET
> sockets, it looked like I was the only one who had this problem :).

Okay. Maybe I'm mistaken. If no one else responds with a use case,
then I agree that we can ignore these cases.

>> An
>> alternative approach is to add versioning to IP_PKTINFO with a new
>> setsockopt IP_PKTINFO_VERSION plus a new struct in_pktinfo_v2
>> that extends in_pktinfo. Just a thought.
>
> The struct would contain both the original and last interface index,
> and the length as well? And similarly with in6_pktinfo?
>
> If there is an agreement that the information would useful also for
> other things than timestamping, I can try that. If not, I think it
> would be better to keep it tied to HW timestamping.

Ack.
diff mbox

Patch

diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index 96f5069..6c07e7c 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -193,6 +193,14 @@  SOF_TIMESTAMPING_OPT_STATS:
   the transmit timestamps, such as how long a certain block of
   data was limited by peer's receiver window.
 
+SOF_TIMESTAMPING_OPT_PKTINFO:
+
+  Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming
+  packets with hardware timestamps. The message contains struct
+  scm_ts_pktinfo, which supplies the index of the real interface
+  which received the packet and its length at layer 2. This option
+  works only if CONFIG_NET_RX_BUSY_POLL is enabled.
+
 New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
 disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
 regardless of the setting of sysctl net.core.tstamp_allow_data.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8c5c8cd..c7ce189 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2451,6 +2451,7 @@  static inline int dev_recursion_level(void)
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 int netdev_get_name(struct net *net, char *name, int ifindex);
 int dev_restart(struct net_device *dev);
 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb);
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 2b48856..a5f6e81 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -100,4 +100,6 @@ 
 
 #define SO_COOKIE		57
 
+#define SCM_TIMESTAMPING_PKTINFO	58
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 0749fb1..8fcae35 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -26,8 +26,9 @@  enum {
 	SOF_TIMESTAMPING_OPT_CMSG = (1<<10),
 	SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
 	SOF_TIMESTAMPING_OPT_STATS = (1<<12),
+	SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
@@ -130,4 +131,10 @@  enum hwtstamp_rx_filters {
 	HWTSTAMP_FILTER_NTP_ALL,
 };
 
+/* SCM_TIMESTAMPING_PKTINFO control message */
+struct scm_ts_pktinfo {
+	int if_index;
+	int pkt_length;
+};
+
 #endif /* _NET_TIMESTAMPING_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 1b3317c..0a78f7f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -160,6 +160,7 @@  static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_info(unsigned long val,
 					 struct net_device *dev,
 					 struct netdev_notifier_info *info);
+static struct napi_struct *napi_by_id(unsigned int napi_id);
 
 /*
  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -863,6 +864,23 @@  struct net_device *dev_get_by_index(struct net *net, int ifindex)
 }
 EXPORT_SYMBOL(dev_get_by_index);
 
+struct net_device *dev_get_by_napi_id(unsigned int napi_id)
+{
+	struct net_device *dev = NULL;
+	struct napi_struct *napi;
+
+	rcu_read_lock();
+
+	napi = napi_by_id(napi_id);
+	if (napi)
+		dev = napi->dev;
+
+	rcu_read_unlock();
+
+	return dev;
+}
+EXPORT_SYMBOL(dev_get_by_napi_id);
+
 /**
  *	netdev_get_name - get a netdevice name, knowing its ifindex.
  *	@net: network namespace
diff --git a/net/socket.c b/net/socket.c
index c2564eb..5ea5f29 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -662,6 +662,26 @@  static bool skb_is_err_queue(const struct sk_buff *skb)
 	return skb->pkt_type == PACKET_OUTGOING;
 }
 
+static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	struct scm_ts_pktinfo ts_pktinfo;
+	struct net_device *orig_dev;
+
+	if (skb->napi_id < MIN_NAPI_ID || !skb_mac_header_was_set(skb))
+		return;
+
+	orig_dev = dev_get_by_napi_id(skb->napi_id);
+	if (!orig_dev)
+		return;
+
+	ts_pktinfo.if_index = orig_dev->ifindex;
+	ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb);
+	put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
+		 sizeof(ts_pktinfo), &ts_pktinfo);
+#endif
+}
+
 /*
  * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
  */
@@ -699,8 +719,12 @@  void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 		empty = 0;
 	if (shhwtstamps &&
 	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
-	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
+	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
 		empty = 0;
+		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
+		    !skb_is_err_queue(skb))
+			put_ts_pktinfo(msg, skb);
+	}
 	if (!empty) {
 		put_cmsg(msg, SOL_SOCKET,
 			 SCM_TIMESTAMPING, sizeof(tss), &tss);