diff mbox series

[ovs-dev,RFC,net-next] openvswitch: add trace points

Message ID 20210527191532.376414-1-aconole@redhat.com
State Awaiting Upstream
Headers show
Series [ovs-dev,RFC,net-next] openvswitch: add trace points | expand

Commit Message

Aaron Conole May 27, 2021, 7:15 p.m. UTC
This makes openvswitch module use the event tracing framework
to log the upcall interface and action execution pipeline.  When
using openvswitch as the packet forwarding engine, some types of
debugging are made possible simply by using the ovs-vswitchd's
ofproto/trace command.  However, such a command has some
limitations:

  1. When trying to trace packets that go through the CT action,
     the state of the packet can't be determined, and probably
     would be potentially wrong.

  2. Deducing problem packets can sometimes be difficult as well
     even if many of the flows are known

  3. It's possible to use the openvswitch module even without
     the ovs-vswitchd (although, not common use).

Introduce the event tracing points here to make it possible for
working through these problems in kernel space.  The style is
copied from the mac80211 driver-trace / trace code for
consistency.

Signed-off-by: Aaron Conole <aconole@redhat.com>
---
 net/openvswitch/Makefile            |   3 +
 net/openvswitch/actions.c           |   4 +
 net/openvswitch/datapath.c          |   7 ++
 net/openvswitch/openvswitch_trace.c |  10 ++
 net/openvswitch/openvswitch_trace.h | 152 ++++++++++++++++++++++++++++
 5 files changed, 176 insertions(+)
 create mode 100644 net/openvswitch/openvswitch_trace.c
 create mode 100644 net/openvswitch/openvswitch_trace.h

Comments

Eelco Chaudron June 4, 2021, 1:31 p.m. UTC | #1
On 27 May 2021, at 21:15, Aaron Conole wrote:

> This makes openvswitch module use the event tracing framework
> to log the upcall interface and action execution pipeline.  When
> using openvswitch as the packet forwarding engine, some types of
> debugging are made possible simply by using the ovs-vswitchd's
> ofproto/trace command.  However, such a command has some
> limitations:
>
>   1. When trying to trace packets that go through the CT action,
>      the state of the packet can't be determined, and probably
>      would be potentially wrong.
>
>   2. Deducing problem packets can sometimes be difficult as well
>      even if many of the flows are known
>
>   3. It's possible to use the openvswitch module even without
>      the ovs-vswitchd (although, not common use).
>
> Introduce the event tracing points here to make it possible for
> working through these problems in kernel space.  The style is
> copied from the mac80211 driver-trace / trace code for
> consistency.


Thanks for doing this Aaron, it will definitely help when trying to debug some customer issues.

Just to be sure, I did some tests to make sure these changes do not impact performance, and it looks fine! See some small nits/comments below, but other than that I would say please re-send as an official patch.

Cheers,

Eelco

> Signed-off-by: Aaron Conole <aconole@redhat.com>
> ---
>  net/openvswitch/Makefile            |   3 +
>  net/openvswitch/actions.c           |   4 +
>  net/openvswitch/datapath.c          |   7 ++
>  net/openvswitch/openvswitch_trace.c |  10 ++
>  net/openvswitch/openvswitch_trace.h | 152 ++++++++++++++++++++++++++++
>  5 files changed, 176 insertions(+)
>  create mode 100644 net/openvswitch/openvswitch_trace.c
>  create mode 100644 net/openvswitch/openvswitch_trace.h
>
> diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
> index 41109c326f3a..28982630bef3 100644
> --- a/net/openvswitch/Makefile
> +++ b/net/openvswitch/Makefile
> @@ -13,6 +13,7 @@ openvswitch-y := \
>  	flow_netlink.o \
>  	flow_table.o \
>  	meter.o \
> +	openvswitch_trace.o \
>  	vport.o \
>  	vport-internal_dev.o \
>  	vport-netdev.o
> @@ -24,3 +25,5 @@ endif
>  obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
>  obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
>  obj-$(CONFIG_OPENVSWITCH_GRE)	+= vport-gre.o
> +
> +CFLAGS_openvswitch_trace.o = -I$(src)
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index d858ea580e43..62285453ca79 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -30,6 +30,7 @@
>  #include "conntrack.h"
>  #include "vport.h"
>  #include "flow_netlink.h"
> +#include "openvswitch_trace.h"
>
>  struct deferred_action {
>  	struct sk_buff *skb;
> @@ -1242,6 +1243,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>  	     a = nla_next(a, &rem)) {
>  		int err = 0;
>
> +		if (trace_openvswitch_probe_action_enabled())
> +			trace_openvswitch_probe_action(dp, skb, key, a, rem);
> +
>  		switch (nla_type(a)) {
>  		case OVS_ACTION_ATTR_OUTPUT: {
>  			int port = nla_get_u32(a);
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 9d6ef6cb9b26..63f19a6ed472 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -43,6 +43,7 @@
>  #include "flow_table.h"
>  #include "flow_netlink.h"
>  #include "meter.h"
> +#include "openvswitch_trace.h"
>  #include "vport-internal_dev.h"
>  #include "vport-netdev.h"
>
> @@ -275,6 +276,12 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
>  	struct dp_stats_percpu *stats;
>  	int err;
>
> +	if (trace_openvswitch_probe_userspace_enabled()) {
> +		struct sw_flow_key ukey = *key;
> +
> +		trace_openvswitch_probe_userspace(dp, skb, &ukey, upcall_info);
> +	}

Rather than work around the const here, can we not fix this in the trace include?

        TP_PROTO(struct datapath *dp, struct sk_buff *skb,
-                struct sw_flow_key *key,
+                const struct sw_flow_key *key,
                 const struct dp_upcall_info *upcall_info),

@@ -97,7 +97,7 @@ TRACE_EVENT(openvswitch_probe_userspace,
                __field(        u16,            gso_type                )
                __field(        u32,            ovs_flow_hash           )
                __field(        u32,            recirc_id               )
-               __field(        void *,         keyaddr                 )
+               __field(        const void *,   keyaddr                 )
                __field(        u16,            key_eth_type            )

> +
>  	if (upcall_info->portid == 0) {
>  		err = -ENOTCONN;
>  		goto err;
> diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c
> new file mode 100644
> index 000000000000..62c5f7d6f023
> --- /dev/null
> +++ b/net/openvswitch/openvswitch_trace.c
> @@ -0,0 +1,10 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* bug in tracepoint.h, it should include this */
> +#include <linux/module.h>
> +
> +/* sparse isn't too happy with all macros... */
> +#ifndef __CHECKER__
> +#define CREATE_TRACE_POINTS
> +#include "openvswitch_trace.h"
> +
> +#endif
> diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h
> new file mode 100644
> index 000000000000..1b350306f622
> --- /dev/null
> +++ b/net/openvswitch/openvswitch_trace.h
> @@ -0,0 +1,152 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM openvswitch
> +
> +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_OPENVSWITCH_H
> +
> +#include <linux/tracepoint.h>
> +
> +#include "datapath.h"
> +

I guess the naming of the events can be a long debate :) But I think just adding the word “probe” doesn’t help much.

Maybe for just for function entry/exit, we could do something like <subsys>_<function_name>_entry/exit. I’ve seen this used in other components. For example:

  openvswitch_probe_userspace -> openvswitch_ovs_dp_upcall_entry

For none entry/exit trace macros, we could either use a specific name, as we have:

  openvswitch_probe_action -> openvswitch_execute_action

Or maybe even prepended by the function name to easily locate it. For example:

  openvswitch_probe_action -> openvswitch_do_execute_actions__execute_action

I like the double underscore to differentiate, so it might even be nice for the __entry, so openvswitch_ovs_dp_upcall__entry.


Maybe we should also change openvswitch to ovs for both system and events? Or is this too short and might cause confusion?

> +TRACE_EVENT(openvswitch_probe_action,
> +
> +	TP_PROTO(struct datapath *dp, struct sk_buff *skb,
> +		 struct sw_flow_key *key, const struct nlattr *a, int rem),
> +
> +	TP_ARGS(dp, skb, key, a, rem),
> +
> +	TP_STRUCT__entry(
> +		__field(	void *,		dpaddr			)
> +		__string(	dp_name,	ovs_dp_name(dp)		)
> +		__string(	dev_name,	skb->dev->name		)
> +		__field(	void *,		skbaddr			)
> +		__field(	unsigned int,	len			)
> +		__field(	unsigned int,	data_len		)
> +		__field(	unsigned int,	truesize		)
> +		__field(	u8,		nr_frags		)
> +		__field(	u16,		gso_size		)
> +		__field(	u16,		gso_type		)
> +		__field(	u32,		ovs_flow_hash		)
> +		__field(	u32,		recirc_id		)
> +		__field(	void *,		keyaddr			)
> +		__field(	u16,		key_eth_type		)
> +		__field(	u8,		key_ct_state		)
> +		__field(	u8,		key_ct_orig_proto	)
> +		__field(	unsigned int,	flow_key_valid		)
> +		__field(	u8,		action_type		)
> +		__field(	unsigned int,	action_len		)
> +		__field(	void *,		action_data		)
> +		__field(	u8,		is_last			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dpaddr = dp;
> +		__assign_str(dp_name, ovs_dp_name(dp));
> +		__assign_str(dev_name, skb->dev->name);
> +		__entry->skbaddr = skb;
> +		__entry->len = skb->len;
> +		__entry->data_len = skb->data_len;
> +		__entry->truesize = skb->truesize;
> +		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
> +		__entry->gso_size = skb_shinfo(skb)->gso_size;
> +		__entry->gso_type = skb_shinfo(skb)->gso_type;
> +		__entry->ovs_flow_hash = key->ovs_flow_hash;
> +		__entry->recirc_id = key->recirc_id;
> +		__entry->keyaddr = key;
> +		__entry->key_eth_type = key->eth.type;
> +		__entry->key_ct_state = key->ct_state;
> +		__entry->key_ct_orig_proto = key->ct_orig_proto;
> +		__entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID);
> +		__entry->action_type = nla_type(a);
> +		__entry->action_len = nla_len(a);
> +		__entry->action_data = nla_data(a);
> +		__entry->is_last = nla_is_last(a, rem);
> +	),
> +
> +	TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d action_type=%u action_len=%u action_data=%p is_last=%d",

I guess %p will be good enough here, as I see no other trace modules use %px. But I guess, for some of the values %*ph might be useful, as it will dump the bytes, i.e., action_data.

> +		  __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
> +		  __entry->skbaddr, __entry->len, __entry->data_len,
> +		  __entry->truesize, __entry->nr_frags, __entry->gso_size,
> +		  __entry->gso_type, __entry->ovs_flow_hash,
> +		  __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
> +		  __entry->key_ct_state, __entry->key_ct_orig_proto,
> +		  __entry->flow_key_valid,
> +		  __entry->action_type, __entry->action_len,
> +		  __entry->action_data, __entry->is_last)
> +);
> +
> +TRACE_EVENT(openvswitch_probe_userspace,
> +
> +	TP_PROTO(struct datapath *dp, struct sk_buff *skb,
> +		 struct sw_flow_key *key,
> +		 const struct dp_upcall_info *upcall_info),
> +
> +	TP_ARGS(dp, skb, key, upcall_info),
> +
> +	TP_STRUCT__entry(
> +		__field(	void *,		dpaddr			)
> +		__string(	dp_name,	ovs_dp_name(dp)		)
> +		__string(	dev_name,	skb->dev->name		)
> +		__field(	void *,		skbaddr			)
> +		__field(	unsigned int,	len			)
> +		__field(	unsigned int,	data_len		)
> +		__field(	unsigned int,	truesize		)
> +		__field(	u8,		nr_frags		)
> +		__field(	u16,		gso_size		)
> +		__field(	u16,		gso_type		)
> +		__field(	u32,		ovs_flow_hash		)
> +		__field(	u32,		recirc_id		)
> +		__field(	void *,		keyaddr			)
> +		__field(	u16,		key_eth_type		)
> +		__field(	u8,		key_ct_state		)
> +		__field(	u8,		key_ct_orig_proto	)
> +		__field(	unsigned int,	flow_key_valid		)
> +		__field(	u8,		upcall_cmd		)
> +		__field(	u32,		upcall_port		)
> +		__field(	u16,		upcall_mru		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dpaddr = dp;
> +		__assign_str(dp_name, ovs_dp_name(dp));
> +		__assign_str(dev_name, skb->dev->name);
> +		__entry->skbaddr = skb;
> +		__entry->len = skb->len;
> +		__entry->data_len = skb->data_len;
> +		__entry->truesize = skb->truesize;
> +		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
> +		__entry->gso_size = skb_shinfo(skb)->gso_size;
> +		__entry->gso_type = skb_shinfo(skb)->gso_type;
> +		__entry->ovs_flow_hash = key->ovs_flow_hash;
> +		__entry->recirc_id = key->recirc_id;
> +		__entry->keyaddr = key;
> +		__entry->key_eth_type = key->eth.type;
> +		__entry->key_ct_state = key->ct_state;
> +		__entry->key_ct_orig_proto = key->ct_orig_proto;
> +		__entry->flow_key_valid =  !(key->mac_proto & SW_FLOW_KEY_INVALID);
> +		__entry->upcall_cmd = upcall_info->cmd;
> +		__entry->upcall_port = upcall_info->portid;
> +		__entry->upcall_mru = upcall_info->mru;
> +	),
> +
> +	TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u",

See %p comment above.

> +		  __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
> +		  __entry->skbaddr, __entry->len, __entry->data_len,
> +		  __entry->truesize, __entry->nr_frags, __entry->gso_size,
> +		  __entry->gso_type, __entry->ovs_flow_hash,
> +		  __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
> +		  __entry->key_ct_state, __entry->key_ct_orig_proto,
> +		  __entry->flow_key_valid,
> +		  __entry->upcall_cmd, __entry->upcall_port,
> +		  __entry->upcall_mru)
> +);
> +
> +#endif /* _TRACE_OPENVSWITCH_H */
> +
> +/* This part must be outside protection */
> +#undef TRACE_INCLUDE_PATH
> +#define TRACE_INCLUDE_PATH .
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE openvswitch_trace
> +#include <trace/define_trace.h>
> -- 
> 2.31.1
Aaron Conole June 14, 2021, 1:28 p.m. UTC | #2
Eelco Chaudron <echaudro@redhat.com> writes:

> On 27 May 2021, at 21:15, Aaron Conole wrote:
>
>  This makes openvswitch module use the event tracing framework 
>  to log the upcall interface and action execution pipeline. When 
>  using openvswitch as the packet forwarding engine, some types of 
>  debugging are made possible simply by using the ovs-vswitchd's 
>  ofproto/trace command. However, such a command has some 
>  limitations:
>
>  1. When trying to trace packets that go through the CT action, 
>  the state of the packet can't be determined, and probably 
>  would be potentially wrong.
>
>  2. Deducing problem packets can sometimes be difficult as well 
>  even if many of the flows are known
>
>  3. It's possible to use the openvswitch module even without 
>  the ovs-vswitchd (although, not common use).
>
>  Introduce the event tracing points here to make it possible for 
>  working through these problems in kernel space. The style is 
>  copied from the mac80211 driver-trace / trace code for 
>  consistency.
>
> Thanks for doing this Aaron, it will definitely help when trying to debug some customer issues.

Thanks for the review!

> Just to be sure, I did some tests to make sure these changes do not impact performance, and it looks fine! See some
> small nits/comments below, but other than that I would say please re-send as an official patch.
>
> Cheers,
>
> Eelco
>
>  Signed-off-by: Aaron Conole <aconole@redhat.com> 
>  --- 
>  net/openvswitch/Makefile | 3 + 
>  net/openvswitch/actions.c | 4 + 
>  net/openvswitch/datapath.c | 7 ++ 
>  net/openvswitch/openvswitch_trace.c | 10 ++ 
>  net/openvswitch/openvswitch_trace.h | 152 ++++++++++++++++++++++++++++ 
>  5 files changed, 176 insertions(+) 
>  create mode 100644 net/openvswitch/openvswitch_trace.c 
>  create mode 100644 net/openvswitch/openvswitch_trace.h
>
>  diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile 
>  index 41109c326f3a..28982630bef3 100644 
>  --- a/net/openvswitch/Makefile 
>  +++ b/net/openvswitch/Makefile 
>  @@ -13,6 +13,7 @@ openvswitch-y := \ 
>  flow_netlink.o \ 
>  flow_table.o \ 
>  meter.o \ 
>  + openvswitch_trace.o \ 
>  vport.o \ 
>  vport-internal_dev.o \ 
>  vport-netdev.o 
>  @@ -24,3 +25,5 @@ endif 
>  obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o 
>  obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o 
>  obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o 
>  + 
>  +CFLAGS_openvswitch_trace.o = -I$(src) 
>  diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c 
>  index d858ea580e43..62285453ca79 100644 
>  --- a/net/openvswitch/actions.c 
>  +++ b/net/openvswitch/actions.c 
>  @@ -30,6 +30,7 @@ 
>  #include "conntrack.h" 
>  #include "vport.h" 
>  #include "flow_netlink.h" 
>  +#include "openvswitch_trace.h"
>
>  struct deferred_action { 
>  struct sk_buff *skb; 
>  @@ -1242,6 +1243,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, 
>  a = nla_next(a, &rem)) { 
>  int err = 0;
>
>  + if (trace_openvswitch_probe_action_enabled()) 
>  + trace_openvswitch_probe_action(dp, skb, key, a, rem); 
>  + 
>  switch (nla_type(a)) { 
>  case OVS_ACTION_ATTR_OUTPUT: { 
>  int port = nla_get_u32(a); 
>  diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c 
>  index 9d6ef6cb9b26..63f19a6ed472 100644 
>  --- a/net/openvswitch/datapath.c 
>  +++ b/net/openvswitch/datapath.c 
>  @@ -43,6 +43,7 @@ 
>  #include "flow_table.h" 
>  #include "flow_netlink.h" 
>  #include "meter.h" 
>  +#include "openvswitch_trace.h" 
>  #include "vport-internal_dev.h" 
>  #include "vport-netdev.h"
>
>  @@ -275,6 +276,12 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, 
>  struct dp_stats_percpu *stats; 
>  int err;
>
>  + if (trace_openvswitch_probe_userspace_enabled()) { 
>  + struct sw_flow_key ukey = *key; 
>  + 
>  + trace_openvswitch_probe_userspace(dp, skb, &ukey, upcall_info); 
>  + }
>
> Rather than work around the const here, can we not fix this in the trace include?

Done.

>     TP_PROTO(struct datapath *dp, struct sk_buff *skb,
>
> *            struct sw_flow_key *key,
>
> *            const struct sw_flow_key *key,
>            const struct dp_upcall_info *upcall_info),
>
> @@ -97,7 +97,7 @@ TRACE_EVENT(openvswitch_probe_userspace,
> __field( u16, gso_type )
> __field( u32, ovs_flow_hash )
> __field( u32, recirc_id )
>
> *           __field(        void *,         keyaddr                 )
>
> *           __field(        const void *,   keyaddr                 )
>           __field(        u16,            key_eth_type            )
>
>  + 
>  if (upcall_info->portid == 0) { 
>  err = -ENOTCONN; 
>  goto err; 
>  diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c 
>  new file mode 100644 
>  index 000000000000..62c5f7d6f023 
>  --- /dev/null 
>  +++ b/net/openvswitch/openvswitch_trace.c 
>  @@ -0,0 +1,10 @@ 
>  +// SPDX-License-Identifier: GPL-2.0 
>  +/* bug in tracepoint.h, it should include this */ 
>  +#include <linux/module.h> 
>  + 
>  +/* sparse isn't too happy with all macros... */ 
>  +#ifndef __CHECKER__ 
>  +#define CREATE_TRACE_POINTS 
>  +#include "openvswitch_trace.h" 
>  + 
>  +#endif 
>  diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h 
>  new file mode 100644 
>  index 000000000000..1b350306f622 
>  --- /dev/null 
>  +++ b/net/openvswitch/openvswitch_trace.h 
>  @@ -0,0 +1,152 @@ 
>  +/* SPDX-License-Identifier: GPL-2.0 */ 
>  +#undef TRACE_SYSTEM 
>  +#define TRACE_SYSTEM openvswitch 
>  + 
>  +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ) 
>  +#define _TRACE_OPENVSWITCH_H 
>  + 
>  +#include <linux/tracepoint.h> 
>  + 
>  +#include "datapath.h" 
>  +
>
> I guess the naming of the events can be a long debate :) But I think just adding the word “probe” doesn’t help much.
>
> Maybe for just for function entry/exit, we could do something like <subsys>_<function_name>_entry/exit. I’ve seen
> this used in other components. For example:
>
> openvswitch_probe_userspace -> openvswitch_ovs_dp_upcall_entry
>
> For none entry/exit trace macros, we could either use a specific name, as we have:
>
> openvswitch_probe_action -> openvswitch_execute_action
>
> Or maybe even prepended by the function name to easily locate it. For example:
>
> openvswitch_probe_action -> openvswitch_do_execute_actions__execute_action
>
> I like the double underscore to differentiate, so it might even be nice for the __entry, so
> openvswitch_ovs_dp_upcall__entry.
>
> Maybe we should also change openvswitch to ovs for both system and events? Or is this too short and might cause
> confusion?

I agree that we can make these more descriptive, and that 'probe'
doesn't add much.  OTOH, if I use
"openvswitch_do_execute_actions__execute_action" or the corresponding
upcall trace point name, the line lengths balloon.  I don't see that
it's as useful to say both 'do_execute_actions' and '__execute_action' -
I don't think there will ever be a reason to add an additional
tracepoint here since the bulk of processing happens during the action
loop.  But maybe something like:

  'openvswitch_execute_action'

and

  'openvswitch_dp_upcall'

?

Just want to get this part right.

>  +TRACE_EVENT(openvswitch_probe_action, 
>  + 
>  + TP_PROTO(struct datapath *dp, struct sk_buff *skb, 
>  + struct sw_flow_key *key, const struct nlattr *a, int rem), 
>  + 
>  + TP_ARGS(dp, skb, key, a, rem), 
>  + 
>  + TP_STRUCT__entry( 
>  + __field( void *, dpaddr ) 
>  + __string( dp_name, ovs_dp_name(dp) ) 
>  + __string( dev_name, skb->dev->name ) 
>  + __field( void *, skbaddr ) 
>  + __field( unsigned int, len ) 
>  + __field( unsigned int, data_len ) 
>  + __field( unsigned int, truesize ) 
>  + __field( u8, nr_frags ) 
>  + __field( u16, gso_size ) 
>  + __field( u16, gso_type ) 
>  + __field( u32, ovs_flow_hash ) 
>  + __field( u32, recirc_id ) 
>  + __field( void *, keyaddr ) 
>  + __field( u16, key_eth_type ) 
>  + __field( u8, key_ct_state ) 
>  + __field( u8, key_ct_orig_proto ) 
>  + __field( unsigned int, flow_key_valid ) 
>  + __field( u8, action_type ) 
>  + __field( unsigned int, action_len ) 
>  + __field( void *, action_data ) 
>  + __field( u8, is_last ) 
>  + ), 
>  + 
>  + TP_fast_assign( 
>  + __entry->dpaddr = dp; 
>  + __assign_str(dp_name, ovs_dp_name(dp)); 
>  + __assign_str(dev_name, skb->dev->name); 
>  + __entry->skbaddr = skb; 
>  + __entry->len = skb->len; 
>  + __entry->data_len = skb->data_len; 
>  + __entry->truesize = skb->truesize; 
>  + __entry->nr_frags = skb_shinfo(skb)->nr_frags; 
>  + __entry->gso_size = skb_shinfo(skb)->gso_size; 
>  + __entry->gso_type = skb_shinfo(skb)->gso_type; 
>  + __entry->ovs_flow_hash = key->ovs_flow_hash; 
>  + __entry->recirc_id = key->recirc_id; 
>  + __entry->keyaddr = key; 
>  + __entry->key_eth_type = key->eth.type; 
>  + __entry->key_ct_state = key->ct_state; 
>  + __entry->key_ct_orig_proto = key->ct_orig_proto; 
>  + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); 
>  + __entry->action_type = nla_type(a); 
>  + __entry->action_len = nla_len(a); 
>  + __entry->action_data = nla_data(a); 
>  + __entry->is_last = nla_is_last(a, rem); 
>  + ), 
>  + 
>  + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d
>  gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x
>  ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d action_type=%u action_len=%u action_data=%p
>  is_last=%d",
>
> I guess %p will be good enough here, as I see no other trace modules use %px. But I guess, for some of the values
> %*ph might be useful, as it will dump the bytes, i.e., action_data.

The issue with using %*ph for action_data is in the cases where a
netlink attribute doesn't have data.  In that case, we don't have any
bytes to print, and I think the behavior would be undefined (see
ct_clear action, or pop_eth action, etc).

>  + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), 
>  + __entry->skbaddr, __entry->len, __entry->data_len, 
>  + __entry->truesize, __entry->nr_frags, __entry->gso_size, 
>  + __entry->gso_type, __entry->ovs_flow_hash, 
>  + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, 
>  + __entry->key_ct_state, __entry->key_ct_orig_proto, 
>  + __entry->flow_key_valid, 
>  + __entry->action_type, __entry->action_len, 
>  + __entry->action_data, __entry->is_last) 
>  +); 
>  + 
>  +TRACE_EVENT(openvswitch_probe_userspace, 
>  + 
>  + TP_PROTO(struct datapath *dp, struct sk_buff *skb, 
>  + struct sw_flow_key *key, 
>  + const struct dp_upcall_info *upcall_info), 
>  + 
>  + TP_ARGS(dp, skb, key, upcall_info), 
>  + 
>  + TP_STRUCT__entry( 
>  + __field( void *, dpaddr ) 
>  + __string( dp_name, ovs_dp_name(dp) ) 
>  + __string( dev_name, skb->dev->name ) 
>  + __field( void *, skbaddr ) 
>  + __field( unsigned int, len ) 
>  + __field( unsigned int, data_len ) 
>  + __field( unsigned int, truesize ) 
>  + __field( u8, nr_frags ) 
>  + __field( u16, gso_size ) 
>  + __field( u16, gso_type ) 
>  + __field( u32, ovs_flow_hash ) 
>  + __field( u32, recirc_id ) 
>  + __field( void *, keyaddr ) 
>  + __field( u16, key_eth_type ) 
>  + __field( u8, key_ct_state ) 
>  + __field( u8, key_ct_orig_proto ) 
>  + __field( unsigned int, flow_key_valid ) 
>  + __field( u8, upcall_cmd ) 
>  + __field( u32, upcall_port ) 
>  + __field( u16, upcall_mru ) 
>  + ), 
>  + 
>  + TP_fast_assign( 
>  + __entry->dpaddr = dp; 
>  + __assign_str(dp_name, ovs_dp_name(dp)); 
>  + __assign_str(dev_name, skb->dev->name); 
>  + __entry->skbaddr = skb; 
>  + __entry->len = skb->len; 
>  + __entry->data_len = skb->data_len; 
>  + __entry->truesize = skb->truesize; 
>  + __entry->nr_frags = skb_shinfo(skb)->nr_frags; 
>  + __entry->gso_size = skb_shinfo(skb)->gso_size; 
>  + __entry->gso_type = skb_shinfo(skb)->gso_type; 
>  + __entry->ovs_flow_hash = key->ovs_flow_hash; 
>  + __entry->recirc_id = key->recirc_id; 
>  + __entry->keyaddr = key; 
>  + __entry->key_eth_type = key->eth.type; 
>  + __entry->key_ct_state = key->ct_state; 
>  + __entry->key_ct_orig_proto = key->ct_orig_proto; 
>  + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); 
>  + __entry->upcall_cmd = upcall_info->cmd; 
>  + __entry->upcall_port = upcall_info->portid; 
>  + __entry->upcall_mru = upcall_info->mru; 
>  + ), 
>  + 
>  + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d
>  gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x
>  ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u",
>
> See %p comment above.

Same applies.

>  + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), 
>  + __entry->skbaddr, __entry->len, __entry->data_len, 
>  + __entry->truesize, __entry->nr_frags, __entry->gso_size, 
>  + __entry->gso_type, __entry->ovs_flow_hash, 
>  + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, 
>  + __entry->key_ct_state, __entry->key_ct_orig_proto, 
>  + __entry->flow_key_valid, 
>  + __entry->upcall_cmd, __entry->upcall_port, 
>  + __entry->upcall_mru) 
>  +); 
>  + 
>  +#endif /* _TRACE_OPENVSWITCH_H */ 
>  + 
>  +/* This part must be outside protection */ 
>  +#undef TRACE_INCLUDE_PATH 
>  +#define TRACE_INCLUDE_PATH . 
>  +#undef TRACE_INCLUDE_FILE 
>  +#define TRACE_INCLUDE_FILE openvswitch_trace 
>  +#include <trace/define_trace.h> 
>  -- 
>  2.31.1
Dumitru Ceara June 21, 2021, 11:02 a.m. UTC | #3
On 5/27/21 9:15 PM, Aaron Conole wrote:
> This makes openvswitch module use the event tracing framework
> to log the upcall interface and action execution pipeline.  When
> using openvswitch as the packet forwarding engine, some types of
> debugging are made possible simply by using the ovs-vswitchd's
> ofproto/trace command.  However, such a command has some
> limitations:
> 
>   1. When trying to trace packets that go through the CT action,
>      the state of the packet can't be determined, and probably
>      would be potentially wrong.
> 
>   2. Deducing problem packets can sometimes be difficult as well
>      even if many of the flows are known
> 
>   3. It's possible to use the openvswitch module even without
>      the ovs-vswitchd (although, not common use).
> 
> Introduce the event tracing points here to make it possible for
> working through these problems in kernel space.  The style is
> copied from the mac80211 driver-trace / trace code for
> consistency.
> 
> Signed-off-by: Aaron Conole <aconole@redhat.com>
> ---

Hi Aaron,

>  net/openvswitch/Makefile            |   3 +
>  net/openvswitch/actions.c           |   4 +
>  net/openvswitch/datapath.c          |   7 ++
>  net/openvswitch/openvswitch_trace.c |  10 ++
>  net/openvswitch/openvswitch_trace.h | 152 ++++++++++++++++++++++++++++
>  5 files changed, 176 insertions(+)
>  create mode 100644 net/openvswitch/openvswitch_trace.c
>  create mode 100644 net/openvswitch/openvswitch_trace.h
> 
> diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
> index 41109c326f3a..28982630bef3 100644
> --- a/net/openvswitch/Makefile
> +++ b/net/openvswitch/Makefile
> @@ -13,6 +13,7 @@ openvswitch-y := \
>  	flow_netlink.o \
>  	flow_table.o \
>  	meter.o \
> +	openvswitch_trace.o \
>  	vport.o \
>  	vport-internal_dev.o \
>  	vport-netdev.o
> @@ -24,3 +25,5 @@ endif
>  obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
>  obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
>  obj-$(CONFIG_OPENVSWITCH_GRE)	+= vport-gre.o
> +
> +CFLAGS_openvswitch_trace.o = -I$(src)
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index d858ea580e43..62285453ca79 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -30,6 +30,7 @@
>  #include "conntrack.h"
>  #include "vport.h"
>  #include "flow_netlink.h"
> +#include "openvswitch_trace.h"
>  
>  struct deferred_action {
>  	struct sk_buff *skb;
> @@ -1242,6 +1243,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>  	     a = nla_next(a, &rem)) {
>  		int err = 0;
>  
> +		if (trace_openvswitch_probe_action_enabled())
> +			trace_openvswitch_probe_action(dp, skb, key, a, rem);
> +
>  		switch (nla_type(a)) {
>  		case OVS_ACTION_ATTR_OUTPUT: {
>  			int port = nla_get_u32(a);
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 9d6ef6cb9b26..63f19a6ed472 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -43,6 +43,7 @@
>  #include "flow_table.h"
>  #include "flow_netlink.h"
>  #include "meter.h"
> +#include "openvswitch_trace.h"
>  #include "vport-internal_dev.h"
>  #include "vport-netdev.h"
>  
> @@ -275,6 +276,12 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
>  	struct dp_stats_percpu *stats;
>  	int err;
>  
> +	if (trace_openvswitch_probe_userspace_enabled()) {
> +		struct sw_flow_key ukey = *key;
> +
> +		trace_openvswitch_probe_userspace(dp, skb, &ukey, upcall_info);
> +	}
> +
>  	if (upcall_info->portid == 0) {
>  		err = -ENOTCONN;
>  		goto err;
> diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c
> new file mode 100644
> index 000000000000..62c5f7d6f023
> --- /dev/null
> +++ b/net/openvswitch/openvswitch_trace.c
> @@ -0,0 +1,10 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* bug in tracepoint.h, it should include this */
> +#include <linux/module.h>
> +
> +/* sparse isn't too happy with all macros... */
> +#ifndef __CHECKER__
> +#define CREATE_TRACE_POINTS
> +#include "openvswitch_trace.h"
> +
> +#endif
> diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h
> new file mode 100644
> index 000000000000..1b350306f622
> --- /dev/null
> +++ b/net/openvswitch/openvswitch_trace.h
> @@ -0,0 +1,152 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM openvswitch
> +
> +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_OPENVSWITCH_H
> +
> +#include <linux/tracepoint.h>
> +
> +#include "datapath.h"
> +
> +TRACE_EVENT(openvswitch_probe_action,
> +
> +	TP_PROTO(struct datapath *dp, struct sk_buff *skb,
> +		 struct sw_flow_key *key, const struct nlattr *a, int rem),
> +
> +	TP_ARGS(dp, skb, key, a, rem),
> +
> +	TP_STRUCT__entry(
> +		__field(	void *,		dpaddr			)
> +		__string(	dp_name,	ovs_dp_name(dp)		)
> +		__string(	dev_name,	skb->dev->name		)
> +		__field(	void *,		skbaddr			)
> +		__field(	unsigned int,	len			)
> +		__field(	unsigned int,	data_len		)
> +		__field(	unsigned int,	truesize		)
> +		__field(	u8,		nr_frags		)
> +		__field(	u16,		gso_size		)
> +		__field(	u16,		gso_type		)
> +		__field(	u32,		ovs_flow_hash		)
> +		__field(	u32,		recirc_id		)
> +		__field(	void *,		keyaddr			)
> +		__field(	u16,		key_eth_type		)
> +		__field(	u8,		key_ct_state		)
> +		__field(	u8,		key_ct_orig_proto	)

I think tracing ct_zone would be useful too.

> +		__field(	unsigned int,	flow_key_valid		)
> +		__field(	u8,		action_type		)
> +		__field(	unsigned int,	action_len		)
> +		__field(	void *,		action_data		)
> +		__field(	u8,		is_last			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dpaddr = dp;
> +		__assign_str(dp_name, ovs_dp_name(dp));
> +		__assign_str(dev_name, skb->dev->name);
> +		__entry->skbaddr = skb;
> +		__entry->len = skb->len;
> +		__entry->data_len = skb->data_len;
> +		__entry->truesize = skb->truesize;
> +		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
> +		__entry->gso_size = skb_shinfo(skb)->gso_size;
> +		__entry->gso_type = skb_shinfo(skb)->gso_type;
> +		__entry->ovs_flow_hash = key->ovs_flow_hash;
> +		__entry->recirc_id = key->recirc_id;
> +		__entry->keyaddr = key;
> +		__entry->key_eth_type = key->eth.type;
> +		__entry->key_ct_state = key->ct_state;
> +		__entry->key_ct_orig_proto = key->ct_orig_proto;
> +		__entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID);
> +		__entry->action_type = nla_type(a);
> +		__entry->action_len = nla_len(a);
> +		__entry->action_data = nla_data(a);
> +		__entry->is_last = nla_is_last(a, rem);
> +	),
> +
> +	TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d action_type=%u action_len=%u action_data=%p is_last=%d",
> +		  __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
> +		  __entry->skbaddr, __entry->len, __entry->data_len,
> +		  __entry->truesize, __entry->nr_frags, __entry->gso_size,
> +		  __entry->gso_type, __entry->ovs_flow_hash,
> +		  __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
> +		  __entry->key_ct_state, __entry->key_ct_orig_proto,
> +		  __entry->flow_key_valid,
> +		  __entry->action_type, __entry->action_len,
> +		  __entry->action_data, __entry->is_last)
> +);
> +
> +TRACE_EVENT(openvswitch_probe_userspace,
> +
> +	TP_PROTO(struct datapath *dp, struct sk_buff *skb,
> +		 struct sw_flow_key *key,
> +		 const struct dp_upcall_info *upcall_info),
> +
> +	TP_ARGS(dp, skb, key, upcall_info),
> +
> +	TP_STRUCT__entry(
> +		__field(	void *,		dpaddr			)
> +		__string(	dp_name,	ovs_dp_name(dp)		)
> +		__string(	dev_name,	skb->dev->name		)
> +		__field(	void *,		skbaddr			)
> +		__field(	unsigned int,	len			)
> +		__field(	unsigned int,	data_len		)
> +		__field(	unsigned int,	truesize		)
> +		__field(	u8,		nr_frags		)
> +		__field(	u16,		gso_size		)
> +		__field(	u16,		gso_type		)
> +		__field(	u32,		ovs_flow_hash		)
> +		__field(	u32,		recirc_id		)
> +		__field(	void *,		keyaddr			)
> +		__field(	u16,		key_eth_type		)
> +		__field(	u8,		key_ct_state		)
> +		__field(	u8,		key_ct_orig_proto	)

Same here.

Thanks,
Dumitru

> +		__field(	unsigned int,	flow_key_valid		)
> +		__field(	u8,		upcall_cmd		)
> +		__field(	u32,		upcall_port		)
> +		__field(	u16,		upcall_mru		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dpaddr = dp;
> +		__assign_str(dp_name, ovs_dp_name(dp));
> +		__assign_str(dev_name, skb->dev->name);
> +		__entry->skbaddr = skb;
> +		__entry->len = skb->len;
> +		__entry->data_len = skb->data_len;
> +		__entry->truesize = skb->truesize;
> +		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
> +		__entry->gso_size = skb_shinfo(skb)->gso_size;
> +		__entry->gso_type = skb_shinfo(skb)->gso_type;
> +		__entry->ovs_flow_hash = key->ovs_flow_hash;
> +		__entry->recirc_id = key->recirc_id;
> +		__entry->keyaddr = key;
> +		__entry->key_eth_type = key->eth.type;
> +		__entry->key_ct_state = key->ct_state;
> +		__entry->key_ct_orig_proto = key->ct_orig_proto;
> +		__entry->flow_key_valid =  !(key->mac_proto & SW_FLOW_KEY_INVALID);
> +		__entry->upcall_cmd = upcall_info->cmd;
> +		__entry->upcall_port = upcall_info->portid;
> +		__entry->upcall_mru = upcall_info->mru;
> +	),
> +
> +	TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u",
> +		  __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
> +		  __entry->skbaddr, __entry->len, __entry->data_len,
> +		  __entry->truesize, __entry->nr_frags, __entry->gso_size,
> +		  __entry->gso_type, __entry->ovs_flow_hash,
> +		  __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
> +		  __entry->key_ct_state, __entry->key_ct_orig_proto,
> +		  __entry->flow_key_valid,
> +		  __entry->upcall_cmd, __entry->upcall_port,
> +		  __entry->upcall_mru)
> +);
> +
> +#endif /* _TRACE_OPENVSWITCH_H */
> +
> +/* This part must be outside protection */
> +#undef TRACE_INCLUDE_PATH
> +#define TRACE_INCLUDE_PATH .
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE openvswitch_trace
> +#include <trace/define_trace.h>
>
Aaron Conole June 21, 2021, 11:59 a.m. UTC | #4
Dumitru Ceara <dceara@redhat.com> writes:

> On 5/27/21 9:15 PM, Aaron Conole wrote:
>> This makes openvswitch module use the event tracing framework
>> to log the upcall interface and action execution pipeline.  When
>> using openvswitch as the packet forwarding engine, some types of
>> debugging are made possible simply by using the ovs-vswitchd's
>> ofproto/trace command.  However, such a command has some
>> limitations:
>> 
>>   1. When trying to trace packets that go through the CT action,
>>      the state of the packet can't be determined, and probably
>>      would be potentially wrong.
>> 
>>   2. Deducing problem packets can sometimes be difficult as well
>>      even if many of the flows are known
>> 
>>   3. It's possible to use the openvswitch module even without
>>      the ovs-vswitchd (although, not common use).
>> 
>> Introduce the event tracing points here to make it possible for
>> working through these problems in kernel space.  The style is
>> copied from the mac80211 driver-trace / trace code for
>> consistency.
>> 
>> Signed-off-by: Aaron Conole <aconole@redhat.com>
>> ---
>
> Hi Aaron,

Hi Dumitru,

>>  net/openvswitch/Makefile            |   3 +
>>  net/openvswitch/actions.c           |   4 +
>>  net/openvswitch/datapath.c          |   7 ++
>>  net/openvswitch/openvswitch_trace.c |  10 ++
>>  net/openvswitch/openvswitch_trace.h | 152 ++++++++++++++++++++++++++++
>>  5 files changed, 176 insertions(+)
>>  create mode 100644 net/openvswitch/openvswitch_trace.c
>>  create mode 100644 net/openvswitch/openvswitch_trace.h
>> 
>> diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
>> index 41109c326f3a..28982630bef3 100644
>> --- a/net/openvswitch/Makefile
>> +++ b/net/openvswitch/Makefile
>> @@ -13,6 +13,7 @@ openvswitch-y := \
>>  	flow_netlink.o \
>>  	flow_table.o \
>>  	meter.o \
>> +	openvswitch_trace.o \
>>  	vport.o \
>>  	vport-internal_dev.o \
>>  	vport-netdev.o
>> @@ -24,3 +25,5 @@ endif
>>  obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
>>  obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
>>  obj-$(CONFIG_OPENVSWITCH_GRE)	+= vport-gre.o
>> +
>> +CFLAGS_openvswitch_trace.o = -I$(src)
>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>> index d858ea580e43..62285453ca79 100644
>> --- a/net/openvswitch/actions.c
>> +++ b/net/openvswitch/actions.c
>> @@ -30,6 +30,7 @@
>>  #include "conntrack.h"
>>  #include "vport.h"
>>  #include "flow_netlink.h"
>> +#include "openvswitch_trace.h"
>>  
>>  struct deferred_action {
>>  	struct sk_buff *skb;
>> @@ -1242,6 +1243,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>>  	     a = nla_next(a, &rem)) {
>>  		int err = 0;
>>  
>> +		if (trace_openvswitch_probe_action_enabled())
>> +			trace_openvswitch_probe_action(dp, skb, key, a, rem);
>> +
>>  		switch (nla_type(a)) {
>>  		case OVS_ACTION_ATTR_OUTPUT: {
>>  			int port = nla_get_u32(a);
>> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
>> index 9d6ef6cb9b26..63f19a6ed472 100644
>> --- a/net/openvswitch/datapath.c
>> +++ b/net/openvswitch/datapath.c
>> @@ -43,6 +43,7 @@
>>  #include "flow_table.h"
>>  #include "flow_netlink.h"
>>  #include "meter.h"
>> +#include "openvswitch_trace.h"
>>  #include "vport-internal_dev.h"
>>  #include "vport-netdev.h"
>>  
>> @@ -275,6 +276,12 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
>>  	struct dp_stats_percpu *stats;
>>  	int err;
>>  
>> +	if (trace_openvswitch_probe_userspace_enabled()) {
>> +		struct sw_flow_key ukey = *key;
>> +
>> +		trace_openvswitch_probe_userspace(dp, skb, &ukey, upcall_info);
>> +	}
>> +
>>  	if (upcall_info->portid == 0) {
>>  		err = -ENOTCONN;
>>  		goto err;
>> diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c
>> new file mode 100644
>> index 000000000000..62c5f7d6f023
>> --- /dev/null
>> +++ b/net/openvswitch/openvswitch_trace.c
>> @@ -0,0 +1,10 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* bug in tracepoint.h, it should include this */
>> +#include <linux/module.h>
>> +
>> +/* sparse isn't too happy with all macros... */
>> +#ifndef __CHECKER__
>> +#define CREATE_TRACE_POINTS
>> +#include "openvswitch_trace.h"
>> +
>> +#endif
>> diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h
>> new file mode 100644
>> index 000000000000..1b350306f622
>> --- /dev/null
>> +++ b/net/openvswitch/openvswitch_trace.h
>> @@ -0,0 +1,152 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#undef TRACE_SYSTEM
>> +#define TRACE_SYSTEM openvswitch
>> +
>> +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ)
>> +#define _TRACE_OPENVSWITCH_H
>> +
>> +#include <linux/tracepoint.h>
>> +
>> +#include "datapath.h"
>> +
>> +TRACE_EVENT(openvswitch_probe_action,
>> +
>> +	TP_PROTO(struct datapath *dp, struct sk_buff *skb,
>> +		 struct sw_flow_key *key, const struct nlattr *a, int rem),
>> +
>> +	TP_ARGS(dp, skb, key, a, rem),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(	void *,		dpaddr			)
>> +		__string(	dp_name,	ovs_dp_name(dp)		)
>> +		__string(	dev_name,	skb->dev->name		)
>> +		__field(	void *,		skbaddr			)
>> +		__field(	unsigned int,	len			)
>> +		__field(	unsigned int,	data_len		)
>> +		__field(	unsigned int,	truesize		)
>> +		__field(	u8,		nr_frags		)
>> +		__field(	u16,		gso_size		)
>> +		__field(	u16,		gso_type		)
>> +		__field(	u32,		ovs_flow_hash		)
>> +		__field(	u32,		recirc_id		)
>> +		__field(	void *,		keyaddr			)
>> +		__field(	u16,		key_eth_type		)
>> +		__field(	u8,		key_ct_state		)
>> +		__field(	u8,		key_ct_orig_proto	)
>
> I think tracing ct_zone would be useful too.

ACK, I'll add it.  I will post a new version today.

Thanks for the review!

>> +		__field(	unsigned int,	flow_key_valid		)
>> +		__field(	u8,		action_type		)
>> +		__field(	unsigned int,	action_len		)
>> +		__field(	void *,		action_data		)
>> +		__field(	u8,		is_last			)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->dpaddr = dp;
>> +		__assign_str(dp_name, ovs_dp_name(dp));
>> +		__assign_str(dev_name, skb->dev->name);
>> +		__entry->skbaddr = skb;
>> +		__entry->len = skb->len;
>> +		__entry->data_len = skb->data_len;
>> +		__entry->truesize = skb->truesize;
>> +		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
>> +		__entry->gso_size = skb_shinfo(skb)->gso_size;
>> +		__entry->gso_type = skb_shinfo(skb)->gso_type;
>> +		__entry->ovs_flow_hash = key->ovs_flow_hash;
>> +		__entry->recirc_id = key->recirc_id;
>> +		__entry->keyaddr = key;
>> +		__entry->key_eth_type = key->eth.type;
>> +		__entry->key_ct_state = key->ct_state;
>> +		__entry->key_ct_orig_proto = key->ct_orig_proto;
>> +		__entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID);
>> +		__entry->action_type = nla_type(a);
>> +		__entry->action_len = nla_len(a);
>> +		__entry->action_data = nla_data(a);
>> +		__entry->is_last = nla_is_last(a, rem);
>> +	),
>> +
>> + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u
>> data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x
>> ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x
>> ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d action_type=%u
>> action_len=%u action_data=%p is_last=%d",
>> +		  __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
>> +		  __entry->skbaddr, __entry->len, __entry->data_len,
>> +		  __entry->truesize, __entry->nr_frags, __entry->gso_size,
>> +		  __entry->gso_type, __entry->ovs_flow_hash,
>> +		  __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
>> +		  __entry->key_ct_state, __entry->key_ct_orig_proto,
>> +		  __entry->flow_key_valid,
>> +		  __entry->action_type, __entry->action_len,
>> +		  __entry->action_data, __entry->is_last)
>> +);
>> +
>> +TRACE_EVENT(openvswitch_probe_userspace,
>> +
>> +	TP_PROTO(struct datapath *dp, struct sk_buff *skb,
>> +		 struct sw_flow_key *key,
>> +		 const struct dp_upcall_info *upcall_info),
>> +
>> +	TP_ARGS(dp, skb, key, upcall_info),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(	void *,		dpaddr			)
>> +		__string(	dp_name,	ovs_dp_name(dp)		)
>> +		__string(	dev_name,	skb->dev->name		)
>> +		__field(	void *,		skbaddr			)
>> +		__field(	unsigned int,	len			)
>> +		__field(	unsigned int,	data_len		)
>> +		__field(	unsigned int,	truesize		)
>> +		__field(	u8,		nr_frags		)
>> +		__field(	u16,		gso_size		)
>> +		__field(	u16,		gso_type		)
>> +		__field(	u32,		ovs_flow_hash		)
>> +		__field(	u32,		recirc_id		)
>> +		__field(	void *,		keyaddr			)
>> +		__field(	u16,		key_eth_type		)
>> +		__field(	u8,		key_ct_state		)
>> +		__field(	u8,		key_ct_orig_proto	)
>
> Same here.
>
> Thanks,
> Dumitru
>
>> +		__field(	unsigned int,	flow_key_valid		)
>> +		__field(	u8,		upcall_cmd		)
>> +		__field(	u32,		upcall_port		)
>> +		__field(	u16,		upcall_mru		)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->dpaddr = dp;
>> +		__assign_str(dp_name, ovs_dp_name(dp));
>> +		__assign_str(dev_name, skb->dev->name);
>> +		__entry->skbaddr = skb;
>> +		__entry->len = skb->len;
>> +		__entry->data_len = skb->data_len;
>> +		__entry->truesize = skb->truesize;
>> +		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
>> +		__entry->gso_size = skb_shinfo(skb)->gso_size;
>> +		__entry->gso_type = skb_shinfo(skb)->gso_type;
>> +		__entry->ovs_flow_hash = key->ovs_flow_hash;
>> +		__entry->recirc_id = key->recirc_id;
>> +		__entry->keyaddr = key;
>> +		__entry->key_eth_type = key->eth.type;
>> +		__entry->key_ct_state = key->ct_state;
>> +		__entry->key_ct_orig_proto = key->ct_orig_proto;
>> +		__entry->flow_key_valid =  !(key->mac_proto & SW_FLOW_KEY_INVALID);
>> +		__entry->upcall_cmd = upcall_info->cmd;
>> +		__entry->upcall_port = upcall_info->portid;
>> +		__entry->upcall_mru = upcall_info->mru;
>> +	),
>> +
>> + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u
>> data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x
>> ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x
>> ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d upcall_cmd=%u
>> upcall_port=%u upcall_mru=%u",
>> +		  __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
>> +		  __entry->skbaddr, __entry->len, __entry->data_len,
>> +		  __entry->truesize, __entry->nr_frags, __entry->gso_size,
>> +		  __entry->gso_type, __entry->ovs_flow_hash,
>> +		  __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
>> +		  __entry->key_ct_state, __entry->key_ct_orig_proto,
>> +		  __entry->flow_key_valid,
>> +		  __entry->upcall_cmd, __entry->upcall_port,
>> +		  __entry->upcall_mru)
>> +);
>> +
>> +#endif /* _TRACE_OPENVSWITCH_H */
>> +
>> +/* This part must be outside protection */
>> +#undef TRACE_INCLUDE_PATH
>> +#define TRACE_INCLUDE_PATH .
>> +#undef TRACE_INCLUDE_FILE
>> +#define TRACE_INCLUDE_FILE openvswitch_trace
>> +#include <trace/define_trace.h>
>>
diff mbox series

Patch

diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 41109c326f3a..28982630bef3 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -13,6 +13,7 @@  openvswitch-y := \
 	flow_netlink.o \
 	flow_table.o \
 	meter.o \
+	openvswitch_trace.o \
 	vport.o \
 	vport-internal_dev.o \
 	vport-netdev.o
@@ -24,3 +25,5 @@  endif
 obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
 obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
 obj-$(CONFIG_OPENVSWITCH_GRE)	+= vport-gre.o
+
+CFLAGS_openvswitch_trace.o = -I$(src)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index d858ea580e43..62285453ca79 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -30,6 +30,7 @@ 
 #include "conntrack.h"
 #include "vport.h"
 #include "flow_netlink.h"
+#include "openvswitch_trace.h"
 
 struct deferred_action {
 	struct sk_buff *skb;
@@ -1242,6 +1243,9 @@  static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 	     a = nla_next(a, &rem)) {
 		int err = 0;
 
+		if (trace_openvswitch_probe_action_enabled())
+			trace_openvswitch_probe_action(dp, skb, key, a, rem);
+
 		switch (nla_type(a)) {
 		case OVS_ACTION_ATTR_OUTPUT: {
 			int port = nla_get_u32(a);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9d6ef6cb9b26..63f19a6ed472 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -43,6 +43,7 @@ 
 #include "flow_table.h"
 #include "flow_netlink.h"
 #include "meter.h"
+#include "openvswitch_trace.h"
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
@@ -275,6 +276,12 @@  int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
 	struct dp_stats_percpu *stats;
 	int err;
 
+	if (trace_openvswitch_probe_userspace_enabled()) {
+		struct sw_flow_key ukey = *key;
+
+		trace_openvswitch_probe_userspace(dp, skb, &ukey, upcall_info);
+	}
+
 	if (upcall_info->portid == 0) {
 		err = -ENOTCONN;
 		goto err;
diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c
new file mode 100644
index 000000000000..62c5f7d6f023
--- /dev/null
+++ b/net/openvswitch/openvswitch_trace.c
@@ -0,0 +1,10 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* bug in tracepoint.h, it should include this */
+#include <linux/module.h>
+
+/* sparse isn't too happy with all macros... */
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "openvswitch_trace.h"
+
+#endif
diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h
new file mode 100644
index 000000000000..1b350306f622
--- /dev/null
+++ b/net/openvswitch/openvswitch_trace.h
@@ -0,0 +1,152 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM openvswitch
+
+#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OPENVSWITCH_H
+
+#include <linux/tracepoint.h>
+
+#include "datapath.h"
+
+TRACE_EVENT(openvswitch_probe_action,
+
+	TP_PROTO(struct datapath *dp, struct sk_buff *skb,
+		 struct sw_flow_key *key, const struct nlattr *a, int rem),
+
+	TP_ARGS(dp, skb, key, a, rem),
+
+	TP_STRUCT__entry(
+		__field(	void *,		dpaddr			)
+		__string(	dp_name,	ovs_dp_name(dp)		)
+		__string(	dev_name,	skb->dev->name		)
+		__field(	void *,		skbaddr			)
+		__field(	unsigned int,	len			)
+		__field(	unsigned int,	data_len		)
+		__field(	unsigned int,	truesize		)
+		__field(	u8,		nr_frags		)
+		__field(	u16,		gso_size		)
+		__field(	u16,		gso_type		)
+		__field(	u32,		ovs_flow_hash		)
+		__field(	u32,		recirc_id		)
+		__field(	void *,		keyaddr			)
+		__field(	u16,		key_eth_type		)
+		__field(	u8,		key_ct_state		)
+		__field(	u8,		key_ct_orig_proto	)
+		__field(	unsigned int,	flow_key_valid		)
+		__field(	u8,		action_type		)
+		__field(	unsigned int,	action_len		)
+		__field(	void *,		action_data		)
+		__field(	u8,		is_last			)
+	),
+
+	TP_fast_assign(
+		__entry->dpaddr = dp;
+		__assign_str(dp_name, ovs_dp_name(dp));
+		__assign_str(dev_name, skb->dev->name);
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__entry->data_len = skb->data_len;
+		__entry->truesize = skb->truesize;
+		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
+		__entry->gso_size = skb_shinfo(skb)->gso_size;
+		__entry->gso_type = skb_shinfo(skb)->gso_type;
+		__entry->ovs_flow_hash = key->ovs_flow_hash;
+		__entry->recirc_id = key->recirc_id;
+		__entry->keyaddr = key;
+		__entry->key_eth_type = key->eth.type;
+		__entry->key_ct_state = key->ct_state;
+		__entry->key_ct_orig_proto = key->ct_orig_proto;
+		__entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID);
+		__entry->action_type = nla_type(a);
+		__entry->action_len = nla_len(a);
+		__entry->action_data = nla_data(a);
+		__entry->is_last = nla_is_last(a, rem);
+	),
+
+	TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d action_type=%u action_len=%u action_data=%p is_last=%d",
+		  __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
+		  __entry->skbaddr, __entry->len, __entry->data_len,
+		  __entry->truesize, __entry->nr_frags, __entry->gso_size,
+		  __entry->gso_type, __entry->ovs_flow_hash,
+		  __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
+		  __entry->key_ct_state, __entry->key_ct_orig_proto,
+		  __entry->flow_key_valid,
+		  __entry->action_type, __entry->action_len,
+		  __entry->action_data, __entry->is_last)
+);
+
+TRACE_EVENT(openvswitch_probe_userspace,
+
+	TP_PROTO(struct datapath *dp, struct sk_buff *skb,
+		 struct sw_flow_key *key,
+		 const struct dp_upcall_info *upcall_info),
+
+	TP_ARGS(dp, skb, key, upcall_info),
+
+	TP_STRUCT__entry(
+		__field(	void *,		dpaddr			)
+		__string(	dp_name,	ovs_dp_name(dp)		)
+		__string(	dev_name,	skb->dev->name		)
+		__field(	void *,		skbaddr			)
+		__field(	unsigned int,	len			)
+		__field(	unsigned int,	data_len		)
+		__field(	unsigned int,	truesize		)
+		__field(	u8,		nr_frags		)
+		__field(	u16,		gso_size		)
+		__field(	u16,		gso_type		)
+		__field(	u32,		ovs_flow_hash		)
+		__field(	u32,		recirc_id		)
+		__field(	void *,		keyaddr			)
+		__field(	u16,		key_eth_type		)
+		__field(	u8,		key_ct_state		)
+		__field(	u8,		key_ct_orig_proto	)
+		__field(	unsigned int,	flow_key_valid		)
+		__field(	u8,		upcall_cmd		)
+		__field(	u32,		upcall_port		)
+		__field(	u16,		upcall_mru		)
+	),
+
+	TP_fast_assign(
+		__entry->dpaddr = dp;
+		__assign_str(dp_name, ovs_dp_name(dp));
+		__assign_str(dev_name, skb->dev->name);
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__entry->data_len = skb->data_len;
+		__entry->truesize = skb->truesize;
+		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
+		__entry->gso_size = skb_shinfo(skb)->gso_size;
+		__entry->gso_type = skb_shinfo(skb)->gso_type;
+		__entry->ovs_flow_hash = key->ovs_flow_hash;
+		__entry->recirc_id = key->recirc_id;
+		__entry->keyaddr = key;
+		__entry->key_eth_type = key->eth.type;
+		__entry->key_ct_state = key->ct_state;
+		__entry->key_ct_orig_proto = key->ct_orig_proto;
+		__entry->flow_key_valid =  !(key->mac_proto & SW_FLOW_KEY_INVALID);
+		__entry->upcall_cmd = upcall_info->cmd;
+		__entry->upcall_port = upcall_info->portid;
+		__entry->upcall_mru = upcall_info->mru;
+	),
+
+	TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u",
+		  __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
+		  __entry->skbaddr, __entry->len, __entry->data_len,
+		  __entry->truesize, __entry->nr_frags, __entry->gso_size,
+		  __entry->gso_type, __entry->ovs_flow_hash,
+		  __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
+		  __entry->key_ct_state, __entry->key_ct_orig_proto,
+		  __entry->flow_key_valid,
+		  __entry->upcall_cmd, __entry->upcall_port,
+		  __entry->upcall_mru)
+);
+
+#endif /* _TRACE_OPENVSWITCH_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE openvswitch_trace
+#include <trace/define_trace.h>