diff mbox series

[net-next,3/4] net/tc: introduce TC_ACT_MIRRED.

Message ID 78e96c4fd40645bb4bce45b6502ed34cf502356f.1531941678.git.pabeni@redhat.com
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series TC: refactor act_mirred packets re-injection | expand

Commit Message

Paolo Abeni July 19, 2018, 1:02 p.m. UTC
This is similar TC_ACT_REDIRECT, but with a slightly different
semantic:
- on ingress the mirred skbs are passed to the target device
network stack without any additional check not scrubbing.
- the rcu-protected stats provided via the tcf_result struct
  are updated on error conditions.

v1 -> v2: do not touch TC_ACT_REDIRECT code path, introduce
 a new action type instead

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sch_generic.h    | 19 +++++++++++++++++++
 include/uapi/linux/pkt_cls.h |  3 ++-
 net/core/dev.c               |  4 ++++
 net/sched/act_api.c          |  6 ++++--
 4 files changed, 29 insertions(+), 3 deletions(-)

Comments

Cong Wang July 19, 2018, 6:07 p.m. UTC | #1
On Thu, Jul 19, 2018 at 6:03 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> This is similar TC_ACT_REDIRECT, but with a slightly different
> semantic:
> - on ingress the mirred skbs are passed to the target device
> network stack without any additional check not scrubbing.
> - the rcu-protected stats provided via the tcf_result struct
>   are updated on error conditions.

At least its name sucks, it means to skip the skb_clone(),
that is avoid a copy, but you still call it MIRRED...

MIRRED means MIRror and REDirect.

Also, I don't understand why this new TC_ACT code needs
to be visible to user-space, whether to clone or not is purely
internal.
Jiri Pirko July 19, 2018, 6:56 p.m. UTC | #2
Thu, Jul 19, 2018 at 03:02:28PM CEST, pabeni@redhat.com wrote:
>This is similar TC_ACT_REDIRECT, but with a slightly different
>semantic:
>- on ingress the mirred skbs are passed to the target device
>network stack without any additional check not scrubbing.
>- the rcu-protected stats provided via the tcf_result struct
>  are updated on error conditions.
>
>v1 -> v2: do not touch TC_ACT_REDIRECT code path, introduce
> a new action type instead
>
>Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>---
> include/net/sch_generic.h    | 19 +++++++++++++++++++
> include/uapi/linux/pkt_cls.h |  3 ++-
> net/core/dev.c               |  4 ++++
> net/sched/act_api.c          |  6 ++++--
> 4 files changed, 29 insertions(+), 3 deletions(-)
>
>diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>index 056dc1083aa3..667d7b66fee2 100644
>--- a/include/net/sch_generic.h
>+++ b/include/net/sch_generic.h
>@@ -235,6 +235,12 @@ struct tcf_result {
> 			u32		classid;
> 		};
> 		const struct tcf_proto *goto_tp;
>+
>+		/* used by the TC_ACT_MIRRED action */
>+		struct {
>+			bool		ingress;
>+			struct gnet_stats_queue *qstats;
>+		};
> 	};
> };
> 
>@@ -1091,4 +1097,17 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
> void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
> 			  struct mini_Qdisc __rcu **p_miniq);
> 
>+static inline void skb_tc_redirect(struct sk_buff *skb, struct tcf_result *res)
>+{
>+	struct gnet_stats_queue *stats = res->qstats;
>+	int ret;
>+
>+	if (res->ingress)
>+		ret = netif_receive_skb(skb);
>+	else
>+		ret = dev_queue_xmit(skb);
>+	if (ret && stats)
>+		qstats_overlimit_inc(res->qstats);
>+}
>+
> #endif
>diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
>index 7cdd62b51106..1a5e8a3217f3 100644
>--- a/include/uapi/linux/pkt_cls.h
>+++ b/include/uapi/linux/pkt_cls.h
>@@ -45,7 +45,8 @@ enum {
> 				   * the skb and act like everything
> 				   * is alright.
> 				   */
>-#define TC_ACT_LAST		TC_ACT_TRAP
>+#define TC_ACT_MIRRED		9
>+#define TC_ACT_LAST		TC_ACT_MIRRED
> 
> /* There is a special kind of actions called "extended actions",
>  * which need a value parameter. These have a local opcode located in
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 14a748ee8cc9..3822f29d730f 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -4602,6 +4602,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> 		__skb_push(skb, skb->mac_len);
> 		skb_do_redirect(skb);
> 		return NULL;
>+	case TC_ACT_MIRRED:
>+		/* this does not scrub the packet, and updates stats on error */
>+		skb_tc_redirect(skb, &cl_res);

REDIRECT does skb_do_redirect and MIRRED does skb_tc_redirect.
Confusing. I agree with Cong that the name is not correct.


>+		return NULL;
> 	default:
> 		break;
> 	}
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index f6438f246dab..029302e2813e 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -895,8 +895,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
> 		}
> 	}
> 
>-	if (a->tcfa_action == TC_ACT_REDIRECT) {
>-		net_warn_ratelimited("TC_ACT_REDIRECT can't be used directly");
>+	if (a->tcfa_action == TC_ACT_REDIRECT ||
>+	    a->tcfa_action == TC_ACT_MIRRED) {
>+		net_warn_ratelimited("action %d can't be used directly",
>+				     a->tcfa_action);
> 		a->tcfa_action = TC_ACT_LAST + 1;
> 	}
> 
>-- 
>2.17.1
>
Paolo Abeni July 20, 2018, 9:54 a.m. UTC | #3
Hi,

Jiri, Cong, thank you for the feedback. Please allow me to give a
single reply to both of you, as you rised similar concers.

On Thu, 2018-07-19 at 11:07 -0700, Cong Wang wrote:
> On Thu, Jul 19, 2018 at 6:03 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > 
> > This is similar TC_ACT_REDIRECT, but with a slightly different
> > semantic:
> > - on ingress the mirred skbs are passed to the target device
> > network stack without any additional check not scrubbing.
> > - the rcu-protected stats provided via the tcf_result struct
> >   are updated on error conditions.
> 
> At least its name sucks, it means to skip the skb_clone(),
> that is avoid a copy, but you still call it MIRRED...
> 
> MIRRED means MIRror and REDirect.

I was not satified with the name, too, but I also wanted to collect
some feedback, as the different time zones are not helping here.

Would TC_ACT_REINJECT be a better choice? (renaming skb_tc_redirect as
skb_tc_reinject, too). Do you have some better name?

Thanks!

> Also, I don't understand why this new TC_ACT code needs
> to be visible to user-space, whether to clone or not is purely
> internal.

Note this is what already happens with TC_ACT_REDIRECT: currently the
user space uses it freely, even if only {cls,act}_bpf can return such
value in a meaningful way, and only from the ingress and the egress
hooks.

I think we can add a clear separation between the values accessible
from user-space, and the ones used interanally by the kernel, with
something like the code below (basically unknown actions are explicitly
mapped to TC_ACT_UNSPEC), WDYT?

Note: as TC_ACT_REDIRECT is already part of the uAPI, it will remain
accessible from user-space, so patch 1/4 would be still needed.

Cheers,

Paolo

---
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e4252a176eec..9079e4ee2bbe 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -7,6 +7,9 @@
 #include <net/sch_generic.h>
 #include <net/act_api.h>
 
+/* TC action not accessible from user space */
+#define TC_ACT_REINJECT		(TC_ACT_MAX + 1)
+
 /* Basic packet classifier frontend definitions. */
 
 struct tcf_walker {
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index c4262d911596..c8a24861d4c8 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -45,6 +45,7 @@ enum {
 				   * the skb and act like everything
 				   * is alright.
 				   */
+#define TC_ACT_VALUE_MAX	TC_ACT_TRAP
 
 /* There is a special kind of actions called "extended actions",
  * which need a value parameter. These have a local opcode located in
@@ -55,11 +56,12 @@ enum {
 #define __TC_ACT_EXT_SHIFT 28
 #define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT)
 #define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1)
-#define TC_ACT_EXT_CMP(combined, opcode) \
-	(((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode)
+#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK))
+#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode)
 
 #define TC_ACT_JUMP __TC_ACT_EXT(1)
 #define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2)
+#define TC_ACT_EXT_OPCODE_MAX	TC_ACT_GOTO_CHAIN
 
 /* Action type identifiers*/
 enum {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 148a89ab789b..657c3d99698d 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -798,6 +798,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 	char act_name[IFNAMSIZ];
 	struct nlattr *tb[TCA_ACT_MAX + 1];
 	struct nlattr *kind;
+	int opcode;
 	int err;
 
 	if (name == NULL) {
@@ -895,6 +896,14 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 		}
 	}
 
+	opcode = TC_ACT_EXT_OPCODE(a->tcfa_action);
+	if ((!opcode && a->tcfa_action > TC_ACT_VALUE_MAX) ||
+	    (opcode && opcode > TC_ACT_EXT_OPCODE_MAX)) {
+		net_warn_ratelimited("invalid %d action value",
+				     a->tcfa_action);
+		a->tcfa_action = TC_ACT_UNSPEC;
+	}
+
 	return a;
 
 err_mod:
Cong Wang July 23, 2018, 9:12 p.m. UTC | #4
On Fri, Jul 20, 2018 at 2:54 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Hi,
>
> Jiri, Cong, thank you for the feedback. Please allow me to give a
> single reply to both of you, as you rised similar concers.
>
> On Thu, 2018-07-19 at 11:07 -0700, Cong Wang wrote:
> > On Thu, Jul 19, 2018 at 6:03 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > >
> > > This is similar TC_ACT_REDIRECT, but with a slightly different
> > > semantic:
> > > - on ingress the mirred skbs are passed to the target device
> > > network stack without any additional check not scrubbing.
> > > - the rcu-protected stats provided via the tcf_result struct
> > >   are updated on error conditions.
> >
> > At least its name sucks, it means to skip the skb_clone(),
> > that is avoid a copy, but you still call it MIRRED...
> >
> > MIRRED means MIRror and REDirect.
>
> I was not satified with the name, too, but I also wanted to collect
> some feedback, as the different time zones are not helping here.
>
> Would TC_ACT_REINJECT be a better choice? (renaming skb_tc_redirect as
> skb_tc_reinject, too). Do you have some better name?


Any name not implying a copy is better. I don't worry about it, please
see below.


> > Also, I don't understand why this new TC_ACT code needs
> > to be visible to user-space, whether to clone or not is purely
> > internal.
>
> Note this is what already happens with TC_ACT_REDIRECT: currently the
> user space uses it freely, even if only {cls,act}_bpf can return such
> value in a meaningful way, and only from the ingress and the egress
> hooks.
>

Yes, my question is why do we give user such a freedom?

In other words, what do you want users to choose here? To scrub or not
to scrub? To clone or not to clone?

From my understanding of your whole patchset, your goal is to get rid
of clone, and users definitely don't care about clone or not clone for
redirections, this is why I insist it doesn't need to be visible to user.

If your goal is not just skipping clone, but also, let's say, scrub or not
scrub, then it should be visible to users. However, I don't see why
users care about scrub or not, they have to understand what scrub
is at least, it is a purely kernel-internal behavior.


> I think we can add a clear separation between the values accessible
> from user-space, and the ones used interanally by the kernel, with
> something like the code below (basically unknown actions are explicitly
> mapped to TC_ACT_UNSPEC), WDYT?
>
> Note: as TC_ACT_REDIRECT is already part of the uAPI, it will remain
> accessible from user-space, so patch 1/4 would be still needed.

I think that is doable too, but we should understand whether we
need to do it or not at first.

Thanks.
Paolo Abeni July 24, 2018, 6:48 a.m. UTC | #5
Hi,

On Mon, 2018-07-23 at 14:12 -0700, Cong Wang wrote:
> On Fri, Jul 20, 2018 at 2:54 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > Note this is what already happens with TC_ACT_REDIRECT: currently the
> > user space uses it freely, even if only {cls,act}_bpf can return such
> > value in a meaningful way, and only from the ingress and the egress
> > hooks.
>
> Yes, my question is why do we give user such a freedom?
> 
> In other words, what do you want users to choose here? To scrub or not
> to scrub? To clone or not to clone?
> 
> From my understanding of your whole patchset, your goal is to get rid
> of clone, and users definitely don't care about clone or not clone for
> redirections, this is why I insist it doesn't need to be visible to user.

Thank you for your kind reply!

No, my intention is not to expose to the user-space another option. I
added the  additional tcfa_action value in response to concerns exposed
vs the v1 version of this series (it changed the act_mirred behaviour
and possibly broke some use-case).

When assembling the v2 I did not implemented the (deserved) isolation
vs user-space because of the already existing TC_ACT_REDIRECT: its
current implementation fooled me to think such considerations were not
relevant.

> If your goal is not just skipping clone, but also, let's say, scrub or not
> scrub, then it should be visible to users. However, I don't see why
> users care about scrub or not, they have to understand what scrub
> is at least, it is a purely kernel-internal behavior.

I agree to hide TC_ACT_REINJECT and any choice about scrubbing to user-
space, as per the code chunk I  posted before. I'll send a v3
implementing such schema.

Cheers,

Paolo
diff mbox series

Patch

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 056dc1083aa3..667d7b66fee2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -235,6 +235,12 @@  struct tcf_result {
 			u32		classid;
 		};
 		const struct tcf_proto *goto_tp;
+
+		/* used by the TC_ACT_MIRRED action */
+		struct {
+			bool		ingress;
+			struct gnet_stats_queue *qstats;
+		};
 	};
 };
 
@@ -1091,4 +1097,17 @@  void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
 void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
 			  struct mini_Qdisc __rcu **p_miniq);
 
+static inline void skb_tc_redirect(struct sk_buff *skb, struct tcf_result *res)
+{
+	struct gnet_stats_queue *stats = res->qstats;
+	int ret;
+
+	if (res->ingress)
+		ret = netif_receive_skb(skb);
+	else
+		ret = dev_queue_xmit(skb);
+	if (ret && stats)
+		qstats_overlimit_inc(res->qstats);
+}
+
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 7cdd62b51106..1a5e8a3217f3 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -45,7 +45,8 @@  enum {
 				   * the skb and act like everything
 				   * is alright.
 				   */
-#define TC_ACT_LAST		TC_ACT_TRAP
+#define TC_ACT_MIRRED		9
+#define TC_ACT_LAST		TC_ACT_MIRRED
 
 /* There is a special kind of actions called "extended actions",
  * which need a value parameter. These have a local opcode located in
diff --git a/net/core/dev.c b/net/core/dev.c
index 14a748ee8cc9..3822f29d730f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4602,6 +4602,10 @@  sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		__skb_push(skb, skb->mac_len);
 		skb_do_redirect(skb);
 		return NULL;
+	case TC_ACT_MIRRED:
+		/* this does not scrub the packet, and updates stats on error */
+		skb_tc_redirect(skb, &cl_res);
+		return NULL;
 	default:
 		break;
 	}
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f6438f246dab..029302e2813e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -895,8 +895,10 @@  struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 		}
 	}
 
-	if (a->tcfa_action == TC_ACT_REDIRECT) {
-		net_warn_ratelimited("TC_ACT_REDIRECT can't be used directly");
+	if (a->tcfa_action == TC_ACT_REDIRECT ||
+	    a->tcfa_action == TC_ACT_MIRRED) {
+		net_warn_ratelimited("action %d can't be used directly",
+				     a->tcfa_action);
 		a->tcfa_action = TC_ACT_LAST + 1;
 	}