diff mbox

[net-next,3/4] net: sched: cls_u32 add bit to specify software only rules

Message ID 20160223190321.5970.58924.stgit@john-Precision-Tower-5810
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

John Fastabend Feb. 23, 2016, 7:03 p.m. UTC
In the initial implementation the only way to stop a rule from being
inserted into the hardware table was via the device feature flag.
However this doesn't work well when working on an end host system
where packets are expect to hit both the hardware and software
datapaths.

For example we can imagine a rule that will match an IP address and
increment a field. If we install this rule in both hardware and
software we may increment the field twice. To date we have only
added support for the drop action so we have been able to ignore
these cases. But as we extend the action support we will hit this
example plus more such cases. Arguably these are not even corner
cases in many working systems these cases will be common.

To avoid forcing the driver to always abort (i.e. the above example)
this patch adds a flag to add a rule in software only. A careful
user can use this flag to build software and hardware datapaths
that work together. One example we have found particularly useful
is to use hardware resources to set the skb->mark on the skb when
the match may be expensive to run in software but a mark lookup
in a hash table is cheap. The idea here is hardware can do in one
lookup what the u32 classifier may need to traverse multiple lists
and hash tables to compute. The flag is only passed down on inserts
on deletion to avoid stale references in hardware we always try
to remove a rule if it exists.

Notice we do not add a hardware only case here. If you were to
add a hardware only case then you are stuck with the problem
of where to stick the software representation of that filter
rule. If its stuck on the same filter list as the software only and
software/hardware rules it then has to be walked over and ignored
in the classify path. The overhead is not huge but is measurable.
And with so much work being invested in speeding up rx/tx of
pkt processing this is unacceptable IMO. The other option is to
have a special hook just for hardware only resources. This is
implemented in the next patch.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/uapi/linux/pkt_cls.h |    3 +++
 net/sched/cls_u32.c          |   43 ++++++++++++++++++++++++++++++------------
 2 files changed, 34 insertions(+), 12 deletions(-)

Comments

Samudrala, Sridhar Feb. 23, 2016, 10:29 p.m. UTC | #1
On 2/23/2016 11:03 AM, John Fastabend wrote:
> In the initial implementation the only way to stop a rule from being
> inserted into the hardware table was via the device feature flag.
> However this doesn't work well when working on an end host system
> where packets are expect to hit both the hardware and software
> datapaths.
>
> For example we can imagine a rule that will match an IP address and
> increment a field. If we install this rule in both hardware and
> software we may increment the field twice. To date we have only
> added support for the drop action so we have been able to ignore
> these cases. But as we extend the action support we will hit this
> example plus more such cases. Arguably these are not even corner
> cases in many working systems these cases will be common.
Is there a usecase to support running the same rule in both hardware as well
as software? If a rule is offloaded to hardware,  isn't the assumption that
we don't need to do that in software.
>
> To avoid forcing the driver to always abort (i.e. the above example)
> this patch adds a flag to add a rule in software only. A careful
> user can use this flag to build software and hardware datapaths
> that work together. One example we have found particularly useful
> is to use hardware resources to set the skb->mark on the skb when
> the match may be expensive to run in software but a mark lookup
> in a hash table is cheap. The idea here is hardware can do in one
> lookup what the u32 classifier may need to traverse multiple lists
> and hash tables to compute. The flag is only passed down on inserts
> on deletion to avoid stale references in hardware we always try
> to remove a rule if it exists.
>
> Notice we do not add a hardware only case here. If you were to
> add a hardware only case then you are stuck with the problem
> of where to stick the software representation of that filter
> rule. If its stuck on the same filter list as the software only and
> software/hardware rules it then has to be walked over and ignored
> in the classify path. The overhead is not huge but is measurable.
> And with so much work being invested in speeding up rx/tx of
> pkt processing this is unacceptable IMO. The other option is to
> have a special hook just for hardware only resources. This is
> implemented in the next patch.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>   include/uapi/linux/pkt_cls.h |    3 +++
>   net/sched/cls_u32.c          |   43 ++++++++++++++++++++++++++++++------------
>   2 files changed, 34 insertions(+), 12 deletions(-)
>
> diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
> index 4398737..143ea21 100644
> --- a/include/uapi/linux/pkt_cls.h
> +++ b/include/uapi/linux/pkt_cls.h
> @@ -160,6 +160,8 @@ enum {
>   #define TC_U32_UNSPEC	0
>   #define TC_U32_ROOT	(0xFFF00000)
>   
> +#define TCA_U32_FLAGS_SOFTWARE 1
Does this flag indicate software-only rule?
Instead should we add a flag to indicate HW only.

> +
>   enum {
>   	TCA_U32_UNSPEC,
>   	TCA_U32_CLASSID,
> @@ -172,6 +174,7 @@ enum {
>   	TCA_U32_INDEV,
>   	TCA_U32_PCNT,
>   	TCA_U32_MARK,
> +	TCA_U32_FLAGS,
>   	__TCA_U32_MAX
>   };
>   
> diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
> index f766bcb..c509fc8 100644
> --- a/net/sched/cls_u32.c
> +++ b/net/sched/cls_u32.c
> @@ -59,6 +59,7 @@ struct tc_u_knode {
>   #ifdef CONFIG_CLS_U32_PERF
>   	struct tc_u32_pcnt __percpu *pf;
>   #endif
> +	u32			flags;
>   #ifdef CONFIG_CLS_U32_MARK
>   	u32			val;
>   	u32			mask;
> @@ -425,12 +426,18 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
>   	return 0;
>   }
>   
> -static bool u32_should_offload(struct net_device *dev)
> +static bool u32_should_offload(struct net_device *dev, u32 flags)
>   {
>   	if (!(dev->features & NETIF_F_HW_TC))
>   		return false;
>   
> -	return dev->netdev_ops->ndo_setup_tc;
> +	if (flags & TCA_U32_FLAGS_SOFTWARE)
> +		return false;
> +
> +	if (!dev->netdev_ops->ndo_setup_tc)
> +		return false;
> +
> +	return true;
>   }
>   
>   static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
> @@ -442,7 +449,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
>   	offload.type = TC_SETUP_CLSU32;
>   	offload.cls_u32 = &u32_offload;
>   
> -	if (u32_should_offload(dev)) {
> +	if (u32_should_offload(dev, 0)) {
>   		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
>   		offload.cls_u32->knode.handle = handle;
>   		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
> @@ -450,7 +457,9 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
>   	}
>   }
>   
> -static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
> +static void u32_replace_hw_hnode(struct tcf_proto *tp,
> +				 struct tc_u_hnode *h,
> +				 u32 flags)
>   {
>   	struct net_device *dev = tp->q->dev_queue->dev;
>   	struct tc_cls_u32_offload u32_offload = {0};
> @@ -459,7 +468,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>   	offload.type = TC_SETUP_CLSU32;
>   	offload.cls_u32 = &u32_offload;
>   
> -	if (u32_should_offload(dev)) {
> +	if (u32_should_offload(dev, flags)) {
>   		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
>   		offload.cls_u32->hnode.divisor = h->divisor;
>   		offload.cls_u32->hnode.handle = h->handle;
> @@ -479,7 +488,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>   	offload.type = TC_SETUP_CLSU32;
>   	offload.cls_u32 = &u32_offload;
>   
> -	if (u32_should_offload(dev)) {
> +	if (u32_should_offload(dev, 0)) {
>   		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
>   		offload.cls_u32->hnode.divisor = h->divisor;
>   		offload.cls_u32->hnode.handle = h->handle;
> @@ -490,7 +499,9 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>   	}
>   }
>   
> -static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
> +static void u32_replace_hw_knode(struct tcf_proto *tp,
> +				 struct tc_u_knode *n,
> +				 u32 flags)
>   {
>   	struct net_device *dev = tp->q->dev_queue->dev;
>   	struct tc_cls_u32_offload u32_offload = {0};
> @@ -499,7 +510,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
>   	offload.type = TC_SETUP_CLSU32;
>   	offload.cls_u32 = &u32_offload;
>   
> -	if (u32_should_offload(dev)) {
> +	if (u32_should_offload(dev, flags)) {
>   		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
>   		offload.cls_u32->knode.handle = n->handle;
>   		offload.cls_u32->knode.fshift = n->fshift;
> @@ -687,6 +698,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
>   	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
>   	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
>   	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
> +	[TCA_U32_FLAGS]		= { .type = NLA_U32 },
>   };
>   
>   static int u32_set_parms(struct net *net, struct tcf_proto *tp,
> @@ -833,7 +845,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
>   	struct tc_u32_sel *s;
>   	struct nlattr *opt = tca[TCA_OPTIONS];
>   	struct nlattr *tb[TCA_U32_MAX + 1];
> -	u32 htid;
> +	u32 htid, flags = 0;
>   	int err;
>   #ifdef CONFIG_CLS_U32_PERF
>   	size_t size;
> @@ -846,6 +858,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
>   	if (err < 0)
>   		return err;
>   
> +	if (tb[TCA_U32_FLAGS])
> +		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
> +
>   	n = (struct tc_u_knode *)*arg;
>   	if (n) {
>   		struct tc_u_knode *new;
> @@ -869,7 +884,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
>   		u32_replace_knode(tp, tp_c, new);
>   		tcf_unbind_filter(tp, &n->res);
>   		call_rcu(&n->rcu, u32_delete_key_rcu);
> -		u32_replace_hw_knode(tp, new);
> +		u32_replace_hw_knode(tp, new, flags);
>   		return 0;
>   	}
>   
> @@ -897,7 +912,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
>   		rcu_assign_pointer(tp_c->hlist, ht);
>   		*arg = (unsigned long)ht;
>   
> -		u32_replace_hw_hnode(tp, ht);
> +		u32_replace_hw_hnode(tp, ht, flags);
>   		return 0;
>   	}
>   
> @@ -948,6 +963,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
>   	RCU_INIT_POINTER(n->ht_up, ht);
>   	n->handle = handle;
>   	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
> +	n->flags = flags;
>   	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
>   	n->tp = tp;
>   
> @@ -980,7 +996,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
>   
>   		RCU_INIT_POINTER(n->next, pins);
>   		rcu_assign_pointer(*ins, n);
> -		u32_replace_hw_knode(tp, n);
> +		u32_replace_hw_knode(tp, n, flags);
>   		*arg = (unsigned long)n;
>   		return 0;
>   	}
> @@ -1085,6 +1101,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
>   		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
>   			goto nla_put_failure;
>   
> +		if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
> +			goto nla_put_failure;
> +
>   #ifdef CONFIG_CLS_U32_MARK
>   		if ((n->val || n->mask)) {
>   			struct tc_u32_mark mark = {.val = n->val,
>
John Fastabend Feb. 23, 2016, 11:30 p.m. UTC | #2
On 16-02-23 02:29 PM, Samudrala, Sridhar wrote:
> 
> 
> On 2/23/2016 11:03 AM, John Fastabend wrote:
>> In the initial implementation the only way to stop a rule from being
>> inserted into the hardware table was via the device feature flag.
>> However this doesn't work well when working on an end host system
>> where packets are expect to hit both the hardware and software
>> datapaths.
>>
>> For example we can imagine a rule that will match an IP address and
>> increment a field. If we install this rule in both hardware and
>> software we may increment the field twice. To date we have only
>> added support for the drop action so we have been able to ignore
>> these cases. But as we extend the action support we will hit this
>> example plus more such cases. Arguably these are not even corner
>> cases in many working systems these cases will be common.
> Is there a usecase to support running the same rule in both hardware as
> well
> as software? If a rule is offloaded to hardware,  isn't the assumption that
> we don't need to do that in software.

The default behaviour today is to load a rule into both software
and opportunistically hardware. This is the same model as many of
the other hardware offloads. This way user space doesn't have to
be concerned if it is running on hardware that supports the offload
or not.

If software is a bit "smarter" it can load software only rules and
hardware only rules. This allows the logic to decide if a rule is
"safe" to offload to be less restrictive because we avoid the
class of examples noted in the commit msg.

>>
>> To avoid forcing the driver to always abort (i.e. the above example)
>> this patch adds a flag to add a rule in software only. A careful
>> user can use this flag to build software and hardware datapaths
>> that work together. One example we have found particularly useful
>> is to use hardware resources to set the skb->mark on the skb when
>> the match may be expensive to run in software but a mark lookup
>> in a hash table is cheap. The idea here is hardware can do in one
>> lookup what the u32 classifier may need to traverse multiple lists
>> and hash tables to compute. The flag is only passed down on inserts
>> on deletion to avoid stale references in hardware we always try
>> to remove a rule if it exists.
>>
>> Notice we do not add a hardware only case here. If you were to
>> add a hardware only case then you are stuck with the problem
>> of where to stick the software representation of that filter
>> rule. If its stuck on the same filter list as the software only and
>> software/hardware rules it then has to be walked over and ignored
>> in the classify path. The overhead is not huge but is measurable.
>> And with so much work being invested in speeding up rx/tx of
>> pkt processing this is unacceptable IMO. The other option is to
>> have a special hook just for hardware only resources. This is
>> implemented in the next patch.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>   include/uapi/linux/pkt_cls.h |    3 +++
>>   net/sched/cls_u32.c          |   43
>> ++++++++++++++++++++++++++++++------------
>>   2 files changed, 34 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
>> index 4398737..143ea21 100644
>> --- a/include/uapi/linux/pkt_cls.h
>> +++ b/include/uapi/linux/pkt_cls.h
>> @@ -160,6 +160,8 @@ enum {
>>   #define TC_U32_UNSPEC    0
>>   #define TC_U32_ROOT    (0xFFF00000)
>>   +#define TCA_U32_FLAGS_SOFTWARE 1
> Does this flag indicate software-only rule?
> Instead should we add a flag to indicate HW only.
>

Yes software only rule. As I said in the commit log having
a HW only flag has issues with how to represent it in software
in such a way that software performance is not impacted. The
next patch in this series provides this functionality.

.John
Simon Horman Feb. 24, 2016, 6:11 a.m. UTC | #3
On Tue, Feb 23, 2016 at 11:03:21AM -0800, John Fastabend wrote:
> In the initial implementation the only way to stop a rule from being
> inserted into the hardware table was via the device feature flag.
> However this doesn't work well when working on an end host system
> where packets are expect to hit both the hardware and software
> datapaths.
>
> For example we can imagine a rule that will match an IP address and
> increment a field. If we install this rule in both hardware and
> software we may increment the field twice. To date we have only
> added support for the drop action so we have been able to ignore
> these cases. But as we extend the action support we will hit this
> example plus more such cases. Arguably these are not even corner
> cases in many working systems these cases will be common.
>
> To avoid forcing the driver to always abort (i.e. the above example)
> this patch adds a flag to add a rule in software only. A careful
> user can use this flag to build software and hardware datapaths
> that work together. One example we have found particularly useful
> is to use hardware resources to set the skb->mark on the skb when
> the match may be expensive to run in software but a mark lookup
> in a hash table is cheap. The idea here is hardware can do in one
> lookup what the u32 classifier may need to traverse multiple lists
> and hash tables to compute. The flag is only passed down on inserts
> on deletion to avoid stale references in hardware we always try
> to remove a rule if it exists.
>
> Notice we do not add a hardware only case here. If you were to
> add a hardware only case then you are stuck with the problem
> of where to stick the software representation of that filter
> rule. If its stuck on the same filter list as the software only and
> software/hardware rules it then has to be walked over and ignored
> in the classify path. The overhead is not huge but is measurable.
> And with so much work being invested in speeding up rx/tx of
> pkt processing this is unacceptable IMO. The other option is to
> have a special hook just for hardware only resources. This is
> implemented in the next patch.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

[snip]

> diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
> index f766bcb..c509fc8 100644
> --- a/net/sched/cls_u32.c
> +++ b/net/sched/cls_u32.c
> @@ -59,6 +59,7 @@ struct tc_u_knode {
>  #ifdef CONFIG_CLS_U32_PERF
>  	struct tc_u32_pcnt __percpu *pf;
>  #endif
> +	u32			flags;
>  #ifdef CONFIG_CLS_U32_MARK
>  	u32			val;
>  	u32			mask;
> @@ -425,12 +426,18 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
>  	return 0;
>  }
>  
> -static bool u32_should_offload(struct net_device *dev)
> +static bool u32_should_offload(struct net_device *dev, u32 flags)
>  {
>  	if (!(dev->features & NETIF_F_HW_TC))
>  		return false;
>  
> -	return dev->netdev_ops->ndo_setup_tc;
> +	if (flags & TCA_U32_FLAGS_SOFTWARE)
> +		return false;
> +
> +	if (!dev->netdev_ops->ndo_setup_tc)
> +		return false;
> +
> +	return true;
>  }
>  
>  static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
> @@ -442,7 +449,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
>  	offload.type = TC_SETUP_CLSU32;
>  	offload.cls_u32 = &u32_offload;
>  
> -	if (u32_should_offload(dev)) {
> +	if (u32_should_offload(dev, 0)) {
>  		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
>  		offload.cls_u32->knode.handle = handle;
>  		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,

Here a request is made to delete the classifier from hardware regardless
of if TCA_U32_FLAGS_SOFTWARE is set or not. This seems sensible to me.


> @@ -450,7 +457,9 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
>  	}
>  }
>  
> -static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
> +static void u32_replace_hw_hnode(struct tcf_proto *tp,
> +				 struct tc_u_hnode *h,
> +				 u32 flags)
>  {
>  	struct net_device *dev = tp->q->dev_queue->dev;
>  	struct tc_cls_u32_offload u32_offload = {0};
> @@ -459,7 +468,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>  	offload.type = TC_SETUP_CLSU32;
>  	offload.cls_u32 = &u32_offload;
>  
> -	if (u32_should_offload(dev)) {
> +	if (u32_should_offload(dev, flags)) {
>  		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
>  		offload.cls_u32->hnode.divisor = h->divisor;
>  		offload.cls_u32->hnode.handle = h->handle;

But here an update is only made if flag is TCA_U32_FLAGS_SOFTWARE.

I wonder if this means we can get into a situation where the classifier
present in software and hardware differ. Something like this.

1. classifier is added to software and hardware (TCA_U32_FLAGS_SOFTWARE is set)
2. classifier is updated in software only (TCA_U32_FLAGS_SOFTWARE is not set)

> @@ -479,7 +488,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>  	offload.type = TC_SETUP_CLSU32;
>  	offload.cls_u32 = &u32_offload;
>  
> -	if (u32_should_offload(dev)) {
> +	if (u32_should_offload(dev, 0)) {
>  		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
>  		offload.cls_u32->hnode.divisor = h->divisor;
>  		offload.cls_u32->hnode.handle = h->handle;
> @@ -490,7 +499,9 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>  	}
>  }
>  
> -static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
> +static void u32_replace_hw_knode(struct tcf_proto *tp,
> +				 struct tc_u_knode *n,
> +				 u32 flags)
>  {
>  	struct net_device *dev = tp->q->dev_queue->dev;
>  	struct tc_cls_u32_offload u32_offload = {0};
> @@ -499,7 +510,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
>  	offload.type = TC_SETUP_CLSU32;
>  	offload.cls_u32 = &u32_offload;
>  
> -	if (u32_should_offload(dev)) {
> +	if (u32_should_offload(dev, flags)) {
>  		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
>  		offload.cls_u32->knode.handle = n->handle;
>  		offload.cls_u32->knode.fshift = n->fshift;

[snip]
John Fastabend Feb. 24, 2016, 7:24 a.m. UTC | #4
On 16-02-23 10:11 PM, Simon Horman wrote:
> On Tue, Feb 23, 2016 at 11:03:21AM -0800, John Fastabend wrote:
>> In the initial implementation the only way to stop a rule from being
>> inserted into the hardware table was via the device feature flag.
>> However this doesn't work well when working on an end host system
>> where packets are expect to hit both the hardware and software
>> datapaths.
>>
>> For example we can imagine a rule that will match an IP address and
>> increment a field. If we install this rule in both hardware and
>> software we may increment the field twice. To date we have only
>> added support for the drop action so we have been able to ignore
>> these cases. But as we extend the action support we will hit this
>> example plus more such cases. Arguably these are not even corner
>> cases in many working systems these cases will be common.
>>
>> To avoid forcing the driver to always abort (i.e. the above example)
>> this patch adds a flag to add a rule in software only. A careful
>> user can use this flag to build software and hardware datapaths
>> that work together. One example we have found particularly useful
>> is to use hardware resources to set the skb->mark on the skb when
>> the match may be expensive to run in software but a mark lookup
>> in a hash table is cheap. The idea here is hardware can do in one
>> lookup what the u32 classifier may need to traverse multiple lists
>> and hash tables to compute. The flag is only passed down on inserts
>> on deletion to avoid stale references in hardware we always try
>> to remove a rule if it exists.
>>
>> Notice we do not add a hardware only case here. If you were to
>> add a hardware only case then you are stuck with the problem
>> of where to stick the software representation of that filter
>> rule. If its stuck on the same filter list as the software only and
>> software/hardware rules it then has to be walked over and ignored
>> in the classify path. The overhead is not huge but is measurable.
>> And with so much work being invested in speeding up rx/tx of
>> pkt processing this is unacceptable IMO. The other option is to
>> have a special hook just for hardware only resources. This is
>> implemented in the next patch.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> 
> [snip]
> 
>> diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
>> index f766bcb..c509fc8 100644
>> --- a/net/sched/cls_u32.c
>> +++ b/net/sched/cls_u32.c
>> @@ -59,6 +59,7 @@ struct tc_u_knode {
>>  #ifdef CONFIG_CLS_U32_PERF
>>  	struct tc_u32_pcnt __percpu *pf;
>>  #endif
>> +	u32			flags;
>>  #ifdef CONFIG_CLS_U32_MARK
>>  	u32			val;
>>  	u32			mask;
>> @@ -425,12 +426,18 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
>>  	return 0;
>>  }
>>  
>> -static bool u32_should_offload(struct net_device *dev)
>> +static bool u32_should_offload(struct net_device *dev, u32 flags)
>>  {
>>  	if (!(dev->features & NETIF_F_HW_TC))
>>  		return false;
>>  
>> -	return dev->netdev_ops->ndo_setup_tc;
>> +	if (flags & TCA_U32_FLAGS_SOFTWARE)
>> +		return false;
>> +
>> +	if (!dev->netdev_ops->ndo_setup_tc)
>> +		return false;
>> +
>> +	return true;
>>  }
>>  
>>  static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
>> @@ -442,7 +449,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
>>  	offload.type = TC_SETUP_CLSU32;
>>  	offload.cls_u32 = &u32_offload;
>>  
>> -	if (u32_should_offload(dev)) {
>> +	if (u32_should_offload(dev, 0)) {
>>  		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
>>  		offload.cls_u32->knode.handle = handle;
>>  		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
> 
> Here a request is made to delete the classifier from hardware regardless
> of if TCA_U32_FLAGS_SOFTWARE is set or not. This seems sensible to me.
> 
> 
>> @@ -450,7 +457,9 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
>>  	}
>>  }
>>  
>> -static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>> +static void u32_replace_hw_hnode(struct tcf_proto *tp,
>> +				 struct tc_u_hnode *h,
>> +				 u32 flags)
>>  {
>>  	struct net_device *dev = tp->q->dev_queue->dev;
>>  	struct tc_cls_u32_offload u32_offload = {0};
>> @@ -459,7 +468,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>>  	offload.type = TC_SETUP_CLSU32;
>>  	offload.cls_u32 = &u32_offload;
>>  
>> -	if (u32_should_offload(dev)) {
>> +	if (u32_should_offload(dev, flags)) {
>>  		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
>>  		offload.cls_u32->hnode.divisor = h->divisor;
>>  		offload.cls_u32->hnode.handle = h->handle;
> 
> But here an update is only made if flag is TCA_U32_FLAGS_SOFTWARE.
> 
> I wonder if this means we can get into a situation where the classifier
> present in software and hardware differ. Something like this.
> 
> 1. classifier is added to software and hardware (TCA_U32_FLAGS_SOFTWARE is set)
> 2. classifier is updated in software only (TCA_U32_FLAGS_SOFTWARE is not set)
> 

So the update will only update the software copy in this case but the
handle wont change so when the delete eventually does come it will
delete both the software copy and hardware copy but they will be out
of sync.

It does seem better if we do not let the flags get changed and if a
rule is created with SOFTWARE only require it to stay SOFTWARE only
for the duration of the rule. And going the other way if a rule is
configured in both hardware/software do not let it get changed to
SOFTWARE only.

I'll send a v2 with this change.

>> @@ -479,7 +488,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>>  	offload.type = TC_SETUP_CLSU32;
>>  	offload.cls_u32 = &u32_offload;
>>  
>> -	if (u32_should_offload(dev)) {
>> +	if (u32_should_offload(dev, 0)) {
>>  		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
>>  		offload.cls_u32->hnode.divisor = h->divisor;
>>  		offload.cls_u32->hnode.handle = h->handle;
>> @@ -490,7 +499,9 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
>>  	}
>>  }
>>  
>> -static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
>> +static void u32_replace_hw_knode(struct tcf_proto *tp,
>> +				 struct tc_u_knode *n,
>> +				 u32 flags)
>>  {
>>  	struct net_device *dev = tp->q->dev_queue->dev;
>>  	struct tc_cls_u32_offload u32_offload = {0};
>> @@ -499,7 +510,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
>>  	offload.type = TC_SETUP_CLSU32;
>>  	offload.cls_u32 = &u32_offload;
>>  
>> -	if (u32_should_offload(dev)) {
>> +	if (u32_should_offload(dev, flags)) {
>>  		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
>>  		offload.cls_u32->knode.handle = n->handle;
>>  		offload.cls_u32->knode.fshift = n->fshift;
> 
> [snip]
>
Amir Vadai Feb. 24, 2016, 8:04 a.m. UTC | #5
On Tue, Feb 23, 2016 at 11:03:21AM -0800, John Fastabend wrote:
> In the initial implementation the only way to stop a rule from being
> inserted into the hardware table was via the device feature flag.
> However this doesn't work well when working on an end host system
> where packets are expect to hit both the hardware and software
> datapaths.
> 
> For example we can imagine a rule that will match an IP address and
> increment a field. If we install this rule in both hardware and
> software we may increment the field twice. To date we have only
> added support for the drop action so we have been able to ignore
> these cases. But as we extend the action support we will hit this
> example plus more such cases. Arguably these are not even corner
> cases in many working systems these cases will be common.
> 
> To avoid forcing the driver to always abort (i.e. the above example)
> this patch adds a flag to add a rule in software only. A careful
> user can use this flag to build software and hardware datapaths
> that work together. One example we have found particularly useful
> is to use hardware resources to set the skb->mark on the skb when
> the match may be expensive to run in software but a mark lookup
> in a hash table is cheap. The idea here is hardware can do in one
> lookup what the u32 classifier may need to traverse multiple lists
> and hash tables to compute. The flag is only passed down on inserts
> on deletion to avoid stale references in hardware we always try
> to remove a rule if it exists.
> 
> Notice we do not add a hardware only case here. If you were to
> add a hardware only case then you are stuck with the problem
> of where to stick the software representation of that filter
> rule. If its stuck on the same filter list as the software only and
> software/hardware rules it then has to be walked over and ignored
> in the classify path. The overhead is not huge but is measurable.
> And with so much work being invested in speeding up rx/tx of
> pkt processing this is unacceptable IMO. The other option is to
> have a special hook just for hardware only resources. This is
> implemented in the next patch.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

[...]

>  
> -static bool u32_should_offload(struct net_device *dev)
> +static bool u32_should_offload(struct net_device *dev, u32 flags)
>  {
>  	if (!(dev->features & NETIF_F_HW_TC))
>  		return false;
>  
> -	return dev->netdev_ops->ndo_setup_tc;
> +	if (flags & TCA_U32_FLAGS_SOFTWARE)
> +		return false;
> +
> +	if (!dev->netdev_ops->ndo_setup_tc)
> +		return false;
> +
> +	return true;
>  }
This function and flag should be a generic filter attribute - not just
u32.

Thanks,
Amir

[...]
Jiri Pirko Feb. 24, 2016, 8:40 a.m. UTC | #6
Wed, Feb 24, 2016 at 09:04:40AM CET, amir@vadai.me wrote:
>On Tue, Feb 23, 2016 at 11:03:21AM -0800, John Fastabend wrote:
>> In the initial implementation the only way to stop a rule from being
>> inserted into the hardware table was via the device feature flag.
>> However this doesn't work well when working on an end host system
>> where packets are expect to hit both the hardware and software
>> datapaths.
>> 
>> For example we can imagine a rule that will match an IP address and
>> increment a field. If we install this rule in both hardware and
>> software we may increment the field twice. To date we have only
>> added support for the drop action so we have been able to ignore
>> these cases. But as we extend the action support we will hit this
>> example plus more such cases. Arguably these are not even corner
>> cases in many working systems these cases will be common.
>> 
>> To avoid forcing the driver to always abort (i.e. the above example)
>> this patch adds a flag to add a rule in software only. A careful
>> user can use this flag to build software and hardware datapaths
>> that work together. One example we have found particularly useful
>> is to use hardware resources to set the skb->mark on the skb when
>> the match may be expensive to run in software but a mark lookup
>> in a hash table is cheap. The idea here is hardware can do in one
>> lookup what the u32 classifier may need to traverse multiple lists
>> and hash tables to compute. The flag is only passed down on inserts
>> on deletion to avoid stale references in hardware we always try
>> to remove a rule if it exists.
>> 
>> Notice we do not add a hardware only case here. If you were to
>> add a hardware only case then you are stuck with the problem
>> of where to stick the software representation of that filter
>> rule. If its stuck on the same filter list as the software only and
>> software/hardware rules it then has to be walked over and ignored
>> in the classify path. The overhead is not huge but is measurable.
>> And with so much work being invested in speeding up rx/tx of
>> pkt processing this is unacceptable IMO. The other option is to
>> have a special hook just for hardware only resources. This is
>> implemented in the next patch.
>> 
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>
>[...]
>
>>  
>> -static bool u32_should_offload(struct net_device *dev)
>> +static bool u32_should_offload(struct net_device *dev, u32 flags)
>>  {
>>  	if (!(dev->features & NETIF_F_HW_TC))
>>  		return false;
>>  
>> -	return dev->netdev_ops->ndo_setup_tc;
>> +	if (flags & TCA_U32_FLAGS_SOFTWARE)
>> +		return false;
>> +
>> +	if (!dev->netdev_ops->ndo_setup_tc)
>> +		return false;
>> +
>> +	return true;
>>  }
>This function and flag should be a generic filter attribute - not just
>u32.

I agree, this should be generic.

Regarding flags attr, we have the same situation as with other common
attrs:
TCA_U32_POLICE
TCA_FLOW_POLICE
TCA_CGROUP_POLICE
TCA_BPF_POLICE

TCA_U32_ACT
TCA_FLOW_ACT
TCA_CGROUP_ACT
TCA_BPF_ACT
TCA_FLOWER_ACT

I guess we have no other choice then to have
TCA_U32_FLAGS
TCA_FLOWER_FLAGS etc :(
John Fastabend Feb. 24, 2016, 8:55 a.m. UTC | #7
On 16-02-24 12:40 AM, Jiri Pirko wrote:
> Wed, Feb 24, 2016 at 09:04:40AM CET, amir@vadai.me wrote:
>> On Tue, Feb 23, 2016 at 11:03:21AM -0800, John Fastabend wrote:
>>> In the initial implementation the only way to stop a rule from being
>>> inserted into the hardware table was via the device feature flag.
>>> However this doesn't work well when working on an end host system
>>> where packets are expect to hit both the hardware and software
>>> datapaths.
>>>
>>> For example we can imagine a rule that will match an IP address and
>>> increment a field. If we install this rule in both hardware and
>>> software we may increment the field twice. To date we have only
>>> added support for the drop action so we have been able to ignore
>>> these cases. But as we extend the action support we will hit this
>>> example plus more such cases. Arguably these are not even corner
>>> cases in many working systems these cases will be common.
>>>
>>> To avoid forcing the driver to always abort (i.e. the above example)
>>> this patch adds a flag to add a rule in software only. A careful
>>> user can use this flag to build software and hardware datapaths
>>> that work together. One example we have found particularly useful
>>> is to use hardware resources to set the skb->mark on the skb when
>>> the match may be expensive to run in software but a mark lookup
>>> in a hash table is cheap. The idea here is hardware can do in one
>>> lookup what the u32 classifier may need to traverse multiple lists
>>> and hash tables to compute. The flag is only passed down on inserts
>>> on deletion to avoid stale references in hardware we always try
>>> to remove a rule if it exists.
>>>
>>> Notice we do not add a hardware only case here. If you were to
>>> add a hardware only case then you are stuck with the problem
>>> of where to stick the software representation of that filter
>>> rule. If its stuck on the same filter list as the software only and
>>> software/hardware rules it then has to be walked over and ignored
>>> in the classify path. The overhead is not huge but is measurable.
>>> And with so much work being invested in speeding up rx/tx of
>>> pkt processing this is unacceptable IMO. The other option is to
>>> have a special hook just for hardware only resources. This is
>>> implemented in the next patch.
>>>
>>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>>
>> [...]
>>
>>>  
>>> -static bool u32_should_offload(struct net_device *dev)
>>> +static bool u32_should_offload(struct net_device *dev, u32 flags)
>>>  {
>>>  	if (!(dev->features & NETIF_F_HW_TC))
>>>  		return false;
>>>  
>>> -	return dev->netdev_ops->ndo_setup_tc;
>>> +	if (flags & TCA_U32_FLAGS_SOFTWARE)
>>> +		return false;
>>> +
>>> +	if (!dev->netdev_ops->ndo_setup_tc)
>>> +		return false;
>>> +
>>> +	return true;
>>>  }
>> This function and flag should be a generic filter attribute - not just
>> u32.
> 
> I agree, this should be generic.
> 
> Regarding flags attr, we have the same situation as with other common
> attrs:
> TCA_U32_POLICE
> TCA_FLOW_POLICE
> TCA_CGROUP_POLICE
> TCA_BPF_POLICE
> 
> TCA_U32_ACT
> TCA_FLOW_ACT
> TCA_CGROUP_ACT
> TCA_BPF_ACT
> TCA_FLOWER_ACT
> 
> I guess we have no other choice then to have
> TCA_U32_FLAGS
> TCA_FLOWER_FLAGS etc :(
> 

Sure if you want to lift it out of u32 I can do that. Seeing there are
no other users I planned to do it when I added the next hardware
classifier. But sure I can do it now and save a patch later.

The flags however likely stays with with TCA_U32_FLAGS until there is
some better way to group common attributes in 'tc' framework.

.John
Jiri Benc Feb. 24, 2016, 9:29 a.m. UTC | #8
On Wed, 24 Feb 2016 00:55:55 -0800, John Fastabend wrote:
> The flags however likely stays with with TCA_U32_FLAGS until there is
> some better way to group common attributes in 'tc' framework.

That's pretty bad, as this is uAPI and will need to be supported
forever. And having a different attribute in every filter won't ease
things for user space tools. I'd say we need the "better way" to be
added before this patchset.

 Jiri
Jamal Hadi Salim Feb. 24, 2016, 1:31 p.m. UTC | #9
On 16-02-23 02:03 PM, John Fastabend wrote:
> In the initial implementation the only way to stop a rule from being
> inserted into the hardware table was via the device feature flag.
> However this doesn't work well when working on an end host system
> where packets are expect to hit both the hardware and software
> datapaths.
>
> For example we can imagine a rule that will match an IP address and
> increment a field. If we install this rule in both hardware and
> software we may increment the field twice. To date we have only
> added support for the drop action so we have been able to ignore
> these cases. But as we extend the action support we will hit this
> example plus more such cases. Arguably these are not even corner
> cases in many working systems these cases will be common.
>
> To avoid forcing the driver to always abort (i.e. the above example)
> this patch adds a flag to add a rule in software only. A careful
> user can use this flag to build software and hardware datapaths
> that work together. One example we have found particularly useful
> is to use hardware resources to set the skb->mark on the skb when
> the match may be expensive to run in software but a mark lookup
> in a hash table is cheap. The idea here is hardware can do in one
> lookup what the u32 classifier may need to traverse multiple lists
> and hash tables to compute. The flag is only passed down on inserts
> on deletion to avoid stale references in hardware we always try
> to remove a rule if it exists.
>
> Notice we do not add a hardware only case here. If you were to
> add a hardware only case then you are stuck with the problem
> of where to stick the software representation of that filter
> rule. If its stuck on the same filter list as the software only and
> software/hardware rules it then has to be walked over and ignored
> in the classify path. The overhead is not huge but is measurable.
> And with so much work being invested in speeding up rx/tx of
> pkt processing this is unacceptable IMO. The other option is to
> have a special hook just for hardware only resources. This is
> implemented in the next patch.
>

Dont have much time to look closely - will do later. Just wanted
to quip:
Would it make sense to have a user flag which says, "please store
this in s/ware - dont use it in s/ware just install it in h/ware."
This should be totally optional policy, but would help find the rules
faster from a control plane if i look for them in s/ware first.
There's some really freaking slow hardware interfaces out there...
(a record of 60 seconds to find something is not unheard of).

cheers,
jamal
John Fastabend Feb. 25, 2016, 4:04 a.m. UTC | #10
On 16-02-24 05:31 AM, Jamal Hadi Salim wrote:
> On 16-02-23 02:03 PM, John Fastabend wrote:
>> In the initial implementation the only way to stop a rule from being
>> inserted into the hardware table was via the device feature flag.
>> However this doesn't work well when working on an end host system
>> where packets are expect to hit both the hardware and software
>> datapaths.
>>
>> For example we can imagine a rule that will match an IP address and
>> increment a field. If we install this rule in both hardware and
>> software we may increment the field twice. To date we have only
>> added support for the drop action so we have been able to ignore
>> these cases. But as we extend the action support we will hit this
>> example plus more such cases. Arguably these are not even corner
>> cases in many working systems these cases will be common.
>>
>> To avoid forcing the driver to always abort (i.e. the above example)
>> this patch adds a flag to add a rule in software only. A careful
>> user can use this flag to build software and hardware datapaths
>> that work together. One example we have found particularly useful
>> is to use hardware resources to set the skb->mark on the skb when
>> the match may be expensive to run in software but a mark lookup
>> in a hash table is cheap. The idea here is hardware can do in one
>> lookup what the u32 classifier may need to traverse multiple lists
>> and hash tables to compute. The flag is only passed down on inserts
>> on deletion to avoid stale references in hardware we always try
>> to remove a rule if it exists.
>>
>> Notice we do not add a hardware only case here. If you were to
>> add a hardware only case then you are stuck with the problem
>> of where to stick the software representation of that filter
>> rule. If its stuck on the same filter list as the software only and
>> software/hardware rules it then has to be walked over and ignored
>> in the classify path. The overhead is not huge but is measurable.
>> And with so much work being invested in speeding up rx/tx of
>> pkt processing this is unacceptable IMO. The other option is to
>> have a special hook just for hardware only resources. This is
>> implemented in the next patch.
>>
> 
> Dont have much time to look closely - will do later. Just wanted
> to quip:
> Would it make sense to have a user flag which says, "please store
> this in s/ware - dont use it in s/ware just install it in h/ware."
> This should be totally optional policy, but would help find the rules
> faster from a control plane if i look for them in s/ware first.
> There's some really freaking slow hardware interfaces out there...
> (a record of 60 seconds to find something is not unheard of).

I think this is absolutely necessary not only for performance of
reporting the rules back to software but if we don't do it generically
the driver will have to do it anyways because doing the inverse
transformation from hw implementation to u32 is really tricky and in
fact with hnodes and knodes there are multiple cls_u32 "programs" that
functionally are the same so we have no way to return what the user
actually programmed without it. Further eBPF (the next classifier I'm
working on) is even worse in this regard. You can see my solution to
this "load in hardware" filter list in patch 4/4. See Jiri's comment
also on that and see if you agree.

> 
> cheers,
> jamal
John Fastabend Feb. 25, 2016, 4:09 a.m. UTC | #11
On 16-02-24 01:29 AM, Jiri Benc wrote:
> On Wed, 24 Feb 2016 00:55:55 -0800, John Fastabend wrote:
>> The flags however likely stays with with TCA_U32_FLAGS until there is
>> some better way to group common attributes in 'tc' framework.
> 
> That's pretty bad, as this is uAPI and will need to be supported
> forever. And having a different attribute in every filter won't ease
> things for user space tools. I'd say we need the "better way" to be
> added before this patchset.
> 
>  Jiri
> 

The 'tc' semantics seem to support this "pretty bad" API design
with many of the fields already duplicated. I suppose we could
put the flags at the same level as the TCA_* attributes but this
also doesn't seem right to me as it isn't actually handled until
we get into the TCA_#CLASSIFIER#_* set of attributes.

.John
Jamal Hadi Salim Feb. 25, 2016, 12:56 p.m. UTC | #12
On 16-02-24 11:04 PM, John Fastabend wrote:
> On 16-02-24 05:31 AM, Jamal Hadi Salim wrote:

> I think this is absolutely necessary not only for performance of
> reporting the rules back to software but if we don't do it generically
> the driver will have to do it anyways because doing the inverse
> transformation from hw implementation to u32 is really tricky and in
> fact with hnodes and knodes there are multiple cls_u32 "programs" that
> functionally are the same so we have no way to return what the user
> actually programmed without it. Further eBPF (the next classifier I'm
> working on) is even worse in this regard.

Ok, I guess there are multiple use cases for it ;->
Yes, with ebpf it will be worse because data and instructions are
inter- mingled (and our interest is in data only). Note:
Over the years this has been a big struggle for human
friendliness. I thought we didnt care about humans (as in automation)
but you are saying this will affect machines too ;-> We cant allow
that ;->

Note: You could decode u32 descriptions but it is an involved effort.
Example, see this feature in tc:
---
jhs@jhs1 tc -pretty filter ls dev $ETH parent ffff: protocol ip
filter pref 11 u32
filter pref 11 u32 fh 800: ht divisor 1
filter pref 11 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1
   match IP src 10.0.0.130/32
	action order 1: gact action drop
	 random type none pass val 0
	 index 1 ref 1 bind 1
----
See that "match" field reading in anglais? It requires more and more
additions of pretty printers that translate back.

What about adding some tag to allow for easy "babel translation"?

> You can see my solution to
> this "load in hardware" filter list in patch 4/4. See Jiri's comment
> also on that and see if you agree.

Ok, will do.

cheers,
jamal
Jamal Hadi Salim Feb. 25, 2016, 1:19 p.m. UTC | #13
On 16-02-24 11:09 PM, John Fastabend wrote:
> On 16-02-24 01:29 AM, Jiri Benc wrote:
>> On Wed, 24 Feb 2016 00:55:55 -0800, John Fastabend wrote:
>>> The flags however likely stays with with TCA_U32_FLAGS until there is
>>> some better way to group common attributes in 'tc' framework.
>>
>> That's pretty bad, as this is uAPI and will need to be supported
>> forever. And having a different attribute in every filter won't ease
>> things for user space tools. I'd say we need the "better way" to be
>> added before this patchset.
>>
>>   Jiri
>>
>
> The 'tc' semantics seem to support this "pretty bad" API design
> with many of the fields already duplicated.

Mostly this is a netlink-ism. Netlink has the same problem with
command name spaces. The problem is mixing verbs and nouns together.
I like the switchdev approach where you have very few verbs
(SET, GET etc) and the content of the object or path describes
the nouns. This is why initially i thought it was better to have
this offload passing by switchdev. Could we leverage some of that?

> I suppose we could
> put the flags at the same level as the TCA_* attributes but this
> also doesn't seem right to me as it isn't actually handled until
> we get into the TCA_#CLASSIFIER#_* set of attributes.
>

Could we not steal a couple of bits off netlink flags? Then it applies
to all subssystems.

cheers,
jamal
John Fastabend Feb. 25, 2016, 4:39 p.m. UTC | #14
On 16-02-25 05:19 AM, Jamal Hadi Salim wrote:
> On 16-02-24 11:09 PM, John Fastabend wrote:
>> On 16-02-24 01:29 AM, Jiri Benc wrote:
>>> On Wed, 24 Feb 2016 00:55:55 -0800, John Fastabend wrote:
>>>> The flags however likely stays with with TCA_U32_FLAGS until there is
>>>> some better way to group common attributes in 'tc' framework.
>>>
>>> That's pretty bad, as this is uAPI and will need to be supported
>>> forever. And having a different attribute in every filter won't ease
>>> things for user space tools. I'd say we need the "better way" to be
>>> added before this patchset.
>>>
>>>   Jiri
>>>
>>
>> The 'tc' semantics seem to support this "pretty bad" API design
>> with many of the fields already duplicated.
> 
> Mostly this is a netlink-ism. Netlink has the same problem with
> command name spaces. The problem is mixing verbs and nouns together.
> I like the switchdev approach where you have very few verbs
> (SET, GET etc) and the content of the object or path describes
> the nouns. This is why initially i thought it was better to have
> this offload passing by switchdev. Could we leverage some of that?
> 

No I don't think so. Either way you need some flag at the 'tc' layer
to push this down to hardware offload ops.

>> I suppose we could
>> put the flags at the same level as the TCA_* attributes but this
>> also doesn't seem right to me as it isn't actually handled until
>> we get into the TCA_#CLASSIFIER#_* set of attributes.
>>
> 
> Could we not steal a couple of bits off netlink flags? Then it applies
> to all subssystems.

I did that in some original patch I never sent but I didn't like it.
Netlink is used for all sorts of things and those flags already have
a meaning. We've already started this trend of using flags inside the
specific messages for other things like the bridge code. And there are
only a few bits left in the rtnetlink flags field I would prefer to
save them for something necessary.

In the end I think the solution here is really not that bad the
userspace code to add it is trivial (~5 lines of code). Its not
how I would do it if I was writing the entire OS from scratch
but we have to maintain UAPI so not sure I have any better solutions.

> 
> cheers,
> jamal
>
John Fastabend Feb. 25, 2016, 9:56 p.m. UTC | #15
On 16-02-25 04:56 AM, Jamal Hadi Salim wrote:
> On 16-02-24 11:04 PM, John Fastabend wrote:
>> On 16-02-24 05:31 AM, Jamal Hadi Salim wrote:
> 
>> I think this is absolutely necessary not only for performance of
>> reporting the rules back to software but if we don't do it generically
>> the driver will have to do it anyways because doing the inverse
>> transformation from hw implementation to u32 is really tricky and in
>> fact with hnodes and knodes there are multiple cls_u32 "programs" that
>> functionally are the same so we have no way to return what the user
>> actually programmed without it. Further eBPF (the next classifier I'm
>> working on) is even worse in this regard.
> 
> Ok, I guess there are multiple use cases for it ;->
> Yes, with ebpf it will be worse because data and instructions are
> inter- mingled (and our interest is in data only). Note:
> Over the years this has been a big struggle for human
> friendliness. I thought we didnt care about humans (as in automation)
> but you are saying this will affect machines too ;-> We cant allow
> that ;->
> 
> Note: You could decode u32 descriptions but it is an involved effort.
> Example, see this feature in tc:
> ---
> jhs@jhs1 tc -pretty filter ls dev $ETH parent ffff: protocol ip
> filter pref 11 u32
> filter pref 11 u32 fh 800: ht divisor 1
> filter pref 11 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1
>   match IP src 10.0.0.130/32
>     action order 1: gact action drop
>      random type none pass val 0
>      index 1 ref 1 bind 1
> ----

decoding that is not a problem. The ixgbe driver code already applied
can decode that without much trouble. The thing I want to avoid is
requiring my driver to do the inverse translation because although it
is possible its entirely unnecessary.

To do the above example without a software cache for example
means I read out row 2048 of a hardware table then you get a bunch of
bits. From those bits I consult what fields are being loaded into
the table in the packet processing pipeline. I learn its the src_ip
fields then I have the value/mask and can unwind it. Finally if I
collapsed some hash tables onto this hardware table I have to do the
inverse of my collapsing scheme. The ixgbe one is sort of simple just
because I only have one table in hardware but with multiple tables
its a bit more difficult. Finally I've unwound the thing and can
print something back out of 'tc' but it seems easier to just cache
the hardware rules somewhere. Maybe other driver/hardware will have
a different opinion though depending on how much your firmware can
store and how ambitious you are. Personally I don't see any need
for the above code.

> See that "match" field reading in anglais? It requires more and more
> additions of pretty printers that translate back.
> 
> What about adding some tag to allow for easy "babel translation"?
> 

not sure what this would be...

>> You can see my solution to
>> this "load in hardware" filter list in patch 4/4. See Jiri's comment
>> also on that and see if you agree.
> 
> Ok, will do.
> 
> cheers,
> jamal
Jamal Hadi Salim Feb. 25, 2016, 11:05 p.m. UTC | #16
On 16-02-25 04:56 PM, John Fastabend wrote:
> On 16-02-25 04:56 AM, Jamal Hadi Salim wrote:

>
> decoding that is not a problem. The ixgbe driver code already applied
> can decode that without much trouble. The thing I want to avoid is
> requiring my driver to do the inverse translation because although it
> is possible its entirely unnecessary.
>
> To do the above example without a software cache for example
> means I read out row 2048 of a hardware table then you get a bunch of
> bits. From those bits I consult what fields are being loaded into
> the table in the packet processing pipeline. I learn its the src_ip
> fields then I have the value/mask and can unwind it. Finally if I
> collapsed some hash tables onto this hardware table I have to do the
> inverse of my collapsing scheme. The ixgbe one is sort of simple just
> because I only have one table in hardware but with multiple tables
> its a bit more difficult. Finally I've unwound the thing and can
> print something back out of 'tc' but it seems easier to just cache
> the hardware rules somewhere. Maybe other driver/hardware will have
> a different opinion though depending on how much your firmware can
> store and how ambitious you are. Personally I don't see any need
> for the above code.
>

I think if you can cache the rules and have a way to easily map to
the hardware then this would work fine.

cheers,
jamal
John Fastabend Feb. 25, 2016, 11:08 p.m. UTC | #17
On 16-02-25 03:05 PM, Jamal Hadi Salim wrote:
> On 16-02-25 04:56 PM, John Fastabend wrote:
>> On 16-02-25 04:56 AM, Jamal Hadi Salim wrote:
> 
>>
>> decoding that is not a problem. The ixgbe driver code already applied
>> can decode that without much trouble. The thing I want to avoid is
>> requiring my driver to do the inverse translation because although it
>> is possible its entirely unnecessary.
>>
>> To do the above example without a software cache for example
>> means I read out row 2048 of a hardware table then you get a bunch of
>> bits. From those bits I consult what fields are being loaded into
>> the table in the packet processing pipeline. I learn its the src_ip
>> fields then I have the value/mask and can unwind it. Finally if I
>> collapsed some hash tables onto this hardware table I have to do the
>> inverse of my collapsing scheme. The ixgbe one is sort of simple just
>> because I only have one table in hardware but with multiple tables
>> its a bit more difficult. Finally I've unwound the thing and can
>> print something back out of 'tc' but it seems easier to just cache
>> the hardware rules somewhere. Maybe other driver/hardware will have
>> a different opinion though depending on how much your firmware can
>> store and how ambitious you are. Personally I don't see any need
>> for the above code.
>>
> 
> I think if you can cache the rules and have a way to easily map to
> the hardware then this would work fine.

Yep that is the goal. I think the debate is if its acceptable to do
it with an entirely new filter list ingress_hw Jiri's opinion is that
it would be best to do it inline inside the classifier. At the moment
I'm looking at the code to see if there is a clean way to do it. IMO
using a ingress_hw classifier is a nice solution.

In the meantime I just respun patches 1-3 with the feedback and will
submit those while I work out patch 4.

> 
> cheers,
> jamal
diff mbox

Patch

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 4398737..143ea21 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -160,6 +160,8 @@  enum {
 #define TC_U32_UNSPEC	0
 #define TC_U32_ROOT	(0xFFF00000)
 
+#define TCA_U32_FLAGS_SOFTWARE 1
+
 enum {
 	TCA_U32_UNSPEC,
 	TCA_U32_CLASSID,
@@ -172,6 +174,7 @@  enum {
 	TCA_U32_INDEV,
 	TCA_U32_PCNT,
 	TCA_U32_MARK,
+	TCA_U32_FLAGS,
 	__TCA_U32_MAX
 };
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index f766bcb..c509fc8 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -59,6 +59,7 @@  struct tc_u_knode {
 #ifdef CONFIG_CLS_U32_PERF
 	struct tc_u32_pcnt __percpu *pf;
 #endif
+	u32			flags;
 #ifdef CONFIG_CLS_U32_MARK
 	u32			val;
 	u32			mask;
@@ -425,12 +426,18 @@  static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 	return 0;
 }
 
-static bool u32_should_offload(struct net_device *dev)
+static bool u32_should_offload(struct net_device *dev, u32 flags)
 {
 	if (!(dev->features & NETIF_F_HW_TC))
 		return false;
 
-	return dev->netdev_ops->ndo_setup_tc;
+	if (flags & TCA_U32_FLAGS_SOFTWARE)
+		return false;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		return false;
+
+	return true;
 }
 
 static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
@@ -442,7 +449,7 @@  static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (u32_should_offload(dev)) {
+	if (u32_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
 		offload.cls_u32->knode.handle = handle;
 		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -450,7 +457,9 @@  static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	}
 }
 
-static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+static void u32_replace_hw_hnode(struct tcf_proto *tp,
+				 struct tc_u_hnode *h,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -459,7 +468,7 @@  static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (u32_should_offload(dev)) {
+	if (u32_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -479,7 +488,7 @@  static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (u32_should_offload(dev)) {
+	if (u32_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -490,7 +499,9 @@  static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	}
 }
 
-static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
+static void u32_replace_hw_knode(struct tcf_proto *tp,
+				 struct tc_u_knode *n,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -499,7 +510,7 @@  static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (u32_should_offload(dev)) {
+	if (u32_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
 		offload.cls_u32->knode.handle = n->handle;
 		offload.cls_u32->knode.fshift = n->fshift;
@@ -687,6 +698,7 @@  static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
 	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
 	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
+	[TCA_U32_FLAGS]		= { .type = NLA_U32 },
 };
 
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
@@ -833,7 +845,7 @@  static int u32_change(struct net *net, struct sk_buff *in_skb,
 	struct tc_u32_sel *s;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_U32_MAX + 1];
-	u32 htid;
+	u32 htid, flags = 0;
 	int err;
 #ifdef CONFIG_CLS_U32_PERF
 	size_t size;
@@ -846,6 +858,9 @@  static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
+	if (tb[TCA_U32_FLAGS])
+		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+
 	n = (struct tc_u_knode *)*arg;
 	if (n) {
 		struct tc_u_knode *new;
@@ -869,7 +884,7 @@  static int u32_change(struct net *net, struct sk_buff *in_skb,
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
-		u32_replace_hw_knode(tp, new);
+		u32_replace_hw_knode(tp, new, flags);
 		return 0;
 	}
 
@@ -897,7 +912,7 @@  static int u32_change(struct net *net, struct sk_buff *in_skb,
 		rcu_assign_pointer(tp_c->hlist, ht);
 		*arg = (unsigned long)ht;
 
-		u32_replace_hw_hnode(tp, ht);
+		u32_replace_hw_hnode(tp, ht, flags);
 		return 0;
 	}
 
@@ -948,6 +963,7 @@  static int u32_change(struct net *net, struct sk_buff *in_skb,
 	RCU_INIT_POINTER(n->ht_up, ht);
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+	n->flags = flags;
 	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
 	n->tp = tp;
 
@@ -980,7 +996,7 @@  static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
-		u32_replace_hw_knode(tp, n);
+		u32_replace_hw_knode(tp, n, flags);
 		*arg = (unsigned long)n;
 		return 0;
 	}
@@ -1085,6 +1101,9 @@  static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
 			goto nla_put_failure;
 
+		if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
+			goto nla_put_failure;
+
 #ifdef CONFIG_CLS_U32_MARK
 		if ((n->val || n->mask)) {
 			struct tc_u32_mark mark = {.val = n->val,