diff mbox

[net-next,v8,2/3] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch

Message ID 1493121247-11863-3-git-send-email-jhs@emojatatu.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Jamal Hadi Salim April 25, 2017, 11:54 a.m. UTC
From: Jamal Hadi Salim <jhs@mojatatu.com>

When you dump hundreds of thousands of actions, getting only 32 per
dump batch even when the socket buffer and memory allocations allow
is inefficient.

With this change, the user will get as many as possibly fitting
within the given constraints available to the kernel.

The top level action TLV space is extended. An attribute
TCA_ROOT_FLAGS is used to carry flags; flag TCA_FLAG_LARGE_DUMP_ON
is set by the user indicating the user is capable of processing
these large dumps. Older user space which doesnt set this flag
doesnt get the large (than 32) batches.
The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many
actions are put in a single batch. As such user space app knows how long
to iterate (independent of the type of action being dumped)
instead of hardcoded maximum of 32.

Some results dumping 1.5M actions, first unpatched tc which the
kernel doesnt help:

prompt$ time -p tc actions ls action gact | grep index | wc -l
1500000
real 1388.43
user 2.07
sys 1386.79

Now lets see a patched tc which sets the correct flags when requesting
a dump:

prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
1500000
real 178.13
user 2.02
sys 176.96

That is about 8x performance improvement for tc which sets its
receive buffer to about 32K.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 include/uapi/linux/rtnetlink.h | 22 ++++++++++++++--
 net/sched/act_api.c            | 59 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 69 insertions(+), 12 deletions(-)

Comments

Jiri Pirko April 25, 2017, 12:13 p.m. UTC | #1
Tue, Apr 25, 2017 at 01:54:06PM CEST, jhs@mojatatu.com wrote:
>From: Jamal Hadi Salim <jhs@mojatatu.com>
>
>When you dump hundreds of thousands of actions, getting only 32 per
>dump batch even when the socket buffer and memory allocations allow
>is inefficient.
>
>With this change, the user will get as many as possibly fitting
>within the given constraints available to the kernel.
>
>The top level action TLV space is extended. An attribute
>TCA_ROOT_FLAGS is used to carry flags; flag TCA_FLAG_LARGE_DUMP_ON
>is set by the user indicating the user is capable of processing
>these large dumps. Older user space which doesnt set this flag
>doesnt get the large (than 32) batches.
>The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many
>actions are put in a single batch. As such user space app knows how long
>to iterate (independent of the type of action being dumped)
>instead of hardcoded maximum of 32.
>
>Some results dumping 1.5M actions, first unpatched tc which the
>kernel doesnt help:
>
>prompt$ time -p tc actions ls action gact | grep index | wc -l
>1500000
>real 1388.43
>user 2.07
>sys 1386.79
>
>Now lets see a patched tc which sets the correct flags when requesting
>a dump:
>
>prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
>1500000
>real 178.13
>user 2.02
>sys 176.96
>
>That is about 8x performance improvement for tc which sets its
>receive buffer to about 32K.
>
>Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>---
> include/uapi/linux/rtnetlink.h | 22 ++++++++++++++--
> net/sched/act_api.c            | 59 +++++++++++++++++++++++++++++++++++-------
> 2 files changed, 69 insertions(+), 12 deletions(-)
>
>diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>index cce0613..5875467 100644
>--- a/include/uapi/linux/rtnetlink.h
>+++ b/include/uapi/linux/rtnetlink.h
>@@ -674,10 +674,28 @@ struct tcamsg {
> 	unsigned char	tca__pad1;
> 	unsigned short	tca__pad2;
> };
>+
>+enum {
>+	TCA_ROOT_UNSPEC,
>+	TCA_ROOT_TAB,
>+#define TCA_ACT_TAB TCA_ROOT_TAB
>+	TCA_ROOT_FLAGS,
>+	TCA_ROOT_COUNT,
>+	__TCA_ROOT_MAX,
>+#define	TCA_ROOT_MAX (__TCA_ROOT_MAX - 1)
>+};
>+
> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>-#define TCAA_MAX 1
>+/* tcamsg flags stored in attribute TCA_ROOT_FLAGS
>+ *
>+ * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>+ * actions in a dump. All dump responses will contain the number of actions
>+ * being dumped stored in for user app's consumption in TCA_ROOT_COUNT
>+ *
>+ */
>+#define TCA_FLAG_LARGE_DUMP_ON		(1 << 0)

BIT (I think I mentioned this before)

>+#define VALID_TCA_FLAGS TCA_FLAG_LARGE_DUMP_ON

Again, namespace please. "TCA_ROOT_FLAGS_VALID"
Why is this UAPI?


> 
> /* New extended info filters for IFLA_EXT_MASK */
> #define RTEXT_FILTER_VF		(1 << 0)
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index 9ce22b7..2756213 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> 			   struct netlink_callback *cb)
> {
> 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>+	u32 act_flags = cb->args[2];
> 	struct nlattr *nest;
> 
> 	spin_lock_bh(&hinfo->lock);
>@@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> 			}
> 			nla_nest_end(skb, nest);
> 			n_i++;
>-			if (n_i >= TCA_ACT_MAX_PRIO)
>+			if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
>+			    n_i >= TCA_ACT_MAX_PRIO)
> 				goto done;
> 		}
> 	}
> done:
> 	spin_unlock_bh(&hinfo->lock);
>-	if (n_i)
>+	if (n_i) {
> 		cb->args[0] += n_i;
>+		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
>+			cb->args[1] = n_i;
>+	}
> 	return n_i;
> 
> nla_put_failure:
>@@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
> 	return tcf_add_notify(net, n, &actions, portid);
> }
> 
>+static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
>+	[TCA_ROOT_FLAGS]      = { .type = NLA_U32 },
>+};
>+
> static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 			 struct netlink_ext_ack *extack)
> {
> 	struct net *net = sock_net(skb->sk);
>-	struct nlattr *tca[TCAA_MAX + 1];
>+	struct nlattr *tca[TCA_ROOT_MAX + 1];
> 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
> 	int ret = 0, ovr = 0;
> 
>@@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 	    !netlink_capable(skb, CAP_NET_ADMIN))
> 		return -EPERM;
> 
>-	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, NULL,
>+	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
> 			  extack);
> 	if (ret < 0)
> 		return ret;
>@@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 	return ret;
> }
> 
>-static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>+static struct nlattr *find_dump_kind(struct nlattr **nla)
> {
> 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
> 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
>-	struct nlattr *nla[TCAA_MAX + 1];
> 	struct nlattr *kind;
> 
>-	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
>-			NULL, NULL) < 0)
>-		return NULL;
> 	tb1 = nla[TCA_ACT_TAB];
> 	if (tb1 == NULL)
> 		return NULL;
>@@ -1073,6 +1078,16 @@ static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
> 	return kind;
> }
> 
>+static inline bool tca_flags_valid(u32 act_flags)
>+{
>+	u32 invalid_flags_mask  = ~VALID_TCA_FLAGS;
>+
>+	if (act_flags & invalid_flags_mask)
>+		return false;

I don't see how this resolves anything. VALID_TCA_FLAGS is set in stone
not going to change anytime in future, right? Then the TCA_ROOT_FLAGS
attr will always contain only one flag, right?
Then, I don't see why do we need this dance... u8 flag attr resolves
this in elegant way. I know I sound like a broken record. This is the
last time I'm commenting on this. I give up.


>+
>+	return true;
>+}
>+
> static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> {
> 	struct net *net = sock_net(skb->sk);
>@@ -1082,8 +1097,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	struct tc_action_ops *a_o;
> 	int ret = 0;
> 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
>-	struct nlattr *kind = find_dump_kind(cb->nlh);
>+	struct nlattr *count_attr = NULL;
>+	struct nlattr *tb[TCA_ROOT_MAX + 1];
>+	struct nlattr *kind = NULL;
>+	u32 act_flags = 0;
>+	u32 act_count = 0;
>+
>+	ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
>+			  tcaa_policy, NULL);
>+	if (ret < 0)
>+		return ret;
> 
>+	kind = find_dump_kind(tb);
> 	if (kind == NULL) {
> 		pr_info("tc_dump_action: action bad kind\n");
> 		return 0;
>@@ -1093,14 +1118,25 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	if (a_o == NULL)
> 		return 0;
> 
>+	if (tb[TCA_ROOT_FLAGS])
>+		act_flags = nla_get_u32(tb[TCA_ROOT_FLAGS]);
>+
>+	if (act_flags && !tca_flags_valid(act_flags))
>+		return -EINVAL;
>+
> 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> 			cb->nlh->nlmsg_type, sizeof(*t), 0);
> 	if (!nlh)
> 		goto out_module_put;
>+
>+	cb->args[2] = act_flags;
> 	t = nlmsg_data(nlh);
> 	t->tca_family = AF_UNSPEC;
> 	t->tca__pad1 = 0;
> 	t->tca__pad2 = 0;
>+	count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32));
>+	if (!count_attr)
>+		goto out_module_put;
> 
> 	nest = nla_nest_start(skb, TCA_ACT_TAB);
> 	if (nest == NULL)
>@@ -1113,6 +1149,9 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	if (ret > 0) {
> 		nla_nest_end(skb, nest);
> 		ret = skb->len;
>+		act_count = cb->args[1];
>+		memcpy(nla_data(count_attr), &act_count, sizeof(u32));
>+		cb->args[1] = 0;
> 	} else
> 		nlmsg_trim(skb, b);
> 
>-- 
>1.9.1
>
Jamal Hadi Salim April 25, 2017, 1:01 p.m. UTC | #2
On 17-04-25 08:13 AM, Jiri Pirko wrote:
> Tue, Apr 25, 2017 at 01:54:06PM CEST, jhs@mojatatu.com wrote:


[..]

>> -#define TCAA_MAX 1
>> +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS
>> + *
>> + * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>> + * actions in a dump. All dump responses will contain the number of actions
>> + * being dumped stored in for user app's consumption in TCA_ROOT_COUNT
>> + *
>> + */
>> +#define TCA_FLAG_LARGE_DUMP_ON		(1 << 0)
>
> BIT (I think I mentioned this before)
>

You did - but i took it out about two submissions back (per cover
letter) because it is no part of UAPI today. I noticed devlink was
using it but they defined  their own variant.
So if i added this, iproute2 doesnt compile. I could fix iproute2
to move it somewhere to a common header then restore this.

>> +#define VALID_TCA_FLAGS TCA_FLAG_LARGE_DUMP_ON
>
> Again, namespace please. "TCA_ROOT_FLAGS_VALID"

Good point.

> Why is this UAPI?
>

Should not be. I will fix in next update.


>
>>
>> /* New extended info filters for IFLA_EXT_MASK */
>> #define RTEXT_FILTER_VF		(1 << 0)
>> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>> index 9ce22b7..2756213 100644
>> --- a/net/sched/act_api.c
>> +++ b/net/sched/act_api.c
>> @@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> 			   struct netlink_callback *cb)
>> {
>> 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>> +	u32 act_flags = cb->args[2];
>> 	struct nlattr *nest;
>>
>> 	spin_lock_bh(&hinfo->lock);
>> @@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> 			}
>> 			nla_nest_end(skb, nest);
>> 			n_i++;
>> -			if (n_i >= TCA_ACT_MAX_PRIO)
>> +			if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
>> +			    n_i >= TCA_ACT_MAX_PRIO)
>> 				goto done;
>> 		}
>> 	}
>> done:
>> 	spin_unlock_bh(&hinfo->lock);
>> -	if (n_i)
>> +	if (n_i) {
>> 		cb->args[0] += n_i;
>> +		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
>> +			cb->args[1] = n_i;
>> +	}
>> 	return n_i;
>>
>> nla_put_failure:
>> @@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
>> 	return tcf_add_notify(net, n, &actions, portid);
>> }
>>
>> +static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
>> +	[TCA_ROOT_FLAGS]      = { .type = NLA_U32 },
>> +};
>> +
>> static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> 			 struct netlink_ext_ack *extack)
>> {
>> 	struct net *net = sock_net(skb->sk);
>> -	struct nlattr *tca[TCAA_MAX + 1];
>> +	struct nlattr *tca[TCA_ROOT_MAX + 1];
>> 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
>> 	int ret = 0, ovr = 0;
>>
>> @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> 	    !netlink_capable(skb, CAP_NET_ADMIN))
>> 		return -EPERM;
>>
>> -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, NULL,
>> +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
>> 			  extack);
>> 	if (ret < 0)
>> 		return ret;
>> @@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> 	return ret;
>> }
>>
>> -static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>> +static struct nlattr *find_dump_kind(struct nlattr **nla)
>> {
>> 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
>> 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
>> -	struct nlattr *nla[TCAA_MAX + 1];
>> 	struct nlattr *kind;
>>
>> -	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
>> -			NULL, NULL) < 0)
>> -		return NULL;
>> 	tb1 = nla[TCA_ACT_TAB];
>> 	if (tb1 == NULL)
>> 		return NULL;
>> @@ -1073,6 +1078,16 @@ static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>> 	return kind;
>> }
>>
>> +static inline bool tca_flags_valid(u32 act_flags)
>> +{
>> +	u32 invalid_flags_mask  = ~VALID_TCA_FLAGS;
>> +
>> +	if (act_flags & invalid_flags_mask)
>> +		return false;
>
> I don't see how this resolves anything. VALID_TCA_FLAGS is set in stone
> not going to change anytime in future, right?

Every time a new bit gets added VALID_TCA_FLAGS changes.

>Then the TCA_ROOT_FLAGS
> attr will always contain only one flag, right?

Not true - forever. Just in this patch discussion:
I have added 2 other flags since removed. So I cant predict the
future. You keep coming back to this assumption there will always
ever be this one flag. I am not following how you reach this
conclusion.

> Then, I don't see why do we need this dance... u8 flag attr resolves
> this in elegant way. I know I sound like a broken record. This is the
> last time I'm commenting on this. I give up.
>

Why is a u8 better than u32 Jiri?
The TLV space consumed is still 64 bits in both cases. If I define u8,
u16, u32 - it is still 64 bits being alloced. If i use a u8 then i have
24 bits which are pads - given current discussions I can never ever use
again!

If i keep 32 bits I am free to use those without ever changing the
data structure (except for the restrictions to now make sure nothing
gets set).

cheers,
jamal
Jiri Pirko April 25, 2017, 4:04 p.m. UTC | #3
Tue, Apr 25, 2017 at 03:01:22PM CEST, jhs@mojatatu.com wrote:
>On 17-04-25 08:13 AM, Jiri Pirko wrote:
>> Tue, Apr 25, 2017 at 01:54:06PM CEST, jhs@mojatatu.com wrote:
>
>
>[..]
>
>> > -#define TCAA_MAX 1
>> > +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS
>> > + *
>> > + * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>> > + * actions in a dump. All dump responses will contain the number of actions
>> > + * being dumped stored in for user app's consumption in TCA_ROOT_COUNT
>> > + *
>> > + */
>> > +#define TCA_FLAG_LARGE_DUMP_ON		(1 << 0)
>> 
>> BIT (I think I mentioned this before)
>> 
>
>You did - but i took it out about two submissions back (per cover
>letter) because it is no part of UAPI today. I noticed devlink was
>using it but they defined  their own variant.
>So if i added this, iproute2 doesnt compile. I could fix iproute2
>to move it somewhere to a common header then restore this.

So fix iproute2. It is always first kernel, then iproute2.


>
>> > +#define VALID_TCA_FLAGS TCA_FLAG_LARGE_DUMP_ON
>> 
>> Again, namespace please. "TCA_ROOT_FLAGS_VALID"
>
>Good point.
>
>> Why is this UAPI?
>> 
>
>Should not be. I will fix in next update.
>
>
>> 
>> > 
>> > /* New extended info filters for IFLA_EXT_MASK */
>> > #define RTEXT_FILTER_VF		(1 << 0)
>> > diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>> > index 9ce22b7..2756213 100644
>> > --- a/net/sched/act_api.c
>> > +++ b/net/sched/act_api.c
>> > @@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> > 			   struct netlink_callback *cb)
>> > {
>> > 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>> > +	u32 act_flags = cb->args[2];
>> > 	struct nlattr *nest;
>> > 
>> > 	spin_lock_bh(&hinfo->lock);
>> > @@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> > 			}
>> > 			nla_nest_end(skb, nest);
>> > 			n_i++;
>> > -			if (n_i >= TCA_ACT_MAX_PRIO)
>> > +			if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
>> > +			    n_i >= TCA_ACT_MAX_PRIO)
>> > 				goto done;
>> > 		}
>> > 	}
>> > done:
>> > 	spin_unlock_bh(&hinfo->lock);
>> > -	if (n_i)
>> > +	if (n_i) {
>> > 		cb->args[0] += n_i;
>> > +		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
>> > +			cb->args[1] = n_i;
>> > +	}
>> > 	return n_i;
>> > 
>> > nla_put_failure:
>> > @@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
>> > 	return tcf_add_notify(net, n, &actions, portid);
>> > }
>> > 
>> > +static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
>> > +	[TCA_ROOT_FLAGS]      = { .type = NLA_U32 },
>> > +};
>> > +
>> > static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 			 struct netlink_ext_ack *extack)
>> > {
>> > 	struct net *net = sock_net(skb->sk);
>> > -	struct nlattr *tca[TCAA_MAX + 1];
>> > +	struct nlattr *tca[TCA_ROOT_MAX + 1];
>> > 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
>> > 	int ret = 0, ovr = 0;
>> > 
>> > @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 	    !netlink_capable(skb, CAP_NET_ADMIN))
>> > 		return -EPERM;
>> > 
>> > -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, NULL,
>> > +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
>> > 			  extack);
>> > 	if (ret < 0)
>> > 		return ret;
>> > @@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 	return ret;
>> > }
>> > 
>> > -static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>> > +static struct nlattr *find_dump_kind(struct nlattr **nla)
>> > {
>> > 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
>> > 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
>> > -	struct nlattr *nla[TCAA_MAX + 1];
>> > 	struct nlattr *kind;
>> > 
>> > -	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
>> > -			NULL, NULL) < 0)
>> > -		return NULL;
>> > 	tb1 = nla[TCA_ACT_TAB];
>> > 	if (tb1 == NULL)
>> > 		return NULL;
>> > @@ -1073,6 +1078,16 @@ static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>> > 	return kind;
>> > }
>> > 
>> > +static inline bool tca_flags_valid(u32 act_flags)
>> > +{
>> > +	u32 invalid_flags_mask  = ~VALID_TCA_FLAGS;
>> > +
>> > +	if (act_flags & invalid_flags_mask)
>> > +		return false;
>> 
>> I don't see how this resolves anything. VALID_TCA_FLAGS is set in stone
>> not going to change anytime in future, right?
>
>Every time a new bit gets added VALID_TCA_FLAGS changes.

You mean flag that user can set? If that is the case, you are breaking
UAPI for newer app running on older kernel.


>
>> Then the TCA_ROOT_FLAGS
>> attr will always contain only one flag, right?
>
>Not true - forever. Just in this patch discussion:
>I have added 2 other flags since removed. So I cant predict the
>future. You keep coming back to this assumption there will always
>ever be this one flag. I am not following how you reach this
>conclusion.

I read this paragraph 5 times, still I don't get what you say :/


>
>> Then, I don't see why do we need this dance... u8 flag attr resolves
>> this in elegant way. I know I sound like a broken record. This is the
>> last time I'm commenting on this. I give up.
>> 
>
>Why is a u8 better than u32 Jiri?
>The TLV space consumed is still 64 bits in both cases. If I define u8,
>u16, u32 - it is still 64 bits being alloced. If i use a u8 then i have
>24 bits which are pads - given current discussions I can never ever use
>again!

I don't care, use u8 or u32. Just 1 attr - 1 flag.


>
>If i keep 32 bits I am free to use those without ever changing the
>data structure (except for the restrictions to now make sure nothing
>gets set).

what datastructure? I'm confused.

>
>cheers,
>jamal
Jamal Hadi Salim April 25, 2017, 8:29 p.m. UTC | #4
On 17-04-25 12:04 PM, Jiri Pirko wrote:
> Tue, Apr 25, 2017 at 03:01:22PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-25 08:13 AM, Jiri Pirko wrote:
>>> Tue, Apr 25, 2017 at 01:54:06PM CEST, jhs@mojatatu.com wrote:

>>>>
>>>> +static inline bool tca_flags_valid(u32 act_flags)
>>>> +{
>>>> +	u32 invalid_flags_mask  = ~VALID_TCA_FLAGS;
>>>> +
>>>> +	if (act_flags & invalid_flags_mask)
>>>> +		return false;
>>>
>>> I don't see how this resolves anything. VALID_TCA_FLAGS is set in stone
>>> not going to change anytime in future, right?
>>
>> Every time a new bit gets added VALID_TCA_FLAGS changes.
>
> You mean flag that user can set? If that is the case, you are breaking
> UAPI for newer app running on older kernel.
>

Ok, let me try to explain with more clarity. The rules Iam
trying to follow are:
if i see any bit set that i dont understand I will reject.

So lets in first kernel I have support for bit 0.
My validation check is to make sure only bit 0 is set.
The valid_flags currently then only constitutes bit 0.
i.e
If you set bit 2 or 3, the function above will reject and i return
the error to the user.

That is expected behavior correct?

3 months down the road:
I add two flags - bit 1 and 2.
So now my valid_flags changes to bits 1, 2 and 0.

The function above will now return true for bits 0-2 but
will reject if you set bit 3.

That is expected behavior, correct?

On u32/16/8:
I am choosing u32 so it allows me to add more upto 32 bit flags.
Not all 32 are needed today but it is better insurance.
If I used u8 then the 24 of those 32 bits i dont use will be used
as pads in the TLV. So it doesnt make sense for me to use a u8/16.

Does that make more sense?

cheers,
jamal
Jiri Pirko April 26, 2017, 6:19 a.m. UTC | #5
Tue, Apr 25, 2017 at 10:29:40PM CEST, jhs@mojatatu.com wrote:
>On 17-04-25 12:04 PM, Jiri Pirko wrote:
>> Tue, Apr 25, 2017 at 03:01:22PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-25 08:13 AM, Jiri Pirko wrote:
>> > > Tue, Apr 25, 2017 at 01:54:06PM CEST, jhs@mojatatu.com wrote:
>
>> > > > 
>> > > > +static inline bool tca_flags_valid(u32 act_flags)
>> > > > +{
>> > > > +	u32 invalid_flags_mask  = ~VALID_TCA_FLAGS;
>> > > > +
>> > > > +	if (act_flags & invalid_flags_mask)
>> > > > +		return false;
>> > > 
>> > > I don't see how this resolves anything. VALID_TCA_FLAGS is set in stone
>> > > not going to change anytime in future, right?
>> > 
>> > Every time a new bit gets added VALID_TCA_FLAGS changes.
>> 
>> You mean flag that user can set? If that is the case, you are breaking
>> UAPI for newer app running on older kernel.
>> 
>
>Ok, let me try to explain with more clarity. The rules Iam
>trying to follow are:
>if i see any bit set that i dont understand I will reject.
>
>So lets in first kernel I have support for bit 0.
>My validation check is to make sure only bit 0 is set.
>The valid_flags currently then only constitutes bit 0.
>i.e
>If you set bit 2 or 3, the function above will reject and i return
>the error to the user.
>
>That is expected behavior correct?
>
>3 months down the road:
>I add two flags - bit 1 and 2.
>So now my valid_flags changes to bits 1, 2 and 0.
>
>The function above will now return true for bits 0-2 but
>will reject if you set bit 3.
>
>That is expected behavior, correct?

The same app compiled against new kernel with bits (0, 1, 2) will run with
this kernel good. But if you run it with older kernel, the kernel (0)
would refuse. Is that ok?



>
>On u32/16/8:
>I am choosing u32 so it allows me to add more upto 32 bit flags.
>Not all 32 are needed today but it is better insurance.
>If I used u8 then the 24 of those 32 bits i dont use will be used
>as pads in the TLV. So it doesnt make sense for me to use a u8/16.

Jamal, note that I never suggested having more flags in a single attr.
Therefore I suggested u8 to carry a single flag.

You say that it has performance impact having 3 flag attrs in compare to
one bit flag attr. Could you please provide some numbers?

I expect that you will not be able to show the difference. And if there
is no difference in performance, your main argument goes away. And we
can do this in a nice, clear, TLV fashion.


>
>Does that make more sense?
>
>cheers,
>jamal
Simon Horman April 26, 2017, 11:02 a.m. UTC | #6
On Tue, Apr 25, 2017 at 06:04:45PM +0200, Jiri Pirko wrote:
> Tue, Apr 25, 2017 at 03:01:22PM CEST, jhs@mojatatu.com wrote:
> >On 17-04-25 08:13 AM, Jiri Pirko wrote:
> >> Tue, Apr 25, 2017 at 01:54:06PM CEST, jhs@mojatatu.com wrote:
> >
> >
> >[..]
> >
> >> > -#define TCAA_MAX 1
> >> > +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS
> >> > + *
> >> > + * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
> >> > + * actions in a dump. All dump responses will contain the number of actions
> >> > + * being dumped stored in for user app's consumption in TCA_ROOT_COUNT
> >> > + *
> >> > + */
> >> > +#define TCA_FLAG_LARGE_DUMP_ON		(1 << 0)
> >> 
> >> BIT (I think I mentioned this before)
> >> 
> >
> >You did - but i took it out about two submissions back (per cover
> >letter) because it is no part of UAPI today. I noticed devlink was
> >using it but they defined  their own variant.
> >So if i added this, iproute2 doesnt compile. I could fix iproute2
> >to move it somewhere to a common header then restore this.
> 
> So fix iproute2. It is always first kernel, then iproute2.

Perhaps I am missing the point or somehow misguided but I would expect that
if the UAPI uses BIT() it also provides BIT().

...
Simon Horman April 26, 2017, 11:07 a.m. UTC | #7
On Wed, Apr 26, 2017 at 08:19:04AM +0200, Jiri Pirko wrote:
> Tue, Apr 25, 2017 at 10:29:40PM CEST, jhs@mojatatu.com wrote:

...

> >So lets in first kernel I have support for bit 0.
> >My validation check is to make sure only bit 0 is set.
> >The valid_flags currently then only constitutes bit 0.
> >i.e
> >If you set bit 2 or 3, the function above will reject and i return
> >the error to the user.
> >
> >That is expected behavior correct?
> >
> >3 months down the road:
> >I add two flags - bit 1 and 2.
> >So now my valid_flags changes to bits 1, 2 and 0.
> >
> >The function above will now return true for bits 0-2 but
> >will reject if you set bit 3.
> >
> >That is expected behavior, correct?
> 
> The same app compiled against new kernel with bits (0, 1, 2) will run with
> this kernel good. But if you run it with older kernel, the kernel (0)
> would refuse. Is that ok?

Conversely, if an app is compiled against a new kernel and uses ATTR0, ATTR1
and ATTR2 all will be well. But if you run it against the older kernel
ATTR1 and ATTR2 will be silently ignored. I believe that is how its always
been but is that ok?

> >On u32/16/8:
> >I am choosing u32 so it allows me to add more upto 32 bit flags.
> >Not all 32 are needed today but it is better insurance.
> >If I used u8 then the 24 of those 32 bits i dont use will be used
> >as pads in the TLV. So it doesnt make sense for me to use a u8/16.
> 
> Jamal, note that I never suggested having more flags in a single attr.
> Therefore I suggested u8 to carry a single flag.
> 
> You say that it has performance impact having 3 flag attrs in compare to
> one bit flag attr. Could you please provide some numbers?
> 
> I expect that you will not be able to show the difference. And if there
> is no difference in performance, your main argument goes away. And we
> can do this in a nice, clear, TLV fashion.
Jamal Hadi Salim April 26, 2017, 11:48 a.m. UTC | #8
On 17-04-26 02:19 AM, Jiri Pirko wrote:
> Tue, Apr 25, 2017 at 10:29:40PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-25 12:04 PM, Jiri Pirko wrote:
[..]
>> That is expected behavior correct?
>>
>> 3 months down the road:
>> I add two flags - bit 1 and 2.
>> So now my valid_flags changes to bits 1, 2 and 0.
>>
>> The function above will now return true for bits 0-2 but
>> will reject if you set bit 3.
>>
>> That is expected behavior, correct?
>
> The same app compiled against new kernel with bits (0, 1, 2) will run with
> this kernel good. But if you run it with older kernel, the kernel (0)
> would refuse. Is that ok?
>


Dave said that is what has to be done.
To quote from the cover letter:

--------- START QUOTE -------------
changes since v6:
-----------------

1) DaveM:
New rules for netlink messages. From now on we are going to start
checking for bits that are not used and rejecting anything we dont
understand. In the future this is going to require major changes
to user space code (tc etc). This is just a start.

To quote, David:
"
  Again, bits you aren't using now, make sure userspace doesn't
    set them.  And if it does, reject.
"

---------- END QUOTE -----------

I am going to send the patches - if you dont like this then speak up and
David needs to be convinced.  This is UAPI - once patches are in it is
cast in stone and I dont mind a discussion to make sure we get it right.

> Jamal, note that I never suggested having more flags in a single attr.
> Therefore I suggested u8 to carry a single flag.
>

Jiri, thats our main difference unless I am misunderstanding you.

I believe you should squeeze as many as you can in one single attribute.
You believe you should have only one flag per attribute.

Aesthetically a u8 looks good. Performance wise it is bad when you
have many entries to deal with.


> You say that it has performance impact having 3 flag attrs in compare to
> one bit flag attr. Could you please provide some numbers?
>

I have experience with dealing with a massive amount of various dumps
and (batch) sets and it always boils down to one thing:
_how much data is exchanged between user and kernel_
3 flags encoded as bits in a u32 attribute cost 64 bits.
Encoded separately cost 3x that.

Believe me, it _does make a difference_ in performance.

My least favorite subsystem is bridge. The bridge code has
tons of flags in those entries that are sent to/from kernel as u8
attributes. It is painful.

For something more recent, lets look at this commit from Ben on Flower:
+       TCA_FLOWER_KEY_MPLS_TTL,        /* u8 - 8 bits */
+       TCA_FLOWER_KEY_MPLS_BOS,        /* u8 - 1 bit */
+       TCA_FLOWER_KEY_MPLS_TC,         /* u8 - 3 bits */
+       TCA_FLOWER_KEY_MPLS_LABEL,      /* be32 - 20 bits */

Yes, that looks pretty, but:
That would have fit in one attribute with a u32. Mask attributes would
be eliminated with a second 32 bit - all in the same singular
attribute.

Imagine if i have 1M flower entries. If you add up the mask the cost
of these things is about 3*2*64 bits more per entry compared to putting
the mpls info/mask in one attribute.
At 1M entries that is a few MBs of data being exchanged.

> I expect that you will not be able to show the difference. And if there
> is no difference in performance, your main argument goes away. And we
> can do this in a nice, clear, TLV fashion.
>

I love TLVs Jiri. But there is a difference between management and
control. The former cares more about humans the later needs to get shit
done faster. The extreme version of the later is using json. But you
try to get the json guy to do setting or dumping 1M entries and you
can take a long distance trip and come back and they are not done.

I want to use TLVs but plan for optimization/performance as well.

cheers,
jamal
Jiri Pirko April 26, 2017, 12:08 p.m. UTC | #9
Wed, Apr 26, 2017 at 01:48:29PM CEST, jhs@mojatatu.com wrote:
>On 17-04-26 02:19 AM, Jiri Pirko wrote:
>> Tue, Apr 25, 2017 at 10:29:40PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-25 12:04 PM, Jiri Pirko wrote:
>[..]
>> > That is expected behavior correct?
>> > 
>> > 3 months down the road:
>> > I add two flags - bit 1 and 2.
>> > So now my valid_flags changes to bits 1, 2 and 0.
>> > 
>> > The function above will now return true for bits 0-2 but
>> > will reject if you set bit 3.
>> > 
>> > That is expected behavior, correct?
>> 
>> The same app compiled against new kernel with bits (0, 1, 2) will run with
>> this kernel good. But if you run it with older kernel, the kernel (0)
>> would refuse. Is that ok?
>> 
>
>
>Dave said that is what has to be done.
>To quote from the cover letter:
>
>--------- START QUOTE -------------
>changes since v6:
>-----------------
>
>1) DaveM:
>New rules for netlink messages. From now on we are going to start
>checking for bits that are not used and rejecting anything we dont
>understand. In the future this is going to require major changes
>to user space code (tc etc). This is just a start.
>
>To quote, David:
>"
> Again, bits you aren't using now, make sure userspace doesn't
>   set them.  And if it does, reject.
>"
>
>---------- END QUOTE -----------
>
>I am going to send the patches - if you dont like this then speak up and
>David needs to be convinced.  This is UAPI - once patches are in it is
>cast in stone and I dont mind a discussion to make sure we get it right.
>
>> Jamal, note that I never suggested having more flags in a single attr.
>> Therefore I suggested u8 to carry a single flag.
>> 
>
>Jiri, thats our main difference unless I am misunderstanding you.
>
>I believe you should squeeze as many as you can in one single attribute.
>You believe you should have only one flag per attribute.
>
>Aesthetically a u8 looks good. Performance wise it is bad when you
>have many entries to deal with.
>
>
>> You say that it has performance impact having 3 flag attrs in compare to
>> one bit flag attr. Could you please provide some numbers?
>> 
>
>I have experience with dealing with a massive amount of various dumps
>and (batch) sets and it always boils down to one thing:
>_how much data is exchanged between user and kernel_
>3 flags encoded as bits in a u32 attribute cost 64 bits.
>Encoded separately cost 3x that.
>
>Believe me, it _does make a difference_ in performance.
>
>My least favorite subsystem is bridge. The bridge code has
>tons of flags in those entries that are sent to/from kernel as u8
>attributes. It is painful.
>
>For something more recent, lets look at this commit from Ben on Flower:
>+       TCA_FLOWER_KEY_MPLS_TTL,        /* u8 - 8 bits */
>+       TCA_FLOWER_KEY_MPLS_BOS,        /* u8 - 1 bit */
>+       TCA_FLOWER_KEY_MPLS_TC,         /* u8 - 3 bits */
>+       TCA_FLOWER_KEY_MPLS_LABEL,      /* be32 - 20 bits */
>
>Yes, that looks pretty, but:
>That would have fit in one attribute with a u32. Mask attributes would
>be eliminated with a second 32 bit - all in the same singular
>attribute.
>
>Imagine if i have 1M flower entries. If you add up the mask the cost
>of these things is about 3*2*64 bits more per entry compared to putting
>the mpls info/mask in one attribute.
>At 1M entries that is a few MBs of data being exchanged.

I can do the math :) Yet still, I would like to see the numbers :)
Because I believe that is the only way to end this lenghty converstation
once and forever...


>
>> I expect that you will not be able to show the difference. And if there
>> is no difference in performance, your main argument goes away. And we
>> can do this in a nice, clear, TLV fashion.
>> 
>
>I love TLVs Jiri. But there is a difference between management and
>control. The former cares more about humans the later needs to get shit
>done faster. The extreme version of the later is using json. But you
>try to get the json guy to do setting or dumping 1M entries and you
>can take a long distance trip and come back and they are not done.
>
>I want to use TLVs but plan for optimization/performance as well.
>
>cheers,
>jamal
>
>
>
Jamal Hadi Salim April 26, 2017, 12:46 p.m. UTC | #10
On 17-04-26 07:02 AM, Simon Horman wrote:
> On Tue, Apr 25, 2017 at 06:04:45PM +0200, Jiri Pirko wrote:
[..]

>> So fix iproute2. It is always first kernel, then iproute2.
>
> Perhaps I am missing the point or somehow misguided but I would expect that
> if the UAPI uses BIT() it also provides BIT().

There is a user of BIT() already in iproute2 (devlink). We can move
the code to be more generally available for other iproute2 users.
Then this UAPI change makes use of it.

cheers,
jamal
Jamal Hadi Salim April 26, 2017, 1 p.m. UTC | #11
On 17-04-26 07:07 AM, Simon Horman wrote:
> On Wed, Apr 26, 2017 at 08:19:04AM +0200, Jiri Pirko wrote:
>> Tue, Apr 25, 2017 at 10:29:40PM CEST, jhs@mojatatu.com wrote:
>
> ...
>
>>> So lets in first kernel I have support for bit 0.
>>> My validation check is to make sure only bit 0 is set.
>>> The valid_flags currently then only constitutes bit 0.
>>> i.e
>>> If you set bit 2 or 3, the function above will reject and i return
>>> the error to the user.
>>>
>>> That is expected behavior correct?
>>>
>>> 3 months down the road:
>>> I add two flags - bit 1 and 2.
>>> So now my valid_flags changes to bits 1, 2 and 0.
>>>
>>> The function above will now return true for bits 0-2 but
>>> will reject if you set bit 3.
>>>
>>> That is expected behavior, correct?
>>
>> The same app compiled against new kernel with bits (0, 1, 2) will run with
>> this kernel good. But if you run it with older kernel, the kernel (0)
>> would refuse. Is that ok?
>
> Conversely, if an app is compiled against a new kernel and uses ATTR0, ATTR1
> and ATTR2 all will be well. But if you run it against the older kernel
> ATTR1 and ATTR2 will be silently ignored. I believe that is how its always
> been but is that ok?
>

I think the answer is much complex than ok/notok.

If i have bits that when not supported by the kernel would result in
bad operations then of course kernel ignoring their presence is a very
bad thing (Dave's example is "I want you to encrypt" which an old
kernel wont understand).
There's also a security concern where flags are being set and are being
randomly accepted by old kernels resulting in root access etc.

The other side of the coin, if I really dont care if you dont understand
the extra bits I have set i would like you to do what you can.
Thats status quo upto now.

So my overall view:
Ignoring how it used to work is a revolution not an evolution.
There will be a blood bath with existing apps before the new norm comes
into proper effect.

cheers,
jamal
Jiri Pirko April 26, 2017, 1:05 p.m. UTC | #12
Wed, Apr 26, 2017 at 02:46:22PM CEST, jhs@mojatatu.com wrote:
>On 17-04-26 07:02 AM, Simon Horman wrote:
>> On Tue, Apr 25, 2017 at 06:04:45PM +0200, Jiri Pirko wrote:
>[..]
>
>> > So fix iproute2. It is always first kernel, then iproute2.
>> 
>> Perhaps I am missing the point or somehow misguided but I would expect that
>> if the UAPI uses BIT() it also provides BIT().
>
>There is a user of BIT() already in iproute2 (devlink). We can move
>the code to be more generally available for other iproute2 users.
>Then this UAPI change makes use of it.

Should be part of UAPI as well
I see that include/uapi/rdma/vmw_pvrdma-abi.h is using BIT macro.
I don't see BIT macro defined in UAPI (I thought it is). So either
define it there (not sure where) or just use "<<"
Jamal Hadi Salim April 26, 2017, 1:14 p.m. UTC | #13
On 17-04-26 08:08 AM, Jiri Pirko wrote:
> Wed, Apr 26, 2017 at 01:48:29PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-26 02:19 AM, Jiri Pirko wrote:
>>> Tue, Apr 25, 2017 at 10:29:40PM CEST, jhs@mojatatu.com wrote:
>>>> On 17-04-25 12:04 PM, Jiri Pirko wrote:

>> I have experience with dealing with a massive amount of various dumps
>> and (batch) sets and it always boils down to one thing:
>> _how much data is exchanged between user and kernel_
>> 3 flags encoded as bits in a u32 attribute cost 64 bits.
>> Encoded separately cost 3x that.
>>
>> Believe me, it _does make a difference_ in performance.
>>
>> My least favorite subsystem is bridge. The bridge code has
>> tons of flags in those entries that are sent to/from kernel as u8
>> attributes. It is painful.
>>
>> For something more recent, lets look at this commit from Ben on Flower:
>> +       TCA_FLOWER_KEY_MPLS_TTL,        /* u8 - 8 bits */
>> +       TCA_FLOWER_KEY_MPLS_BOS,        /* u8 - 1 bit */
>> +       TCA_FLOWER_KEY_MPLS_TC,         /* u8 - 3 bits */
>> +       TCA_FLOWER_KEY_MPLS_LABEL,      /* be32 - 20 bits */
>>
>> Yes, that looks pretty, but:
>> That would have fit in one attribute with a u32. Mask attributes would
>> be eliminated with a second 32 bit - all in the same singular
>> attribute.
>>
>> Imagine if i have 1M flower entries. If you add up the mask the cost
>> of these things is about 3*2*64 bits more per entry compared to putting
>> the mpls info/mask in one attribute.
>> At 1M entries that is a few MBs of data being exchanged.
>
> I can do the math :) Yet still, I would like to see the numbers :)
> Because I believe that is the only way to end this lenghty converstation
> once and forever...
>

Jiri, what are you arguing about if you have done the math? ;->
You want me to show you that getting or setting less data is good for
performance?
Look at the third patch: Why do i think it is necessary to send only
actions that have changed? Precisely to reduce the amount of data
being transported. The second patch - to reduce the amount of crossing
user space to kernel space (which is going to happen more with increased
data I have to transport between the user and the kernel).

Again: You are looking at this from a manageability point of view which
is useful but not the only input into a design. If i can squeeze more
data without killing usability - I am all for it. It just doesnt
compute that it is ok to use a flag per attribute because it looks
beautiful.

cheers,
jamal
Jiri Pirko April 26, 2017, 1:56 p.m. UTC | #14
Wed, Apr 26, 2017 at 03:14:38PM CEST, jhs@mojatatu.com wrote:
>On 17-04-26 08:08 AM, Jiri Pirko wrote:
>> Wed, Apr 26, 2017 at 01:48:29PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-26 02:19 AM, Jiri Pirko wrote:
>> > > Tue, Apr 25, 2017 at 10:29:40PM CEST, jhs@mojatatu.com wrote:
>> > > > On 17-04-25 12:04 PM, Jiri Pirko wrote:
>
>> > I have experience with dealing with a massive amount of various dumps
>> > and (batch) sets and it always boils down to one thing:
>> > _how much data is exchanged between user and kernel_
>> > 3 flags encoded as bits in a u32 attribute cost 64 bits.
>> > Encoded separately cost 3x that.
>> > 
>> > Believe me, it _does make a difference_ in performance.
>> > 
>> > My least favorite subsystem is bridge. The bridge code has
>> > tons of flags in those entries that are sent to/from kernel as u8
>> > attributes. It is painful.
>> > 
>> > For something more recent, lets look at this commit from Ben on Flower:
>> > +       TCA_FLOWER_KEY_MPLS_TTL,        /* u8 - 8 bits */
>> > +       TCA_FLOWER_KEY_MPLS_BOS,        /* u8 - 1 bit */
>> > +       TCA_FLOWER_KEY_MPLS_TC,         /* u8 - 3 bits */
>> > +       TCA_FLOWER_KEY_MPLS_LABEL,      /* be32 - 20 bits */
>> > 
>> > Yes, that looks pretty, but:
>> > That would have fit in one attribute with a u32. Mask attributes would
>> > be eliminated with a second 32 bit - all in the same singular
>> > attribute.
>> > 
>> > Imagine if i have 1M flower entries. If you add up the mask the cost
>> > of these things is about 3*2*64 bits more per entry compared to putting
>> > the mpls info/mask in one attribute.
>> > At 1M entries that is a few MBs of data being exchanged.
>> 
>> I can do the math :) Yet still, I would like to see the numbers :)
>> Because I believe that is the only way to end this lenghty converstation
>> once and forever...
>> 
>
>Jiri, what are you arguing about if you have done the math? ;->

I can do 3*2*64. What I cannot do is to figure out the real performance
impact.


>You want me to show you that getting or setting less data is good for
>performance?
>Look at the third patch: Why do i think it is necessary to send only
>actions that have changed? Precisely to reduce the amount of data
>being transported. The second patch - to reduce the amount of crossing
>user space to kernel space (which is going to happen more with increased
>data I have to transport between the user and the kernel).
>
>Again: You are looking at this from a manageability point of view which
>is useful but not the only input into a design. If i can squeeze more
>data without killing usability - I am all for it. It just doesnt
>compute that it is ok to use a flag per attribute because it looks
>beautiful.

Hmm. Now that I'm thinking about it, why don't we have NLA_FLAGS with
couple of helpers around it? It will be obvious what the attr is, all
kernel code would use the same helpers. Would be nice.
David Miller April 26, 2017, 2:46 p.m. UTC | #15
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 26 Apr 2017 15:05:06 +0200

> Wed, Apr 26, 2017 at 02:46:22PM CEST, jhs@mojatatu.com wrote:
>>On 17-04-26 07:02 AM, Simon Horman wrote:
>>> On Tue, Apr 25, 2017 at 06:04:45PM +0200, Jiri Pirko wrote:
>>[..]
>>
>>> > So fix iproute2. It is always first kernel, then iproute2.
>>> 
>>> Perhaps I am missing the point or somehow misguided but I would expect that
>>> if the UAPI uses BIT() it also provides BIT().
>>
>>There is a user of BIT() already in iproute2 (devlink). We can move
>>the code to be more generally available for other iproute2 users.
>>Then this UAPI change makes use of it.
> 
> Should be part of UAPI as well
> I see that include/uapi/rdma/vmw_pvrdma-abi.h is using BIT macro.
> I don't see BIT macro defined in UAPI (I thought it is). So either
> define it there (not sure where) or just use "<<"

"BIT" is a pretty crazy small simple name to pollute into the global
namespace, IMHO.
Jiri Pirko April 26, 2017, 2:58 p.m. UTC | #16
Wed, Apr 26, 2017 at 04:46:58PM CEST, davem@davemloft.net wrote:
>From: Jiri Pirko <jiri@resnulli.us>
>Date: Wed, 26 Apr 2017 15:05:06 +0200
>
>> Wed, Apr 26, 2017 at 02:46:22PM CEST, jhs@mojatatu.com wrote:
>>>On 17-04-26 07:02 AM, Simon Horman wrote:
>>>> On Tue, Apr 25, 2017 at 06:04:45PM +0200, Jiri Pirko wrote:
>>>[..]
>>>
>>>> > So fix iproute2. It is always first kernel, then iproute2.
>>>> 
>>>> Perhaps I am missing the point or somehow misguided but I would expect that
>>>> if the UAPI uses BIT() it also provides BIT().
>>>
>>>There is a user of BIT() already in iproute2 (devlink). We can move
>>>the code to be more generally available for other iproute2 users.
>>>Then this UAPI change makes use of it.
>> 
>> Should be part of UAPI as well
>> I see that include/uapi/rdma/vmw_pvrdma-abi.h is using BIT macro.
>> I don't see BIT macro defined in UAPI (I thought it is). So either
>> define it there (not sure where) or just use "<<"
>
>"BIT" is a pretty crazy small simple name to pollute into the global
>namespace, IMHO.

Btw, this is also something resolvable nicely if we have NLA_FLAGS
netlink attribute type. We can have some helper in UAPI like:

#define TCA_FLAG_LARGE_DUMP_ON	NLA_FLAGS_F(0)
Jamal Hadi Salim April 26, 2017, 8:07 p.m. UTC | #17
On 17-04-26 09:56 AM, Jiri Pirko wrote:
> Wed, Apr 26, 2017 at 03:14:38PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-26 08:08 AM, Jiri Pirko wrote:

[..]

>> Jiri, what are you arguing about if you have done the math? ;->
>
> I can do 3*2*64. What I cannot do is to figure out the real performance
> impact.
>

Jiri, I do a lot of very large data dumping and setting towards the
kernel. You know that. It is why I even have these patches to begin
with.

The math should be convincing enough.
48B per rule extra for just MPLS in a filter rule. I havent started
testing the overhead of flower but i do plan to use it - with about a
million rules for offloading. I will give you the numbers then.

I think we are at a stalemate.
You are not going to convince me to use an attribute with a
u8 for a bit flag when I can fit 32 of them in one attribute (with
the same cost). And I am not able to convince you that you are
wrong to put beauty first.

>> Again: You are looking at this from a manageability point of view which
>> is useful but not the only input into a design. If i can squeeze more
>> data without killing usability - I am all for it. It just doesnt
>> compute that it is ok to use a flag per attribute because it looks
>> beautiful.
>
> Hmm. Now that I'm thinking about it, why don't we have NLA_FLAGS with
> couple of helpers around it? It will be obvious what the attr is, all
> kernel code would use the same helpers. Would be nice.
>

I think to have flags at that level is useful but it
is a different hierarchy level. I am not sure the
"actions dump large messages" is a fit for that level.

cheers,
jamal
Jiri Pirko April 27, 2017, 6:30 a.m. UTC | #18
Wed, Apr 26, 2017 at 10:07:08PM CEST, jhs@mojatatu.com wrote:
>On 17-04-26 09:56 AM, Jiri Pirko wrote:
>> Wed, Apr 26, 2017 at 03:14:38PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-26 08:08 AM, Jiri Pirko wrote:
>
>[..]
>

[...]

>
>> > Again: You are looking at this from a manageability point of view which
>> > is useful but not the only input into a design. If i can squeeze more
>> > data without killing usability - I am all for it. It just doesnt
>> > compute that it is ok to use a flag per attribute because it looks
>> > beautiful.
>> 
>> Hmm. Now that I'm thinking about it, why don't we have NLA_FLAGS with
>> couple of helpers around it? It will be obvious what the attr is, all
>> kernel code would use the same helpers. Would be nice.
>> 
>
>I think to have flags at that level is useful but it
>is a different hierarchy level. I am not sure the
>"actions dump large messages" is a fit for that level.

Jamal, the idea is to have exactly what you want to have. Only does not
use NLA_U32 attr for that but a special attr NLA_FLAGS which would have
well defined semantics and set of helpers to work with and enforce it.

Then, this could be easily reused in other subsystem that uses netlink
Jamal Hadi Salim April 28, 2017, 1:22 a.m. UTC | #19
On 17-04-27 02:30 AM, Jiri Pirko wrote:
> Wed, Apr 26, 2017 at 10:07:08PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-26 09:56 AM, Jiri Pirko wrote:
>>> Wed, Apr 26, 2017 at 03:14:38PM CEST, jhs@mojatatu.com wrote:

>> I think to have flags at that level is useful but it
>> is a different hierarchy level. I am not sure the
>> "actions dump large messages" is a fit for that level.
>
> Jamal, the idea is to have exactly what you want to have. Only does not
> use NLA_U32 attr for that but a special attr NLA_FLAGS which would have
> well defined semantics and set of helpers to work with and enforce it.
>
> Then, this could be easily reused in other subsystem that uses netlink
>

Maybe I am misunderstanding:
Recall, this is what it looks like with this patchset:
<nlh><subsytem-header>[TCA_ROOT_XXXX]

TCA_ROOT_XXX is very subsystem specific. classifiers, qdiscs and many
subsystems defined their own semantics for that TLV level. This specific
"dump max" is very very specific to actions. They were crippled by the
fact you could only send 32 at a time - this allows more to be sent.

I thought initially you meant:
<nlh>[NLA_XXX]<subsytem-header>[TCA_ROOT_XXXX]

I think at the NLA_XXX you could fit netlink wide TLVs - but if i said
"do a large dump" it is of no use to any other subsystem.

cheers,
jamal
Jiri Pirko April 28, 2017, 7:02 a.m. UTC | #20
Fri, Apr 28, 2017 at 03:22:53AM CEST, jhs@mojatatu.com wrote:
>On 17-04-27 02:30 AM, Jiri Pirko wrote:
>> Wed, Apr 26, 2017 at 10:07:08PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-26 09:56 AM, Jiri Pirko wrote:
>> > > Wed, Apr 26, 2017 at 03:14:38PM CEST, jhs@mojatatu.com wrote:
>
>> > I think to have flags at that level is useful but it
>> > is a different hierarchy level. I am not sure the
>> > "actions dump large messages" is a fit for that level.
>> 
>> Jamal, the idea is to have exactly what you want to have. Only does not
>> use NLA_U32 attr for that but a special attr NLA_FLAGS which would have
>> well defined semantics and set of helpers to work with and enforce it.
>> 
>> Then, this could be easily reused in other subsystem that uses netlink
>> 
>
>Maybe I am misunderstanding:
>Recall, this is what it looks like with this patchset:
><nlh><subsytem-header>[TCA_ROOT_XXXX]
>
>TCA_ROOT_XXX is very subsystem specific. classifiers, qdiscs and many
>subsystems defined their own semantics for that TLV level. This specific
>"dump max" is very very specific to actions. They were crippled by the
>fact you could only send 32 at a time - this allows more to be sent.
>
>I thought initially you meant:
><nlh>[NLA_XXX]<subsytem-header>[TCA_ROOT_XXXX]
>
>I think at the NLA_XXX you could fit netlink wide TLVs - but if i said
>"do a large dump" it is of no use to any other subsystem.

Okay, I'm sorry, I had couple of beers yesterday so that might be
the cause why your msg makes me totally confused :O

All I suggest is to replace NLA_U32 flags you want that does not
have any semantics with NLA_FLAGS flags, which eventually will carry
the exact same u32, but with predefined semantics, helpers, everything.
Simon Horman April 28, 2017, 12:21 p.m. UTC | #21
On Wed, Apr 26, 2017 at 10:46:58AM -0400, David Miller wrote:
> From: Jiri Pirko <jiri@resnulli.us>
> Date: Wed, 26 Apr 2017 15:05:06 +0200
> 
> > Wed, Apr 26, 2017 at 02:46:22PM CEST, jhs@mojatatu.com wrote:
> >>On 17-04-26 07:02 AM, Simon Horman wrote:
> >>> On Tue, Apr 25, 2017 at 06:04:45PM +0200, Jiri Pirko wrote:
> >>[..]
> >>
> >>> > So fix iproute2. It is always first kernel, then iproute2.
> >>> 
> >>> Perhaps I am missing the point or somehow misguided but I would expect that
> >>> if the UAPI uses BIT() it also provides BIT().
> >>
> >>There is a user of BIT() already in iproute2 (devlink). We can move
> >>the code to be more generally available for other iproute2 users.
> >>Then this UAPI change makes use of it.
> > 
> > Should be part of UAPI as well
> > I see that include/uapi/rdma/vmw_pvrdma-abi.h is using BIT macro.
> > I don't see BIT macro defined in UAPI (I thought it is). So either
> > define it there (not sure where) or just use "<<"
> 
> "BIT" is a pretty crazy small simple name to pollute into the global
> namespace, IMHO.

It sounds to me that it would be best to just use "<<" rather than
spending cycles posturing on how to add it to the UAPI. Existing users
of BIT in the UAPI could also be updated to use "<<" to avoid having
a misleading precedence in-tree.
Jamal Hadi Salim April 28, 2017, 12:30 p.m. UTC | #22
On 17-04-28 03:02 AM, Jiri Pirko wrote:
> Fri, Apr 28, 2017 at 03:22:53AM CEST, jhs@mojatatu.com wrote:

[..]
>> Maybe I am misunderstanding:
>> Recall, this is what it looks like with this patchset:
>> <nlh><subsytem-header>[TCA_ROOT_XXXX]
>>
>> TCA_ROOT_XXX is very subsystem specific. classifiers, qdiscs and many
>> subsystems defined their own semantics for that TLV level. This specific
>> "dump max" is very very specific to actions. They were crippled by the
>> fact you could only send 32 at a time - this allows more to be sent.
>>
>> I thought initially you meant:
>> <nlh>[NLA_XXX]<subsytem-header>[TCA_ROOT_XXXX]
>>
>> I think at the NLA_XXX you could fit netlink wide TLVs - but if i said
>> "do a large dump" it is of no use to any other subsystem.
>
> Okay, I'm sorry, I had couple of beers yesterday so that might be
> the cause why your msg makes me totally confused :O
>
> All I suggest is to replace NLA_U32 flags you want that does not
> have any semantics with NLA_FLAGS flags, which eventually will carry
> the exact same u32, but with predefined semantics, helpers, everything.
>

I didnt understand fully Jiri. Are you suggesting a new type called
NLA_FLAGS which is re-usable elsewhere?

cheers,
jamal
Jamal Hadi Salim April 28, 2017, 12:55 p.m. UTC | #23
On 17-04-28 08:21 AM, Simon Horman wrote:
> On Wed, Apr 26, 2017 at 10:46:58AM -0400, David Miller wrote:
>> From: Jiri Pirko <jiri@resnulli.us>
>> Date: Wed, 26 Apr 2017 15:05:06 +0200
>>

[..]
>>> Should be part of UAPI as well
>>> I see that include/uapi/rdma/vmw_pvrdma-abi.h is using BIT macro.
>>> I don't see BIT macro defined in UAPI (I thought it is). So either
>>> define it there (not sure where) or just use "<<"
>>
>> "BIT" is a pretty crazy small simple name to pollute into the global
>> namespace, IMHO.
>
> It sounds to me that it would be best to just use "<<" rather than
> spending cycles posturing on how to add it to the UAPI. Existing users
> of BIT in the UAPI could also be updated to use "<<" to avoid having
> a misleading precedence in-tree.
>

We need to convince Jiri ;-> He has plans to do a lot of cleanups and
I feel like I am pioneering (A lot of new things on my patchset).
Jiri, could this come in your cleanups later?

cheers,
jamal
Jiri Pirko April 28, 2017, 1:21 p.m. UTC | #24
Fri, Apr 28, 2017 at 02:30:17PM CEST, jhs@mojatatu.com wrote:
>On 17-04-28 03:02 AM, Jiri Pirko wrote:
>> Fri, Apr 28, 2017 at 03:22:53AM CEST, jhs@mojatatu.com wrote:
>
>[..]
>> > Maybe I am misunderstanding:
>> > Recall, this is what it looks like with this patchset:
>> > <nlh><subsytem-header>[TCA_ROOT_XXXX]
>> > 
>> > TCA_ROOT_XXX is very subsystem specific. classifiers, qdiscs and many
>> > subsystems defined their own semantics for that TLV level. This specific
>> > "dump max" is very very specific to actions. They were crippled by the
>> > fact you could only send 32 at a time - this allows more to be sent.
>> > 
>> > I thought initially you meant:
>> > <nlh>[NLA_XXX]<subsytem-header>[TCA_ROOT_XXXX]
>> > 
>> > I think at the NLA_XXX you could fit netlink wide TLVs - but if i said
>> > "do a large dump" it is of no use to any other subsystem.
>> 
>> Okay, I'm sorry, I had couple of beers yesterday so that might be
>> the cause why your msg makes me totally confused :O
>> 
>> All I suggest is to replace NLA_U32 flags you want that does not
>> have any semantics with NLA_FLAGS flags, which eventually will carry
>> the exact same u32, but with predefined semantics, helpers, everything.
>> 
>
>I didnt understand fully Jiri. Are you suggesting a new type called
>NLA_FLAGS which is re-usable elsewhere?

Exactly. That's what I'm saying.
Jiri Pirko April 28, 2017, 1:21 p.m. UTC | #25
Fri, Apr 28, 2017 at 02:55:40PM CEST, jhs@mojatatu.com wrote:
>On 17-04-28 08:21 AM, Simon Horman wrote:
>> On Wed, Apr 26, 2017 at 10:46:58AM -0400, David Miller wrote:
>> > From: Jiri Pirko <jiri@resnulli.us>
>> > Date: Wed, 26 Apr 2017 15:05:06 +0200
>> > 
>
>[..]
>> > > Should be part of UAPI as well
>> > > I see that include/uapi/rdma/vmw_pvrdma-abi.h is using BIT macro.
>> > > I don't see BIT macro defined in UAPI (I thought it is). So either
>> > > define it there (not sure where) or just use "<<"
>> > 
>> > "BIT" is a pretty crazy small simple name to pollute into the global
>> > namespace, IMHO.
>> 
>> It sounds to me that it would be best to just use "<<" rather than
>> spending cycles posturing on how to add it to the UAPI. Existing users
>> of BIT in the UAPI could also be updated to use "<<" to avoid having
>> a misleading precedence in-tree.
>> 
>
>We need to convince Jiri ;-> He has plans to do a lot of cleanups and
>I feel like I am pioneering (A lot of new things on my patchset).
>Jiri, could this come in your cleanups later?
 
Sure.

>
>cheers,
>jamal
Jamal Hadi Salim April 28, 2017, 1:42 p.m. UTC | #26
On 17-04-28 09:21 AM, Jiri Pirko wrote:
> Fri, Apr 28, 2017 at 02:30:17PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-28 03:02 AM, Jiri Pirko wrote:
>>> Fri, Apr 28, 2017 at 03:22:53AM CEST, jhs@mojatatu.com wrote:
>>
>> [..]
>>>> Maybe I am misunderstanding:
>>>> Recall, this is what it looks like with this patchset:
>>>> <nlh><subsytem-header>[TCA_ROOT_XXXX]
>>>>
>>>> TCA_ROOT_XXX is very subsystem specific. classifiers, qdiscs and many
>>>> subsystems defined their own semantics for that TLV level. This specific
>>>> "dump max" is very very specific to actions. They were crippled by the
>>>> fact you could only send 32 at a time - this allows more to be sent.

>>>
>>> All I suggest is to replace NLA_U32 flags you want that does not
>>> have any semantics with NLA_FLAGS flags, which eventually will carry
>>> the exact same u32, but with predefined semantics, helpers, everything.
>>>
>>
>> I didnt understand fully Jiri. Are you suggesting a new type called
>> NLA_FLAGS which is re-usable elsewhere?
>
> Exactly. That's what I'm saying.
>

If you want to make it general:
I see the semantics of this thing as more detailed than what I had.
It would have a u32 bitmap + u32 bitmask.

cheers,
jamal
Jiri Pirko April 28, 2017, 2:02 p.m. UTC | #27
Fri, Apr 28, 2017 at 03:42:31PM CEST, jhs@mojatatu.com wrote:
>On 17-04-28 09:21 AM, Jiri Pirko wrote:
>> Fri, Apr 28, 2017 at 02:30:17PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-28 03:02 AM, Jiri Pirko wrote:
>> > > Fri, Apr 28, 2017 at 03:22:53AM CEST, jhs@mojatatu.com wrote:
>> > 
>> > [..]
>> > > > Maybe I am misunderstanding:
>> > > > Recall, this is what it looks like with this patchset:
>> > > > <nlh><subsytem-header>[TCA_ROOT_XXXX]
>> > > > 
>> > > > TCA_ROOT_XXX is very subsystem specific. classifiers, qdiscs and many
>> > > > subsystems defined their own semantics for that TLV level. This specific
>> > > > "dump max" is very very specific to actions. They were crippled by the
>> > > > fact you could only send 32 at a time - this allows more to be sent.
>
>> > > 
>> > > All I suggest is to replace NLA_U32 flags you want that does not
>> > > have any semantics with NLA_FLAGS flags, which eventually will carry
>> > > the exact same u32, but with predefined semantics, helpers, everything.
>> > > 
>> > 
>> > I didnt understand fully Jiri. Are you suggesting a new type called
>> > NLA_FLAGS which is re-usable elsewhere?
>> 
>> Exactly. That's what I'm saying.
>> 
>
>If you want to make it general:
>I see the semantics of this thing as more detailed than what I had.
>It would have a u32 bitmap + u32 bitmask.

Sure, lets make this nice.
Jamal Hadi Salim April 30, 2017, 10:34 a.m. UTC | #28
On 17-04-28 09:21 AM, Jiri Pirko wrote:
> Fri, Apr 28, 2017 at 02:30:17PM CEST, jhs@mojatatu.com wrote:
[..]
>> I didnt understand fully Jiri. Are you suggesting a new type called
>> NLA_FLAGS which is re-usable elsewhere?
>
> Exactly. That's what I'm saying.
>

Okay, I will post something.

cheers,
jamal
diff mbox

Patch

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cce0613..5875467 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -674,10 +674,28 @@  struct tcamsg {
 	unsigned char	tca__pad1;
 	unsigned short	tca__pad2;
 };
+
+enum {
+	TCA_ROOT_UNSPEC,
+	TCA_ROOT_TAB,
+#define TCA_ACT_TAB TCA_ROOT_TAB
+	TCA_ROOT_FLAGS,
+	TCA_ROOT_COUNT,
+	__TCA_ROOT_MAX,
+#define	TCA_ROOT_MAX (__TCA_ROOT_MAX - 1)
+};
+
 #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
-#define TCAA_MAX 1
+/* tcamsg flags stored in attribute TCA_ROOT_FLAGS
+ *
+ * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
+ * actions in a dump. All dump responses will contain the number of actions
+ * being dumped stored in for user app's consumption in TCA_ROOT_COUNT
+ *
+ */
+#define TCA_FLAG_LARGE_DUMP_ON		(1 << 0)
+#define VALID_TCA_FLAGS TCA_FLAG_LARGE_DUMP_ON
 
 /* New extended info filters for IFLA_EXT_MASK */
 #define RTEXT_FILTER_VF		(1 << 0)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9ce22b7..2756213 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -83,6 +83,7 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+	u32 act_flags = cb->args[2];
 	struct nlattr *nest;
 
 	spin_lock_bh(&hinfo->lock);
@@ -111,14 +112,18 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			}
 			nla_nest_end(skb, nest);
 			n_i++;
-			if (n_i >= TCA_ACT_MAX_PRIO)
+			if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
+			    n_i >= TCA_ACT_MAX_PRIO)
 				goto done;
 		}
 	}
 done:
 	spin_unlock_bh(&hinfo->lock);
-	if (n_i)
+	if (n_i) {
 		cb->args[0] += n_i;
+		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
+			cb->args[1] = n_i;
+	}
 	return n_i;
 
 nla_put_failure:
@@ -993,11 +998,15 @@  static int tcf_action_add(struct net *net, struct nlattr *nla,
 	return tcf_add_notify(net, n, &actions, portid);
 }
 
+static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
+	[TCA_ROOT_FLAGS]      = { .type = NLA_U32 },
+};
+
 static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 			 struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct nlattr *tca[TCAA_MAX + 1];
+	struct nlattr *tca[TCA_ROOT_MAX + 1];
 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
 	int ret = 0, ovr = 0;
 
@@ -1005,7 +1014,7 @@  static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
-	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, NULL,
+	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
 			  extack);
 	if (ret < 0)
 		return ret;
@@ -1046,16 +1055,12 @@  static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 	return ret;
 }
 
-static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
+static struct nlattr *find_dump_kind(struct nlattr **nla)
 {
 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
-	struct nlattr *nla[TCAA_MAX + 1];
 	struct nlattr *kind;
 
-	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
-			NULL, NULL) < 0)
-		return NULL;
 	tb1 = nla[TCA_ACT_TAB];
 	if (tb1 == NULL)
 		return NULL;
@@ -1073,6 +1078,16 @@  static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
 	return kind;
 }
 
+static inline bool tca_flags_valid(u32 act_flags)
+{
+	u32 invalid_flags_mask  = ~VALID_TCA_FLAGS;
+
+	if (act_flags & invalid_flags_mask)
+		return false;
+
+	return true;
+}
+
 static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
@@ -1082,8 +1097,18 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	struct tc_action_ops *a_o;
 	int ret = 0;
 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
-	struct nlattr *kind = find_dump_kind(cb->nlh);
+	struct nlattr *count_attr = NULL;
+	struct nlattr *tb[TCA_ROOT_MAX + 1];
+	struct nlattr *kind = NULL;
+	u32 act_flags = 0;
+	u32 act_count = 0;
+
+	ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
+			  tcaa_policy, NULL);
+	if (ret < 0)
+		return ret;
 
+	kind = find_dump_kind(tb);
 	if (kind == NULL) {
 		pr_info("tc_dump_action: action bad kind\n");
 		return 0;
@@ -1093,14 +1118,25 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (a_o == NULL)
 		return 0;
 
+	if (tb[TCA_ROOT_FLAGS])
+		act_flags = nla_get_u32(tb[TCA_ROOT_FLAGS]);
+
+	if (act_flags && !tca_flags_valid(act_flags))
+		return -EINVAL;
+
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			cb->nlh->nlmsg_type, sizeof(*t), 0);
 	if (!nlh)
 		goto out_module_put;
+
+	cb->args[2] = act_flags;
 	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
+	count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32));
+	if (!count_attr)
+		goto out_module_put;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
@@ -1113,6 +1149,9 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (ret > 0) {
 		nla_nest_end(skb, nest);
 		ret = skb->len;
+		act_count = cb->args[1];
+		memcpy(nla_data(count_attr), &act_count, sizeof(u32));
+		cb->args[1] = 0;
 	} else
 		nlmsg_trim(skb, b);