[net-next,v5,1/2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch

Submitted by Jamal Hadi Salim on April 20, 2017, 1:06 p.m.

Details

Message ID 1492693582-26810-2-git-send-email-jhs@emojatatu.com
State Changes Requested
Delegated to: David Miller
Headers show

Commit Message

Jamal Hadi Salim April 20, 2017, 1:06 p.m.
From: Jamal Hadi Salim <jhs@mojatatu.com>

When you dump hundreds of thousands of actions, getting only 32 per
dump batch even when the socket buffer and memory allocations allow
is inefficient.

With this change, the user will get as many as possibly fitting
within the given constraints available to the kernel.

A new top level TLV space is introduced. An attribute
TCAA_ACT_FLAGS is used to carry the flags indicating the user
is capable of processing these large dumps. Older user space which
doesn't set this flag doesn't get the large (than 32) batches.
The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
actions are put in a single batch. As such user space app knows how long
to iterate (independent of the type of action being dumped)
instead of hardcoded maximum of 32.

Some results dumping 1.5M actions, first unpatched tc which the
kernel doesn't help:

prompt$ time -p tc actions ls action gact | grep index | wc -l
1500000
real 1388.43
user 2.07
sys 1386.79

Now lets see a patched tc which sets the correct flags when requesting
a dump:

prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
1500000
real 178.13
user 2.02
sys 176.96

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++--
 net/sched/act_api.c            | 46 +++++++++++++++++++++++++++++++++---------
 2 files changed, 55 insertions(+), 12 deletions(-)

Comments

Jiri Pirko April 20, 2017, 1:59 p.m.
Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote:
>From: Jamal Hadi Salim <jhs@mojatatu.com>
>
>When you dump hundreds of thousands of actions, getting only 32 per
>dump batch even when the socket buffer and memory allocations allow
>is inefficient.
>
>With this change, the user will get as many as possibly fitting
>within the given constraints available to the kernel.
>
>A new top level TLV space is introduced. An attribute
>TCAA_ACT_FLAGS is used to carry the flags indicating the user
>is capable of processing these large dumps. Older user space which
>doesn't set this flag doesn't get the large (than 32) batches.
>The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
>actions are put in a single batch. As such user space app knows how long
>to iterate (independent of the type of action being dumped)
>instead of hardcoded maximum of 32.
>
>Some results dumping 1.5M actions, first unpatched tc which the
>kernel doesn't help:
>
>prompt$ time -p tc actions ls action gact | grep index | wc -l
>1500000
>real 1388.43
>user 2.07
>sys 1386.79
>
>Now lets see a patched tc which sets the correct flags when requesting
>a dump:
>
>prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
>1500000
>real 178.13
>user 2.02
>sys 176.96
>
>Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>---
> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++--
> net/sched/act_api.c            | 46 +++++++++++++++++++++++++++++++++---------
> 2 files changed, 55 insertions(+), 12 deletions(-)
>
>diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>index cce0613..d7d28ec 100644
>--- a/include/uapi/linux/rtnetlink.h
>+++ b/include/uapi/linux/rtnetlink.h
>@@ -674,10 +674,27 @@ struct tcamsg {
> 	unsigned char	tca__pad1;
> 	unsigned short	tca__pad2;
> };
>+
>+enum {
>+	TCAA_UNSPEC,

TCAA stands for "traffic control action action". I don't get it :( 
Prefix still sounds wrong to me, sorry :/
Should be:
TCA_SOMETHING_*


>+	TCAA_ACT_TAB,
>+#define TCA_ACT_TAB TCAA_ACT_TAB
>+	TCAA_ACT_FLAGS,
>+	TCAA_ACT_COUNT,
>+	__TCAA_MAX,
>+#define	TCAA_MAX (__TCAA_MAX - 1)
>+};
>+
> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>-#define TCAA_MAX 1
>+/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
>+ *
>+ * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>+ * actions in a dump. All dump responses will contain the number of actions
>+ * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
>+ *
>+ */
>+#define ACT_LARGE_DUMP_ON		BIT(0)

Please put some prefix to the name, as I asked for in the previous
version.	
	
	
> 
> /* New extended info filters for IFLA_EXT_MASK */
> #define RTEXT_FILTER_VF		(1 << 0)
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index 82b1d48..f85b8fd 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> 			   struct netlink_callback *cb)
> {
> 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>+	unsigned short act_flags = cb->args[2];
> 	struct nlattr *nest;
> 
> 	spin_lock_bh(&hinfo->lock);
>@@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> 			}
> 			nla_nest_end(skb, nest);
> 			n_i++;
>-			if (n_i >= TCA_ACT_MAX_PRIO)
>+			if (!(act_flags & ACT_LARGE_DUMP_ON) &&
>+			    n_i >= TCA_ACT_MAX_PRIO)
> 				goto done;
> 		}
> 	}
> done:
> 	spin_unlock_bh(&hinfo->lock);
>-	if (n_i)
>+	if (n_i) {
> 		cb->args[0] += n_i;
>+		if (act_flags & ACT_LARGE_DUMP_ON)
>+			cb->args[1] = n_i;
>+	}
> 	return n_i;
> 
> nla_put_failure:
>@@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
> 	return tcf_add_notify(net, n, &actions, portid);
> }
> 
>+static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = {
>+	[TCAA_ACT_FLAGS]      = { .type = NLA_U32 },
>+};
>+
> static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 			 struct netlink_ext_ack *extack)
> {
> 	struct net *net = sock_net(skb->sk);
>-	struct nlattr *tca[TCA_ACT_MAX + 1];
>+	struct nlattr *tca[TCAA_MAX + 1];
> 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
> 	int ret = 0, ovr = 0;
> 
>@@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 	    !netlink_capable(skb, CAP_NET_ADMIN))
> 		return -EPERM;
> 
>-	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
>+	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,

Please do this in a separate patch. It is an unrelated bug fix.



> 			  extack);
> 	if (ret < 0)
> 		return ret;
>@@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 	return ret;
> }
> 
>-static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>+static struct nlattr *find_dump_kind(struct nlattr **nla)
> {
> 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
> 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
>-	struct nlattr *nla[TCAA_MAX + 1];
> 	struct nlattr *kind;
> 
>-	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
>-			NULL, NULL) < 0)
>-		return NULL;
> 	tb1 = nla[TCA_ACT_TAB];
> 	if (tb1 == NULL)
> 		return NULL;
>@@ -1081,9 +1086,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	struct nlattr *nest;
> 	struct tc_action_ops *a_o;
> 	int ret = 0;
>+	struct nlattr *tcaa[TCAA_MAX + 1];

"tcaa" again, now as a variable :/
Just use "tb" as the rest of the code does.


> 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
>-	struct nlattr *kind = find_dump_kind(cb->nlh);
>+	struct nlattr *kind = NULL;
>+	struct nlattr *count_attr = NULL;
>+	u32 act_flags = 0;
>+
>+	ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tcaa, TCAA_MAX,
>+			  tcaa_policy, NULL);
>+	if (ret < 0)
>+		return ret;
> 
>+	kind = find_dump_kind(tcaa);
> 	if (kind == NULL) {
> 		pr_info("tc_dump_action: action bad kind\n");
> 		return 0;
>@@ -1093,14 +1107,23 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	if (a_o == NULL)
> 		return 0;
> 
>+	if (tcaa[TCAA_ACT_FLAGS])
>+		act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);

I still believe this is wrong. Should be a separate attr per flag.
For user experience breakage reasons: 
2 kernels should not behave differently on the exact same value passed
from userspace:
User passes 0x2. Now the kernel will ignore the set bit, the next kernel
will recognize it as a valid flag and do something.
Please let the discussion reach a consensus before pushing this again.


>+
> 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> 			cb->nlh->nlmsg_type, sizeof(*t), 0);
> 	if (!nlh)
> 		goto out_module_put;
>+
>+	cb->args[2] = act_flags;
>+
> 	t = nlmsg_data(nlh);
> 	t->tca_family = AF_UNSPEC;
> 	t->tca__pad1 = 0;
> 	t->tca__pad2 = 0;
>+	count_attr = nla_reserve(skb, TCAA_ACT_COUNT, sizeof(u32));
>+	if (!count_attr)
>+		goto out_module_put;
> 
> 	nest = nla_nest_start(skb, TCA_ACT_TAB);
> 	if (nest == NULL)
>@@ -1113,6 +1136,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	if (ret > 0) {
> 		nla_nest_end(skb, nest);
> 		ret = skb->len;
>+		memcpy(nla_data(count_attr), &cb->args[1], sizeof(u32));
>+		cb->args[1] = 0;
> 	} else
> 		nlmsg_trim(skb, b);
> 
>-- 
>1.9.1
>
Jamal Hadi Salim April 20, 2017, 2:18 p.m.
On 17-04-20 09:59 AM, Jiri Pirko wrote:
> Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>> When you dump hundreds of thousands of actions, getting only 32 per
>> dump batch even when the socket buffer and memory allocations allow
>> is inefficient.
>>
>> With this change, the user will get as many as possibly fitting
>> within the given constraints available to the kernel.
>>
>> A new top level TLV space is introduced. An attribute
>> TCAA_ACT_FLAGS is used to carry the flags indicating the user
>> is capable of processing these large dumps. Older user space which
>> doesn't set this flag doesn't get the large (than 32) batches.
>> The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
>> actions are put in a single batch. As such user space app knows how long
>> to iterate (independent of the type of action being dumped)
>> instead of hardcoded maximum of 32.
>>
>> Some results dumping 1.5M actions, first unpatched tc which the
>> kernel doesn't help:
>>
>> prompt$ time -p tc actions ls action gact | grep index | wc -l
>> 1500000
>> real 1388.43
>> user 2.07
>> sys 1386.79
>>
>> Now lets see a patched tc which sets the correct flags when requesting
>> a dump:
>>
>> prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
>> 1500000
>> real 178.13
>> user 2.02
>> sys 176.96
>>
>> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>> ---
>> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++--
>> net/sched/act_api.c            | 46 +++++++++++++++++++++++++++++++++---------
>> 2 files changed, 55 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>> index cce0613..d7d28ec 100644
>> --- a/include/uapi/linux/rtnetlink.h
>> +++ b/include/uapi/linux/rtnetlink.h
>> @@ -674,10 +674,27 @@ struct tcamsg {
>> 	unsigned char	tca__pad1;
>> 	unsigned short	tca__pad2;
>> };
>> +
>> +enum {
>> +	TCAA_UNSPEC,
>
> TCAA stands for "traffic control action action". I don't get it :(
> Prefix still sounds wrong to me, sorry :/
> Should be:
> TCA_SOMETHING_*
>
>
>> +	TCAA_ACT_TAB,
>> +#define TCA_ACT_TAB TCAA_ACT_TAB
>> +	TCAA_ACT_FLAGS,
>> +	TCAA_ACT_COUNT,
>> +	__TCAA_MAX,
>> +#define	TCAA_MAX (__TCAA_MAX - 1)
>> +};
>> +
>> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>> -#define TCAA_MAX 1
>> +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
>> + *
>> + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>> + * actions in a dump. All dump responses will contain the number of actions
>> + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
>> + *
>> + */
>> +#define ACT_LARGE_DUMP_ON		BIT(0)
>
> Please put some prefix to the name, as I asked for in the previous
> version.	
> 	

Didnt mean to leave out but:
I cant seem to find it. Do you recall what you said it should be?

> 	
>>
>> /* New extended info filters for IFLA_EXT_MASK */
>> #define RTEXT_FILTER_VF		(1 << 0)
>> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>> index 82b1d48..f85b8fd 100644
>> --- a/net/sched/act_api.c
>> +++ b/net/sched/act_api.c
>> @@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> 			   struct netlink_callback *cb)
>> {
>> 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>> +	unsigned short act_flags = cb->args[2];
>> 	struct nlattr *nest;
>>
>> 	spin_lock_bh(&hinfo->lock);
>> @@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> 			}
>> 			nla_nest_end(skb, nest);
>> 			n_i++;
>> -			if (n_i >= TCA_ACT_MAX_PRIO)
>> +			if (!(act_flags & ACT_LARGE_DUMP_ON) &&
>> +			    n_i >= TCA_ACT_MAX_PRIO)
>> 				goto done;
>> 		}
>> 	}
>> done:
>> 	spin_unlock_bh(&hinfo->lock);
>> -	if (n_i)
>> +	if (n_i) {
>> 		cb->args[0] += n_i;
>> +		if (act_flags & ACT_LARGE_DUMP_ON)
>> +			cb->args[1] = n_i;
>> +	}
>> 	return n_i;
>>
>> nla_put_failure:
>> @@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
>> 	return tcf_add_notify(net, n, &actions, portid);
>> }
>>
>> +static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = {
>> +	[TCAA_ACT_FLAGS]      = { .type = NLA_U32 },
>> +};
>> +
>> static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> 			 struct netlink_ext_ack *extack)
>> {
>> 	struct net *net = sock_net(skb->sk);
>> -	struct nlattr *tca[TCA_ACT_MAX + 1];
>> +	struct nlattr *tca[TCAA_MAX + 1];
>> 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
>> 	int ret = 0, ovr = 0;
>>
>> @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> 	    !netlink_capable(skb, CAP_NET_ADMIN))
>> 		return -EPERM;
>>
>> -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
>> +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
>
> Please do this in a separate patch. It is an unrelated bug fix.
>

Ok, that is fair. Thanks. Ok, if this was patch 1/3 in next update.

>
>

>
> "tcaa" again, now as a variable :/
> Just use "tb" as the rest of the code does.
>

Sure.

>
>>
>> +	if (tcaa[TCAA_ACT_FLAGS])
>> +		act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);
>
> I still believe this is wrong. Should be a separate attr per flag.
> For user experience breakage reasons:
> 2 kernels should not behave differently on the exact same value passed
> from userspace:
> User passes 0x2. Now the kernel will ignore the set bit, the next kernel
> will recognize it as a valid flag and do something.
> Please let the discussion reach a consensus before pushing this again.
>
>

Jiri - I dont agree. There is no such breakage. Refer to my previous
email. Lets just move on.

cheers,
jamal
Jiri Pirko April 20, 2017, 2:24 p.m.
Thu, Apr 20, 2017 at 04:18:50PM CEST, jhs@mojatatu.com wrote:
>On 17-04-20 09:59 AM, Jiri Pirko wrote:
>> Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote:
>> > From: Jamal Hadi Salim <jhs@mojatatu.com>
>> > 
>> > When you dump hundreds of thousands of actions, getting only 32 per
>> > dump batch even when the socket buffer and memory allocations allow
>> > is inefficient.
>> > 
>> > With this change, the user will get as many as possibly fitting
>> > within the given constraints available to the kernel.
>> > 
>> > A new top level TLV space is introduced. An attribute
>> > TCAA_ACT_FLAGS is used to carry the flags indicating the user
>> > is capable of processing these large dumps. Older user space which
>> > doesn't set this flag doesn't get the large (than 32) batches.
>> > The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
>> > actions are put in a single batch. As such user space app knows how long
>> > to iterate (independent of the type of action being dumped)
>> > instead of hardcoded maximum of 32.
>> > 
>> > Some results dumping 1.5M actions, first unpatched tc which the
>> > kernel doesn't help:
>> > 
>> > prompt$ time -p tc actions ls action gact | grep index | wc -l
>> > 1500000
>> > real 1388.43
>> > user 2.07
>> > sys 1386.79
>> > 
>> > Now lets see a patched tc which sets the correct flags when requesting
>> > a dump:
>> > 
>> > prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
>> > 1500000
>> > real 178.13
>> > user 2.02
>> > sys 176.96
>> > 
>> > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>> > ---
>> > include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++--
>> > net/sched/act_api.c            | 46 +++++++++++++++++++++++++++++++++---------
>> > 2 files changed, 55 insertions(+), 12 deletions(-)
>> > 
>> > diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>> > index cce0613..d7d28ec 100644
>> > --- a/include/uapi/linux/rtnetlink.h
>> > +++ b/include/uapi/linux/rtnetlink.h
>> > @@ -674,10 +674,27 @@ struct tcamsg {
>> > 	unsigned char	tca__pad1;
>> > 	unsigned short	tca__pad2;
>> > };
>> > +
>> > +enum {
>> > +	TCAA_UNSPEC,
>> 
>> TCAA stands for "traffic control action action". I don't get it :(
>> Prefix still sounds wrong to me, sorry :/
>> Should be:
>> TCA_SOMETHING_*
>> 
>> 
>> > +	TCAA_ACT_TAB,
>> > +#define TCA_ACT_TAB TCAA_ACT_TAB
>> > +	TCAA_ACT_FLAGS,
>> > +	TCAA_ACT_COUNT,
>> > +	__TCAA_MAX,
>> > +#define	TCAA_MAX (__TCAA_MAX - 1)
>> > +};
>> > +
>> > #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>> > #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>> > -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>> > -#define TCAA_MAX 1
>> > +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
>> > + *
>> > + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>> > + * actions in a dump. All dump responses will contain the number of actions
>> > + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
>> > + *
>> > + */
>> > +#define ACT_LARGE_DUMP_ON		BIT(0)
>> 
>> Please put some prefix to the name, as I asked for in the previous
>> version.	
>> 	
>
>Didnt mean to leave out but:
>I cant seem to find it. Do you recall what you said it should be?

TCA_SOMETHING_FLAGS_LARGE_DUMP_ON


>
>> 	
>> > 
>> > /* New extended info filters for IFLA_EXT_MASK */
>> > #define RTEXT_FILTER_VF		(1 << 0)
>> > diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>> > index 82b1d48..f85b8fd 100644
>> > --- a/net/sched/act_api.c
>> > +++ b/net/sched/act_api.c
>> > @@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> > 			   struct netlink_callback *cb)
>> > {
>> > 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>> > +	unsigned short act_flags = cb->args[2];
>> > 	struct nlattr *nest;
>> > 
>> > 	spin_lock_bh(&hinfo->lock);
>> > @@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> > 			}
>> > 			nla_nest_end(skb, nest);
>> > 			n_i++;
>> > -			if (n_i >= TCA_ACT_MAX_PRIO)
>> > +			if (!(act_flags & ACT_LARGE_DUMP_ON) &&
>> > +			    n_i >= TCA_ACT_MAX_PRIO)
>> > 				goto done;
>> > 		}
>> > 	}
>> > done:
>> > 	spin_unlock_bh(&hinfo->lock);
>> > -	if (n_i)
>> > +	if (n_i) {
>> > 		cb->args[0] += n_i;
>> > +		if (act_flags & ACT_LARGE_DUMP_ON)
>> > +			cb->args[1] = n_i;
>> > +	}
>> > 	return n_i;
>> > 
>> > nla_put_failure:
>> > @@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
>> > 	return tcf_add_notify(net, n, &actions, portid);
>> > }
>> > 
>> > +static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = {
>> > +	[TCAA_ACT_FLAGS]      = { .type = NLA_U32 },
>> > +};
>> > +
>> > static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 			 struct netlink_ext_ack *extack)
>> > {
>> > 	struct net *net = sock_net(skb->sk);
>> > -	struct nlattr *tca[TCA_ACT_MAX + 1];
>> > +	struct nlattr *tca[TCAA_MAX + 1];
>> > 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
>> > 	int ret = 0, ovr = 0;
>> > 
>> > @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 	    !netlink_capable(skb, CAP_NET_ADMIN))
>> > 		return -EPERM;
>> > 
>> > -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
>> > +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
>> 
>> Please do this in a separate patch. It is an unrelated bug fix.
>> 
>
>Ok, that is fair. Thanks. Ok, if this was patch 1/3 in next update.

Good.

>
>> 
>> 
>
>> 
>> "tcaa" again, now as a variable :/
>> Just use "tb" as the rest of the code does.
>> 
>
>Sure.
>
>> 
>> > 
>> > +	if (tcaa[TCAA_ACT_FLAGS])
>> > +		act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);
>> 
>> I still believe this is wrong. Should be a separate attr per flag.
>> For user experience breakage reasons:
>> 2 kernels should not behave differently on the exact same value passed
>> from userspace:
>> User passes 0x2. Now the kernel will ignore the set bit, the next kernel
>> will recognize it as a valid flag and do something.
>> Please let the discussion reach a consensus before pushing this again.
>> 
>> 
>
>Jiri - I dont agree. There is no such breakage. Refer to my previous
>email. Lets just move on.

Anyone else has opinion on this?
Jamal Hadi Salim April 20, 2017, 2:25 p.m.
On 17-04-20 09:59 AM, Jiri Pirko wrote:
> Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>> When you dump hundreds of thousands of actions, getting only 32 per
>> dump batch even when the socket buffer and memory allocations allow
>> is inefficient.
>>
>> With this change, the user will get as many as possibly fitting
>> within the given constraints available to the kernel.
>>
>> A new top level TLV space is introduced. An attribute
>> TCAA_ACT_FLAGS is used to carry the flags indicating the user
>> is capable of processing these large dumps. Older user space which
>> doesn't set this flag doesn't get the large (than 32) batches.
>> The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
>> actions are put in a single batch. As such user space app knows how long
>> to iterate (independent of the type of action being dumped)
>> instead of hardcoded maximum of 32.
>>
>> Some results dumping 1.5M actions, first unpatched tc which the
>> kernel doesn't help:
>>
>> prompt$ time -p tc actions ls action gact | grep index | wc -l
>> 1500000
>> real 1388.43
>> user 2.07
>> sys 1386.79
>>
>> Now lets see a patched tc which sets the correct flags when requesting
>> a dump:
>>
>> prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
>> 1500000
>> real 178.13
>> user 2.02
>> sys 176.96
>>
>> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>> ---
>> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++--
>> net/sched/act_api.c            | 46 +++++++++++++++++++++++++++++++++---------
>> 2 files changed, 55 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>> index cce0613..d7d28ec 100644
>> --- a/include/uapi/linux/rtnetlink.h
>> +++ b/include/uapi/linux/rtnetlink.h
>> @@ -674,10 +674,27 @@ struct tcamsg {
>> 	unsigned char	tca__pad1;
>> 	unsigned short	tca__pad2;
>> };
>> +
>> +enum {
>> +	TCAA_UNSPEC,
>
> TCAA stands for "traffic control action action". I don't get it :(

TC Action Attributes == TCAA.

> Prefix still sounds wrong to me, sorry :/
> Should be:
> TCA_SOMETHING_*
>

TCA_ATTR_XXX ?

I have another opportunity to make a change  where we should close
the discussion on this. We cant drag it forever.

cheers,
jamal
Jiri Pirko April 20, 2017, 2:33 p.m.
Thu, Apr 20, 2017 at 04:25:11PM CEST, jhs@mojatatu.com wrote:
>On 17-04-20 09:59 AM, Jiri Pirko wrote:
>> Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote:
>> > From: Jamal Hadi Salim <jhs@mojatatu.com>
>> > 
>> > When you dump hundreds of thousands of actions, getting only 32 per
>> > dump batch even when the socket buffer and memory allocations allow
>> > is inefficient.
>> > 
>> > With this change, the user will get as many as possibly fitting
>> > within the given constraints available to the kernel.
>> > 
>> > A new top level TLV space is introduced. An attribute
>> > TCAA_ACT_FLAGS is used to carry the flags indicating the user
>> > is capable of processing these large dumps. Older user space which
>> > doesn't set this flag doesn't get the large (than 32) batches.
>> > The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
>> > actions are put in a single batch. As such user space app knows how long
>> > to iterate (independent of the type of action being dumped)
>> > instead of hardcoded maximum of 32.
>> > 
>> > Some results dumping 1.5M actions, first unpatched tc which the
>> > kernel doesn't help:
>> > 
>> > prompt$ time -p tc actions ls action gact | grep index | wc -l
>> > 1500000
>> > real 1388.43
>> > user 2.07
>> > sys 1386.79
>> > 
>> > Now lets see a patched tc which sets the correct flags when requesting
>> > a dump:
>> > 
>> > prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
>> > 1500000
>> > real 178.13
>> > user 2.02
>> > sys 176.96
>> > 
>> > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>> > ---
>> > include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++--
>> > net/sched/act_api.c            | 46 +++++++++++++++++++++++++++++++++---------
>> > 2 files changed, 55 insertions(+), 12 deletions(-)
>> > 
>> > diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>> > index cce0613..d7d28ec 100644
>> > --- a/include/uapi/linux/rtnetlink.h
>> > +++ b/include/uapi/linux/rtnetlink.h
>> > @@ -674,10 +674,27 @@ struct tcamsg {
>> > 	unsigned char	tca__pad1;
>> > 	unsigned short	tca__pad2;
>> > };
>> > +
>> > +enum {
>> > +	TCAA_UNSPEC,
>> 
>> TCAA stands for "traffic control action action". I don't get it :(
>
>TC Action Attributes == TCAA.
>
>> Prefix still sounds wrong to me, sorry :/
>> Should be:
>> TCA_SOMETHING_*
>> 
>
>TCA_ATTR_XXX ?

Of course it is an attribute. No need to have "attr" in the name. No
other enum has it. Does not make sense.

The name should contain the group name. What group this enum defines?


>
>I have another opportunity to make a change  where we should close
>the discussion on this. We cant drag it forever.
>
>cheers,
>jamal
>
Jamal Hadi Salim April 20, 2017, 3:08 p.m.
On 17-04-20 10:33 AM, Jiri Pirko wrote:
> Thu, Apr 20, 2017 at 04:25:11PM CEST, jhs@mojatatu.com wrote:

>>> TCAA stands for "traffic control action action". I don't get it :(
>>
>> TC Action Attributes == TCAA.
>>
>>> Prefix still sounds wrong to me, sorry :/
>>> Should be:
>>> TCA_SOMETHING_*
>>>
>>
>> TCA_ATTR_XXX ?
>
> Of course it is an attribute. No need to have "attr" in the name. No
> other enum has it. Does not make sense.
>
> The name should contain the group name. What group this enum defines?
>

It defines tc root level Action attributes.
Would TCRLAA_XXX be more exciting? ;->

cheers,
jamal
Jiri Pirko April 20, 2017, 3:13 p.m.
Thu, Apr 20, 2017 at 05:08:20PM CEST, jhs@mojatatu.com wrote:
>On 17-04-20 10:33 AM, Jiri Pirko wrote:
>> Thu, Apr 20, 2017 at 04:25:11PM CEST, jhs@mojatatu.com wrote:
>
>> > > TCAA stands for "traffic control action action". I don't get it :(
>> > 
>> > TC Action Attributes == TCAA.
>> > 
>> > > Prefix still sounds wrong to me, sorry :/
>> > > Should be:
>> > > TCA_SOMETHING_*
>> > > 
>> > 
>> > TCA_ATTR_XXX ?
>> 
>> Of course it is an attribute. No need to have "attr" in the name. No
>> other enum has it. Does not make sense.
>> 
>> The name should contain the group name. What group this enum defines?
>> 
>
>It defines tc root level Action attributes.
>Would TCRLAA_XXX be more exciting? ;->

How about TCA_ROOT_XXX


>
>cheers,
>jamal
>
Eric Dumazet April 20, 2017, 4:09 p.m.
On Thu, 2017-04-20 at 09:06 -0400, Jamal Hadi Salim wrote:

>  	nest = nla_nest_start(skb, TCA_ACT_TAB);
>  	if (nest == NULL)
> @@ -1113,6 +1136,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
>  	if (ret > 0) {
>  		nla_nest_end(skb, nest);
>  		ret = skb->len;
> +		memcpy(nla_data(count_attr), &cb->args[1], sizeof(u32));

This will not work on BigEndian 64bit hosts, since cb->args[1] is bigger
than 32bit.


> +		cb->args[1] = 0;
>  	} else
>  		nlmsg_trim(skb, b);
>
Jamal Hadi Salim April 20, 2017, 5:39 p.m.
On 17-04-20 12:09 PM, Eric Dumazet wrote:
> On Thu, 2017-04-20 at 09:06 -0400, Jamal Hadi Salim wrote:
>
>>  	nest = nla_nest_start(skb, TCA_ACT_TAB);
>>  	if (nest == NULL)
>> @@ -1113,6 +1136,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
>>  	if (ret > 0) {
>>  		nla_nest_end(skb, nest);
>>  		ret = skb->len;
>> +		memcpy(nla_data(count_attr), &cb->args[1], sizeof(u32));
>
> This will not work on BigEndian 64bit hosts, since cb->args[1] is bigger
> than 32bit.
>

Ok, thanks.
I will assign to a 32 bit var first then memcpy in the next iteration
(tomorrow).

cheers,
jamal
Simon Horman April 24, 2017, 9:14 a.m.
On Thu, Apr 20, 2017 at 04:24:53PM +0200, Jiri Pirko wrote:
> Thu, Apr 20, 2017 at 04:18:50PM CEST, jhs@mojatatu.com wrote:
> >On 17-04-20 09:59 AM, Jiri Pirko wrote:
> >> Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote:
> >> > From: Jamal Hadi Salim <jhs@mojatatu.com>

...

> >> > +	if (tcaa[TCAA_ACT_FLAGS])
> >> > +		act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);
> >> 
> >> I still believe this is wrong. Should be a separate attr per flag.
> >> For user experience breakage reasons:
> >> 2 kernels should not behave differently on the exact same value passed
> >> from userspace:
> >> User passes 0x2. Now the kernel will ignore the set bit, the next kernel
> >> will recognize it as a valid flag and do something.
> >> Please let the discussion reach a consensus before pushing this again.
> >> 
> >> 
> >
> >Jiri - I dont agree. There is no such breakage. Refer to my previous
> >email. Lets just move on.
> 
> Anyone else has opinion on this?

At the risk of jumping into a hornets nest, yes, I have an opinion:

* A agree with Jiri that a separate attribute per flag seems to be
  the cleanest option, however;
* I think it would be reasonable from a UABI PoV to permit currently unused
  bits of TCAA_ACT_FLAGS to be re-uses so long as the kernel checks that
  they are zero until they are designated to have some use. I believe this
  implies that the default value for any future uses of these bits would be
  zero.

Jamal, I am confused about why are you so concerned about the space
consumed by this attribute, it's per-message, right? Is it the bigger
picture you are worried about - a similar per-entry flag at some point in
the future?
Jamal Hadi Salim April 24, 2017, 12:49 p.m.
On 17-04-24 05:14 AM, Simon Horman wrote:
[..]

> Jamal, I am confused about why are you so concerned about the space
> consumed by this attribute, it's per-message, right? Is it the bigger
> picture you are worried about - a similar per-entry flag at some point in
> the future?


To me the two worries are one and the same.

Jiri strongly believes (from a big picture view) we must use
TLVs for extensibility.
While I agree with him in general i have strong reservations
in this case because i can get both extensibility and
build for performance with using a flag bitmask as the
content of the TLV.

A TLV consumes 64 bits minimum. It doesnt matter if we decide
to use a u8 or a u16, we are still sending 64 bits on that
TLV with the rest being PADding. Not to be melodramatic, but
the worst case scenario of putting everything in a TLV for 32
flags is using about 30x more space than using a bitmask.

Yes, space is important and if i can express upto 32 flags
with one TLV rather than 32 TLVs i choose one TLV.
I am always looking for ways to filter out crap i dont need
when i do stats collection. I have numerous wounds from fdb
entries which decided to use a TLV per flag.

The design approach we have used in netlink is: flags start
as a bitmap (whether they are on main headers or TLVs); they may be
complemented with a bitmask/selector (refer to IFLINK messages).

Lets look at this specific patch I have sending. I have already
changed it 3 times and involved a churn of 3 different flags.
If you asked me in the beggining i wouldve scratched my head
thinking for a near term use for bit #3, #4 etc,

I am fine with the counter-Postel view of having the kernel
validate that appropriate bits are set as long as we dont make
user space to now start learning how to play acrobatics.

cheers,
jamal
Pablo Neira April 24, 2017, 2:20 p.m.
On Mon, Apr 24, 2017 at 08:49:00AM -0400, Jamal Hadi Salim wrote:
> On 17-04-24 05:14 AM, Simon Horman wrote:
> [..]
> 
> >Jamal, I am confused about why are you so concerned about the space
> >consumed by this attribute, it's per-message, right? Is it the bigger
> >picture you are worried about - a similar per-entry flag at some point in
> >the future?
> 
> 
> To me the two worries are one and the same.
> 
> Jiri strongly believes (from a big picture view) we must use
> TLVs for extensibility.
> While I agree with him in general i have strong reservations
> in this case because i can get both extensibility and
> build for performance with using a flag bitmask as the
> content of the TLV.
> 
> A TLV consumes 64 bits minimum. It doesnt matter if we decide
> to use a u8 or a u16, we are still sending 64 bits on that
> TLV with the rest being PADding. Not to be melodramatic, but
> the worst case scenario of putting everything in a TLV for 32
> flags is using about 30x more space than using a bitmask.
> 
> Yes, space is important and if i can express upto 32 flags
> with one TLV rather than 32 TLVs i choose one TLV.
> I am always looking for ways to filter out crap i dont need
> when i do stats collection. I have numerous wounds from fdb
> entries which decided to use a TLV per flag.
> 
> The design approach we have used in netlink is: flags start
> as a bitmap (whether they are on main headers or TLVs); they may be
> complemented with a bitmask/selector (refer to IFLINK messages).
> 
> Lets look at this specific patch I have sending. I have already
> changed it 3 times and involved a churn of 3 different flags.
> If you asked me in the beggining i wouldve scratched my head
> thinking for a near term use for bit #3, #4 etc,
> 
> I am fine with the counter-Postel view of having the kernel
> validate that appropriate bits are set as long as we dont make
> user space to now start learning how to play acrobatics.

jamal, what performance concern you have in building this error
message? TLVs is the most flexible way. And this is error path, so we
should build this message rarely, only if the user sends us something
incorrect, why bother...
Jamal Hadi Salim April 24, 2017, 2:42 p.m.
On 17-04-24 10:20 AM, Pablo Neira Ayuso wrote:
> On Mon, Apr 24, 2017 at 08:49:00AM -0400, Jamal Hadi Salim wrote:

>>
>> I am fine with the counter-Postel view of having the kernel
>> validate that appropriate bits are set as long as we dont make
>> user space to now start learning how to play acrobatics.
>
> jamal, what performance concern you have in building this error
> message? TLVs is the most flexible way. And this is error path, so we
> should build this message rarely, only if the user sends us something
> incorrect, why bother...

I have a feeling we are reffering to 2 different things.
Which error message? Are you talking about extended ACK?
I have no problem with that.

Let me sumarize for you the discussion.

My concern was was the double request needed now
which was unneeded before.

Before: You send a msg and say the kernel didnt understand.
Kernel ignores what it didnt understand and does things
you asked it to. i.e Part of Postel principle which says
"Be liberal in what you expect of others"
But the new concern is user space not abiding to the other
half of Postel principle "Be conservative in what you send".
It may set some random flags which the kernel doesnt understand.

One idea is to have the kernel totally reject anytime
it sees such flags. I am sure such a message could be
conveyed back to the user. Then the user sends the
correct one back.

The challenge i have is to enforce this trial by fire
approach to all user space apps. It is a large change.

My suggestion is for user to set flag to request the
old behavior of sending only one message.

cheers,
jamal
David Miller April 24, 2017, 8:30 p.m.
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Mon, 24 Apr 2017 08:49:00 -0400

> Yes, space is important and if i can express upto 32 flags
> with one TLV rather than 32 TLVs i choose one TLV.

Which is fine.  But two things:

1) Again, bits you aren't using now, make sure userspace doesn't
   set them.  And if it does, reject.

2) If you are worried about performance, we're talking about a TLV in
   the request here not the dump response itself so performance isn't
   a real issue as Pablo mentioned.
Jamal Hadi Salim April 24, 2017, 10:18 p.m.
On 17-04-24 04:30 PM, David Miller wrote:

> Which is fine.  But two things:
>
> 1) Again, bits you aren't using now, make sure userspace doesn't
>    set them.  And if it does, reject.
>

I meet those goals on the bit checks but i went a slightly different
path with a patch I posted[1]

With the posted patch: unknow bits set will result in a kernel rejection
unless the user space explicitly ask the kernel to ignore flags it
doesnt understand and just handles what it knows. This reduces the
amount of work in tc.

If this ok I will resend tomorrow.

> 2) If you are worried about performance, we're talking about a TLV in
>    the request here not the dump response itself so performance isn't
>    a real issue as Pablo mentioned.

doesnt make much of a difference for a simple request, true; i was more
worried about how we pack similar things for dumps or for large
set requests in general. And note it makes no difference if i make the
bitmask u32 or u16 - the TLV still eats 32 + 32 bits. So using a u32
is sensible.

cheers,
jamal

[1]
This is because i worry about making large changes to
the behavior of user space apps like tc. If I reject I will need to
change tc to detect this rejection and retry (and I dont think
extended ACKs in their current shape are ready to provide any
meaningful detail).
David Miller April 24, 2017, 10:24 p.m.
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Mon, 24 Apr 2017 18:18:42 -0400

> With the posted patch: unknow bits set will result in a kernel
> rejection unless the user space explicitly ask the kernel to ignore
> flags it doesnt understand and just handles what it knows. This
> reduces the amount of work in tc.
...
> [1]
> This is because i worry about making large changes to
> the behavior of user space apps like tc. If I reject I will need to
> change tc to detect this rejection and retry (and I dont think
> extended ACKs in their current shape are ready to provide any
> meaningful detail).

I think we should eat the pain now and use the strict checks
for all new features like this.

I'm sorry if this makes more work for you, but this is really
how we have to proceed moving forward.

Thanks.
Jamal Hadi Salim April 24, 2017, 10:58 p.m.
On 17-04-24 06:24 PM, David Miller wrote:

>
> I think we should eat the pain now and use the strict checks
> for all new features like this.
>
> I'm sorry if this makes more work for you, but this is really
> how we have to proceed moving forward.
>

There is no work for me to do in tc if i push this in without
the liberal flag i was suggesting.
This is more about the next flag change that will fail with
older kernels unless the large tc change happens
i.e future ones after this kernel patch.

cheers,
jamal

Patch hide | download patch | download mbox

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cce0613..d7d28ec 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -674,10 +674,27 @@  struct tcamsg {
 	unsigned char	tca__pad1;
 	unsigned short	tca__pad2;
 };
+
+enum {
+	TCAA_UNSPEC,
+	TCAA_ACT_TAB,
+#define TCA_ACT_TAB TCAA_ACT_TAB
+	TCAA_ACT_FLAGS,
+	TCAA_ACT_COUNT,
+	__TCAA_MAX,
+#define	TCAA_MAX (__TCAA_MAX - 1)
+};
+
 #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
-#define TCAA_MAX 1
+/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
+ *
+ * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
+ * actions in a dump. All dump responses will contain the number of actions
+ * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
+ *
+ */
+#define ACT_LARGE_DUMP_ON		BIT(0)
 
 /* New extended info filters for IFLA_EXT_MASK */
 #define RTEXT_FILTER_VF		(1 << 0)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 82b1d48..f85b8fd 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -83,6 +83,7 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+	unsigned short act_flags = cb->args[2];
 	struct nlattr *nest;
 
 	spin_lock_bh(&hinfo->lock);
@@ -111,14 +112,18 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			}
 			nla_nest_end(skb, nest);
 			n_i++;
-			if (n_i >= TCA_ACT_MAX_PRIO)
+			if (!(act_flags & ACT_LARGE_DUMP_ON) &&
+			    n_i >= TCA_ACT_MAX_PRIO)
 				goto done;
 		}
 	}
 done:
 	spin_unlock_bh(&hinfo->lock);
-	if (n_i)
+	if (n_i) {
 		cb->args[0] += n_i;
+		if (act_flags & ACT_LARGE_DUMP_ON)
+			cb->args[1] = n_i;
+	}
 	return n_i;
 
 nla_put_failure:
@@ -993,11 +998,15 @@  static int tcf_action_add(struct net *net, struct nlattr *nla,
 	return tcf_add_notify(net, n, &actions, portid);
 }
 
+static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = {
+	[TCAA_ACT_FLAGS]      = { .type = NLA_U32 },
+};
+
 static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 			 struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct nlattr *tca[TCA_ACT_MAX + 1];
+	struct nlattr *tca[TCAA_MAX + 1];
 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
 	int ret = 0, ovr = 0;
 
@@ -1005,7 +1014,7 @@  static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
-	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
+	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
 			  extack);
 	if (ret < 0)
 		return ret;
@@ -1046,16 +1055,12 @@  static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 	return ret;
 }
 
-static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
+static struct nlattr *find_dump_kind(struct nlattr **nla)
 {
 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
-	struct nlattr *nla[TCAA_MAX + 1];
 	struct nlattr *kind;
 
-	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
-			NULL, NULL) < 0)
-		return NULL;
 	tb1 = nla[TCA_ACT_TAB];
 	if (tb1 == NULL)
 		return NULL;
@@ -1081,9 +1086,18 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nlattr *nest;
 	struct tc_action_ops *a_o;
 	int ret = 0;
+	struct nlattr *tcaa[TCAA_MAX + 1];
 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
-	struct nlattr *kind = find_dump_kind(cb->nlh);
+	struct nlattr *kind = NULL;
+	struct nlattr *count_attr = NULL;
+	u32 act_flags = 0;
+
+	ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tcaa, TCAA_MAX,
+			  tcaa_policy, NULL);
+	if (ret < 0)
+		return ret;
 
+	kind = find_dump_kind(tcaa);
 	if (kind == NULL) {
 		pr_info("tc_dump_action: action bad kind\n");
 		return 0;
@@ -1093,14 +1107,23 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (a_o == NULL)
 		return 0;
 
+	if (tcaa[TCAA_ACT_FLAGS])
+		act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);
+
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			cb->nlh->nlmsg_type, sizeof(*t), 0);
 	if (!nlh)
 		goto out_module_put;
+
+	cb->args[2] = act_flags;
+
 	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
+	count_attr = nla_reserve(skb, TCAA_ACT_COUNT, sizeof(u32));
+	if (!count_attr)
+		goto out_module_put;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
@@ -1113,6 +1136,8 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (ret > 0) {
 		nla_nest_end(skb, nest);
 		ret = skb->len;
+		memcpy(nla_data(count_attr), &cb->args[1], sizeof(u32));
+		cb->args[1] = 0;
 	} else
 		nlmsg_trim(skb, b);