diff mbox

[net-next,v4,1/2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch

Message ID 1492603050-9318-1-git-send-email-jhs@emojatatu.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Jamal Hadi Salim April 19, 2017, 11:57 a.m. UTC
From: Jamal Hadi Salim <jhs@mojatatu.com>

When you dump hundreds of thousands of actions, getting only 32 per
dump batch even when the socket buffer and memory allocations allow
is inefficient.

With this change, the user will get as many as possibly fitting
within the given constraints available to the kernel.

A new top level TLV space is introduced. An attribute
TCAA_ACT_FLAGS is used to carry the flags indicating the user
is capable of processing these large dumps. Older user space which
doesnt set this flag doesnt get the large (than 32) batches.
The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
actions are put in a single batch. As such user space app knows how long
to iterate (independent of the type of action being dumped)
instead of hardcoded maximum of 32.

Some results dumping 1.5M actions, first unpatched tc which the
kernel doesnt help:

prompt$ time -p tc actions ls action gact | grep index | wc -l
1500000
real 1388.43
user 2.07
sys 1386.79

Now lets see a patched tc which sets the correct flags when requesting
a dump:

prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
1500000
real 178.13
user 2.02
sys 176.96

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
 net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
 3 files changed, 53 insertions(+), 12 deletions(-)

Comments

Jiri Pirko April 19, 2017, 12:36 p.m. UTC | #1
Wed, Apr 19, 2017 at 01:57:29PM CEST, jhs@mojatatu.com wrote:
>From: Jamal Hadi Salim <jhs@mojatatu.com>
>
>When you dump hundreds of thousands of actions, getting only 32 per
>dump batch even when the socket buffer and memory allocations allow
>is inefficient.
>
>With this change, the user will get as many as possibly fitting
>within the given constraints available to the kernel.
>
>A new top level TLV space is introduced. An attribute
>TCAA_ACT_FLAGS is used to carry the flags indicating the user
>is capable of processing these large dumps. Older user space which
>doesnt set this flag doesnt get the large (than 32) batches.
>The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
>actions are put in a single batch. As such user space app knows how long
>to iterate (independent of the type of action being dumped)
>instead of hardcoded maximum of 32.
>
>Some results dumping 1.5M actions, first unpatched tc which the
>kernel doesnt help:
>
>prompt$ time -p tc actions ls action gact | grep index | wc -l
>1500000
>real 1388.43
>user 2.07
>sys 1386.79
>
>Now lets see a patched tc which sets the correct flags when requesting
>a dump:
>
>prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
>1500000
>real 178.13
>user 2.02
>sys 176.96
>
>Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>---
> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
> net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
> 3 files changed, 53 insertions(+), 12 deletions(-)
>
>diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>index cce0613..c7080ec 100644
>--- a/include/uapi/linux/rtnetlink.h
>+++ b/include/uapi/linux/rtnetlink.h
>@@ -674,10 +674,27 @@ struct tcamsg {
> 	unsigned char	tca__pad1;
> 	unsigned short	tca__pad2;
> };
>+
>+enum {
>+	TCAA_UNSPEC,
>+	TCAA_ACT_TAB,
>+	TCAA_ACT_FLAGS,
>+	TCAA_ACT_COUNT,
>+	__TCAA_MAX
>+};
>+
>+#define TCAA_MAX (__TCAA_MAX - 1)
> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>-#define TCAA_MAX 1
>+#define TCA_ACT_TAB TCAA_ACT_TAB

This is mess. What does "TCAA" stand for?
I suggest some more meaningful naming of the enum items and define
TCA_ACT_TAB and TCAA_MAX to the new values in order to maintain UAPI
Also, please put X_MAX = __X_MAX - 1 into enum


>+/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
>+ *
>+ * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>+ * actions in a dump. All dump responses will contain the number of actions
>+ * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
>+ *
>+ */
>+#define ACT_LARGE_DUMP_ON		(1 << 0) 

Use "BIT(0)"

Also use the same prefix as for the enum.

+ you can have each potential flag as a separate u8 attribute. That is the
clearest approach and easily extendable. That's how we do it in devlink
for example.


> 
> /* New extended info filters for IFLA_EXT_MASK */
> #define RTEXT_FILTER_VF		(1 << 0)
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index 82b1d48..45e1cf7 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> 			   struct netlink_callback *cb)
> {
> 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>+	unsigned short act_flags = cb->args[2];
> 	struct nlattr *nest;
> 
> 	spin_lock_bh(&hinfo->lock);
>@@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> 			}
> 			nla_nest_end(skb, nest);
> 			n_i++;
>-			if (n_i >= TCA_ACT_MAX_PRIO)
>+			if (!(act_flags & ACT_LARGE_DUMP_ON) &&
>+			    n_i >= TCA_ACT_MAX_PRIO)
> 				goto done;
> 		}
> 	}
> done:
> 	spin_unlock_bh(&hinfo->lock);
>-	if (n_i)
>+	if (n_i) {
> 		cb->args[0] += n_i;
>+		if (act_flags & ACT_LARGE_DUMP_ON)
>+			cb->args[1] = n_i;
>+	}
> 	return n_i;
> 
> nla_put_failure:
>@@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
> 	return tcf_add_notify(net, n, &actions, portid);
> }
> 
>+static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = {
>+	[TCAA_ACT_FLAGS]      = { .type = NLA_U32 },
>+};
>+
> static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 			 struct netlink_ext_ack *extack)
> {
> 	struct net *net = sock_net(skb->sk);
>-	struct nlattr *tca[TCA_ACT_MAX + 1];
>+	struct nlattr *tca[TCAA_MAX + 1];

This is certainly wrong.


> 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
> 	int ret = 0, ovr = 0;
> 
>@@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 	    !netlink_capable(skb, CAP_NET_ADMIN))
> 		return -EPERM;
> 
>-	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
>+	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,

This is certainly wrong.


> 			  extack);
> 	if (ret < 0)
> 		return ret;
>@@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
> 	return ret;
> }
> 
>-static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>+static struct nlattr *find_dump_kind(struct nlattr **nla)
> {
> 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
> 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
>-	struct nlattr *nla[TCAA_MAX + 1];
> 	struct nlattr *kind;
> 
>-	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
>-			NULL, NULL) < 0)
>-		return NULL;
> 	tb1 = nla[TCA_ACT_TAB];
> 	if (tb1 == NULL)
> 		return NULL;
>@@ -1081,9 +1086,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	struct nlattr *nest;
> 	struct tc_action_ops *a_o;
> 	int ret = 0;
>+	struct nlattr *tcaa[TCAA_MAX + 1];
> 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
>-	struct nlattr *kind = find_dump_kind(cb->nlh);
>+	struct nlattr *kind = NULL;
>+	u32 act_flags = 0;
>+
>+	ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tcaa, TCAA_MAX,
>+			  tcaa_policy, NULL);
>+	if (ret < 0)
>+		return ret;
>+
> 
>+	kind = find_dump_kind(tcaa);
> 	if (kind == NULL) {
> 		pr_info("tc_dump_action: action bad kind\n");
> 		return 0;
>@@ -1093,10 +1107,16 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	if (a_o == NULL)
> 		return 0;
> 
>+	if (tcaa[TCAA_ACT_FLAGS])
>+		act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);
>+
> 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> 			cb->nlh->nlmsg_type, sizeof(*t), 0);
> 	if (!nlh)
> 		goto out_module_put;
>+
>+	cb->args[2] = act_flags;
>+
> 	t = nlmsg_data(nlh);
> 	t->tca_family = AF_UNSPEC;
> 	t->tca__pad1 = 0;
>@@ -1113,6 +1133,9 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	if (ret > 0) {
> 		nla_nest_end(skb, nest);
> 		ret = skb->len;
>+		if (nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]))
>+			goto out_module_put;
>+		cb->args[1] = 0;

Why you need to zero this?


> 	} else
> 		nlmsg_trim(skb, b);
> 
>-- 
>1.9.1
>
Roman Mashak April 19, 2017, 12:55 p.m. UTC | #2
Jiri Pirko <jiri@resnulli.us> writes:


[...]

>>+enum {
>>+	TCAA_UNSPEC,
>>+	TCAA_ACT_TAB,
>>+	TCAA_ACT_FLAGS,
>>+	TCAA_ACT_COUNT,
>>+	__TCAA_MAX
>>+};
>>+
>>+#define TCAA_MAX (__TCAA_MAX - 1)
>> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>>-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>>-#define TCAA_MAX 1
>>+#define TCA_ACT_TAB TCAA_ACT_TAB
>
> This is mess. What does "TCAA" stand for?
> I suggest some more meaningful naming of the enum items and define
> TCA_ACT_TAB and TCAA_MAX to the new values in order to maintain UAPI
> Also, please put X_MAX = __X_MAX - 1 into enum
>

Notation observed in tc and unfortunately not consistently maintained is
to have enum with TCA* attributes for instance, followed by define,
outside of the enum, with __X_MAX -1

[...]
Jamal Hadi Salim April 19, 2017, 1:03 p.m. UTC | #3
On 17-04-19 08:36 AM, Jiri Pirko wrote:
> Wed, Apr 19, 2017 at 01:57:29PM CEST, jhs@mojatatu.com wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>

>> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
>> net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
>> 3 files changed, 53 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>> index cce0613..c7080ec 100644
>> --- a/include/uapi/linux/rtnetlink.h
>> +++ b/include/uapi/linux/rtnetlink.h
>> @@ -674,10 +674,27 @@ struct tcamsg {
>> 	unsigned char	tca__pad1;
>> 	unsigned short	tca__pad2;
>> };
>> +
>> +enum {
>> +	TCAA_UNSPEC,
>> +	TCAA_ACT_TAB,
>> +	TCAA_ACT_FLAGS,
>> +	TCAA_ACT_COUNT,
>> +	__TCAA_MAX
>> +};
>> +
>> +#define TCAA_MAX (__TCAA_MAX - 1)
>> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>> -#define TCAA_MAX 1
>> +#define TCA_ACT_TAB TCAA_ACT_TAB
>
> This is mess. What does "TCAA" stand for?

TC Actions Attributes.  What would you call it? I could have
called it TCA_ROOT etc. But maybe a comment to just call it
TC Actions Attributes would be enough?

> I suggest some more meaningful naming of the enum items and define
> TCA_ACT_TAB and TCAA_MAX to the new values in order to maintain UAPI


Thats what the above does (for UAPI) maintainance, no?

> Also, please put X_MAX = __X_MAX - 1 into enum

That is diverting from the norm which defines it outside
of the enum. A good reason could be: You, Jiri, plan to go and
cleanup all the netlink stuff which uses this style.
Or you think we should start a trend which leads us
to a new clean style.

>
>> +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
>> + *
>> + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>> + * actions in a dump. All dump responses will contain the number of actions
>> + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
>> + *
>> + */
>> +#define ACT_LARGE_DUMP_ON		(1 << 0)
>
> Use "BIT(0)"
>

Same question as before.
Are you planning to cleanup the rest of the code which
follows the same style? example, look at this:
         TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),


> Also use the same prefix as for the enum.
>
> + you can have each potential flag as a separate u8 attribute. That is the
> clearest approach and easily extendable. That's how we do it in devlink
> for example.
>

So you are using 8 bits for one flag which requires one bit?
+ the TLV header? Sounds like overkill.
Note: We dont need more than 1 or 2 bits for this case.
Even 32 bits is overkill for what I am doing.
When do i need to extend a single bit representation?

>
>> 	struct net *net = sock_net(skb->sk);
>> -	struct nlattr *tca[TCA_ACT_MAX + 1];
>> +	struct nlattr *tca[TCAA_MAX + 1];
>
> This is certainly wrong.
>

Why is it wrong?

>
>> 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
>> 	int ret = 0, ovr = 0;
>>
>> @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> 	    !netlink_capable(skb, CAP_NET_ADMIN))
>> 		return -EPERM;
>>
>> -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
>> +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
>
> This is certainly wrong.
>

Same question as above.


>> +		if (nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]))
>> +			goto out_module_put;
>> +		cb->args[1] = 0;
>
> Why you need to zero this?
>
>

The count is per submitted message - every time we succesfuly send a msg
to user, we start the recount.

cheers,
jamal
Jiri Pirko April 19, 2017, 1:05 p.m. UTC | #4
Wed, Apr 19, 2017 at 02:55:57PM CEST, mrv@mojatatu.com wrote:
>Jiri Pirko <jiri@resnulli.us> writes:
>
>
>[...]
>
>>>+enum {
>>>+	TCAA_UNSPEC,
>>>+	TCAA_ACT_TAB,
>>>+	TCAA_ACT_FLAGS,
>>>+	TCAA_ACT_COUNT,
>>>+	__TCAA_MAX
>>>+};
>>>+
>>>+#define TCAA_MAX (__TCAA_MAX - 1)
>>> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>>> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>>>-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>>>-#define TCAA_MAX 1
>>>+#define TCA_ACT_TAB TCAA_ACT_TAB
>>
>> This is mess. What does "TCAA" stand for?
>> I suggest some more meaningful naming of the enum items and define
>> TCA_ACT_TAB and TCAA_MAX to the new values in order to maintain UAPI
>> Also, please put X_MAX = __X_MAX - 1 into enum
>>
>
>Notation observed in tc and unfortunately not consistently maintained is
>to have enum with TCA* attributes for instance, followed by define,
>outside of the enum, with __X_MAX -1

I don't have strong opinion on define or in-enum. I like in-enum better.
The rest could be converted later on.
Jiri Pirko April 19, 2017, 1:13 p.m. UTC | #5
Wed, Apr 19, 2017 at 03:03:59PM CEST, jhs@mojatatu.com wrote:
>On 17-04-19 08:36 AM, Jiri Pirko wrote:
>> Wed, Apr 19, 2017 at 01:57:29PM CEST, jhs@mojatatu.com wrote:
>> > From: Jamal Hadi Salim <jhs@mojatatu.com>
>
>> > include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
>> > net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
>> > 3 files changed, 53 insertions(+), 12 deletions(-)
>> > 
>> > diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>> > index cce0613..c7080ec 100644
>> > --- a/include/uapi/linux/rtnetlink.h
>> > +++ b/include/uapi/linux/rtnetlink.h
>> > @@ -674,10 +674,27 @@ struct tcamsg {
>> > 	unsigned char	tca__pad1;
>> > 	unsigned short	tca__pad2;
>> > };
>> > +
>> > +enum {
>> > +	TCAA_UNSPEC,
>> > +	TCAA_ACT_TAB,
>> > +	TCAA_ACT_FLAGS,
>> > +	TCAA_ACT_COUNT,
>> > +	__TCAA_MAX
>> > +};
>> > +
>> > +#define TCAA_MAX (__TCAA_MAX - 1)
>> > #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>> > #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>> > -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>> > -#define TCAA_MAX 1
>> > +#define TCA_ACT_TAB TCAA_ACT_TAB
>> 
>> This is mess. What does "TCAA" stand for?
>
>TC Actions Attributes.  What would you call it? I could have
>called it TCA_ROOT etc. But maybe a comment to just call it
>TC Actions Attributes would be enough?

TCA_DUMP_X

it is only for dumping. Naming it "attribute" seems weird. Same as if
you have: int variable_something;


>
>> I suggest some more meaningful naming of the enum items and define
>> TCA_ACT_TAB and TCAA_MAX to the new values in order to maintain UAPI
>
>
>Thats what the above does (for UAPI) maintainance, no?

It does it for TCA_ACT_TAB. We need to do it for both TCA_ACT_TAB and TCAA_MAX


>
>> Also, please put X_MAX = __X_MAX - 1 into enum
>
>That is diverting from the norm which defines it outside
>of the enum. A good reason could be: You, Jiri, plan to go and
>cleanup all the netlink stuff which uses this style.
>Or you think we should start a trend which leads us
>to a new clean style.

I would start now. I can take of the follow-up patch to change the rest.



>
>> 
>> > +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
>> > + *
>> > + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>> > + * actions in a dump. All dump responses will contain the number of actions
>> > + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
>> > + *
>> > + */
>> > +#define ACT_LARGE_DUMP_ON		(1 << 0)
>> 
>> Use "BIT(0)"
>> 
>
>Same question as before.

Same answer :)


>Are you planning to cleanup the rest of the code which
>follows the same style? example, look at this:
>        TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
>
>
>> Also use the same prefix as for the enum.
>> 
>> + you can have each potential flag as a separate u8 attribute. That is the
>> clearest approach and easily extendable. That's how we do it in devlink
>> for example.
>> 
>
>So you are using 8 bits for one flag which requires one bit?
>+ the TLV header? Sounds like overkill.
>Note: We dont need more than 1 or 2 bits for this case.
>Even 32 bits is overkill for what I am doing.
>When do i need to extend a single bit representation?

I don't see any problem adding couple of bytes if it increases cleannes
and easy extendability.


>
>> 
>> > 	struct net *net = sock_net(skb->sk);
>> > -	struct nlattr *tca[TCA_ACT_MAX + 1];
>> > +	struct nlattr *tca[TCAA_MAX + 1];
>> 
>> This is certainly wrong.
>> 
>
>Why is it wrong?

Because you use existing TCA_ACT_ attr enum.


>
>> 
>> > 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
>> > 	int ret = 0, ovr = 0;
>> > 
>> > @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 	    !netlink_capable(skb, CAP_NET_ADMIN))
>> > 		return -EPERM;
>> > 
>> > -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
>> > +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
>> 
>> This is certainly wrong.
>> 
>
>Same question as above.

Same answer.


>
>
>> > +		if (nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]))
>> > +			goto out_module_put;
>> > +		cb->args[1] = 0;
>> 
>> Why you need to zero this?
>> 
>> 
>
>The count is per submitted message - every time we succesfuly send a msg
>to user, we start the recount.

ok


>
>cheers,
>jamal
>
Jamal Hadi Salim April 19, 2017, 3:37 p.m. UTC | #6
On 17-04-19 09:13 AM, Jiri Pirko wrote:
> Wed, Apr 19, 2017 at 03:03:59PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-19 08:36 AM, Jiri Pirko wrote:
>>> Wed, Apr 19, 2017 at 01:57:29PM CEST, jhs@mojatatu.com wrote:
>>>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>>>> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
>>>> net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
>>>> 3 files changed, 53 insertions(+), 12 deletions(-)

>>>> +#define TCAA_MAX (__TCAA_MAX - 1)
>>>> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>>>> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>>>> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>>>> -#define TCAA_MAX 1
>>>> +#define TCA_ACT_TAB TCAA_ACT_TAB
>>>
>>> This is mess. What does "TCAA" stand for?
>>
>> TC Actions Attributes.  What would you call it? I could have
>> called it TCA_ROOT etc. But maybe a comment to just call it
>> TC Actions Attributes would be enough?
>
> TCA_DUMP_X
>
> it is only for dumping. Naming it "attribute" seems weird. Same as if
> you have: int variable_something;
>

Jiri, this is not just for dumping. We are describing high level
attributes for tc actions.

>
>>
>>> I suggest some more meaningful naming of the enum items and define
>>> TCA_ACT_TAB and TCAA_MAX to the new values in order to maintain UAPI
>>
>>
>> Thats what the above does (for UAPI) maintainance, no?
>
> It does it for TCA_ACT_TAB. We need to do it for both TCA_ACT_TAB and TCAA_MAX
>

TCAA_XXX is the namespace selected. You dont like that name and
adding DUMP doesnt make sense to me. How about TCA_ACT_ROOT?

>>
>>> Also, please put X_MAX = __X_MAX - 1 into enum
>>
>> That is diverting from the norm which defines it outside
>> of the enum. A good reason could be: You, Jiri, plan to go and
>> cleanup all the netlink stuff which uses this style.
>> Or you think we should start a trend which leads us
>> to a new clean style.
>
> I would start now. I can take of the follow-up patch to change the rest.
>

It is a _lot_ of code to change! Note:
This is all the UAPI visible code (the same coding style for 20 years).
I am worried about this part.

>>>
>>>> +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
>>>> + *
>>>> + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>>>> + * actions in a dump. All dump responses will contain the number of actions
>>>> + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
>>>> + *
>>>> + */
>>>> +#define ACT_LARGE_DUMP_ON		(1 << 0)
>>>
>>> Use "BIT(0)"
>>>
>>
>> Same question as before.
>
> Same answer :)
>

I will change this one - it is a lot simpler coding style
wide/wise than the other one.

>> Are you planning to cleanup the rest of the code which

>> So you are using 8 bits for one flag which requires one bit?
>> + the TLV header? Sounds like overkill.
>> Note: We dont need more than 1 or 2 bits for this case.
>> Even 32 bits is overkill for what I am doing.
>> When do i need to extend a single bit representation?
>
> I don't see any problem adding couple of bytes if it increases cleannes
> and easy extendability.
>

How do you extend one bit? Seriously. If i want to add another bit I
will add one more to existing bit map not 64 (T + L + 8bits + pad).
If i ran out of space i will add a new TLV.

>>>
>>>> 	struct net *net = sock_net(skb->sk);
>>>> -	struct nlattr *tca[TCA_ACT_MAX + 1];
>>>> +	struct nlattr *tca[TCAA_MAX + 1];
>>>
>>> This is certainly wrong.
>>>
>>
>> Why is it wrong?
>
> Because you use existing TCA_ACT_ attr enum.
>

Is there a programming mistake or you  just dont like the name?
AFAIK, and based on my testing that code is correct.

>>>> -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
>>>> +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
>>>
>>> This is certainly wrong.
>>>
>>
>> Same question as above.
>
> Same answer.
>

And same question still.

cheers,
jamal
Jiri Pirko April 19, 2017, 3:54 p.m. UTC | #7
Wed, Apr 19, 2017 at 05:37:15PM CEST, jhs@mojatatu.com wrote:
>On 17-04-19 09:13 AM, Jiri Pirko wrote:
>> Wed, Apr 19, 2017 at 03:03:59PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-19 08:36 AM, Jiri Pirko wrote:
>> > > Wed, Apr 19, 2017 at 01:57:29PM CEST, jhs@mojatatu.com wrote:
>> > > > From: Jamal Hadi Salim <jhs@mojatatu.com>
>> > 
>> > > > include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
>> > > > net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
>> > > > 3 files changed, 53 insertions(+), 12 deletions(-)
>
>> > > > +#define TCAA_MAX (__TCAA_MAX - 1)
>> > > > #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>> > > > #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>> > > > -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>> > > > -#define TCAA_MAX 1
>> > > > +#define TCA_ACT_TAB TCAA_ACT_TAB
>> > > 
>> > > This is mess. What does "TCAA" stand for?
>> > 
>> > TC Actions Attributes.  What would you call it? I could have
>> > called it TCA_ROOT etc. But maybe a comment to just call it
>> > TC Actions Attributes would be enough?
>> 
>> TCA_DUMP_X
>> 
>> it is only for dumping. Naming it "attribute" seems weird. Same as if
>> you have: int variable_something;
>> 
>
>Jiri, this is not just for dumping. We are describing high level
>attributes for tc actions.

This is already present:
enum {
        TCA_ACT_UNSPEC,
        TCA_ACT_KIND,
        TCA_ACT_OPTIONS,
        TCA_ACT_INDEX,
        TCA_ACT_STATS,
        TCA_ACT_PAD,
        TCA_ACT_COOKIE,
        __TCA_ACT_MAX
};

This is nested inside the dump message (TCAA_MAX, TCA_ACT_TAB)

Looks like you are mixing these 2.



>
>> 
>> > 
>> > > I suggest some more meaningful naming of the enum items and define
>> > > TCA_ACT_TAB and TCAA_MAX to the new values in order to maintain UAPI
>> > 
>> > 
>> > Thats what the above does (for UAPI) maintainance, no?
>> 
>> It does it for TCA_ACT_TAB. We need to do it for both TCA_ACT_TAB and TCAA_MAX
>> 
>
>TCAA_XXX is the namespace selected. You dont like that name and
>adding DUMP doesnt make sense to me. How about TCA_ACT_ROOT?
>
>> > 
>> > > Also, please put X_MAX = __X_MAX - 1 into enum
>> > 
>> > That is diverting from the norm which defines it outside
>> > of the enum. A good reason could be: You, Jiri, plan to go and
>> > cleanup all the netlink stuff which uses this style.
>> > Or you think we should start a trend which leads us
>> > to a new clean style.
>> 
>> I would start now. I can take of the follow-up patch to change the rest.
>> 
>
>It is a _lot_ of code to change! Note:
>This is all the UAPI visible code (the same coding style for 20 years).
>I am worried about this part.

We'll see. Lets do it in a sensitive way, in steps. But for new things,
I think it is good not to stick with old and outlived habits.


>
>> > > 
>> > > > +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
>> > > > + *
>> > > > + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>> > > > + * actions in a dump. All dump responses will contain the number of actions
>> > > > + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
>> > > > + *
>> > > > + */
>> > > > +#define ACT_LARGE_DUMP_ON		(1 << 0)
>> > > 
>> > > Use "BIT(0)"
>> > > 
>> > 
>> > Same question as before.
>> 
>> Same answer :)
>> 
>
>I will change this one - it is a lot simpler coding style
>wide/wise than the other one.
>
>> > Are you planning to cleanup the rest of the code which
>
>> > So you are using 8 bits for one flag which requires one bit?
>> > + the TLV header? Sounds like overkill.
>> > Note: We dont need more than 1 or 2 bits for this case.
>> > Even 32 bits is overkill for what I am doing.
>> > When do i need to extend a single bit representation?
>> 
>> I don't see any problem adding couple of bytes if it increases cleannes
>> and easy extendability.
>> 
>
>How do you extend one bit? Seriously. If i want to add another bit I
>will add one more to existing bit map not 64 (T + L + 8bits + pad).
>If i ran out of space i will add a new TLV.

Netlink is TLV, should be used as TLV. I don't see how you can run out
any space. You tend to use Netlink in some weird hybrid mode, with only
argument being space. I think that couple of bytes wasted is not
a problem at all...


>
>> > > 
>> > > > 	struct net *net = sock_net(skb->sk);
>> > > > -	struct nlattr *tca[TCA_ACT_MAX + 1];
>> > > > +	struct nlattr *tca[TCAA_MAX + 1];
>> > > 
>> > > This is certainly wrong.
>> > > 
>> > 
>> > Why is it wrong?
>> 
>> Because you use existing TCA_ACT_ attr enum.
>> 
>
>Is there a programming mistake or you  just dont like the name?
>AFAIK, and based on my testing that code is correct.

See my first comment. I think that you mix 2 things together.


>
>> > > > -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
>> > > > +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
>> > > 
>> > > This is certainly wrong.
>> > > 
>> > 
>> > Same question as above.
>> 
>> Same answer.
>> 
>
>And same question still.
>
>cheers,
>jamal
Jamal Hadi Salim April 19, 2017, 4:07 p.m. UTC | #8
On 17-04-19 11:54 AM, Jiri Pirko wrote:
> Wed, Apr 19, 2017 at 05:37:15PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-19 09:13 AM, Jiri Pirko wrote:
>>> Wed, Apr 19, 2017 at 03:03:59PM CEST, jhs@mojatatu.com wrote:
>>>> On 17-04-19 08:36 AM, Jiri Pirko wrote:
>>>>> Wed, Apr 19, 2017 at 01:57:29PM CEST, jhs@mojatatu.com wrote:
>>>>>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>>>
>>>>>> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
>>>>>> net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
>>>>>> 3 files changed, 53 insertions(+), 12 deletions(-)
>>
>>>>>> +#define TCAA_MAX (__TCAA_MAX - 1)
>>>>>> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>>>>>> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>>>>>> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>>>>>> -#define TCAA_MAX 1
>>>>>> +#define TCA_ACT_TAB TCAA_ACT_TAB
>>>>>
>>>>> This is mess. What does "TCAA" stand for?
>>>>
>>>> TC Actions Attributes.  What would you call it? I could have
>>>> called it TCA_ROOT etc. But maybe a comment to just call it
>>>> TC Actions Attributes would be enough?
>>>
>>> TCA_DUMP_X
>>>
>>> it is only for dumping. Naming it "attribute" seems weird. Same as if
>>> you have: int variable_something;
>>>
>>
>> Jiri, this is not just for dumping. We are describing high level
>> attributes for tc actions.
>
> This is already present:
> enum {
>         TCA_ACT_UNSPEC,
>         TCA_ACT_KIND,
>         TCA_ACT_OPTIONS,
>         TCA_ACT_INDEX,
>         TCA_ACT_STATS,
>         TCA_ACT_PAD,
>         TCA_ACT_COOKIE,
>         __TCA_ACT_MAX
> };
>
> This is nested inside the dump message (TCAA_MAX, TCA_ACT_TAB)
>
> Looks like you are mixing these 2.
>

No - That space is per action. The space i am defining is
above that in the the hierarchy. There used to be only
one attribute there (TCA_ACT_TAB) and now we are making
it more generic.

>
>

>> It is a _lot_ of code to change! Note:
>> This is all the UAPI visible code (the same coding style for 20 years).
>> I am worried about this part.
>
> We'll see. Lets do it in a sensitive way, in steps. But for new things,
> I think it is good not to stick with old and outlived habits.
>
>

I know you have the muscle to get it done - so fine, i will start
with this one change.

>
> Netlink is TLV, should be used as TLV. I don't see how you can run out
> any space. You tend to use Netlink in some weird hybrid mode, with only
> argument being space. I think that couple of bytes wasted is not
> a problem at all...
>

You are not making sense to me still.
What you describe as "a couple of bytes" adds up when you have
a large number of objects. I am trying to cut down on data back
and forth from user/kernel and a bitmap is a well understood entity.

Even if i did use a TLV - when i am representing flags which require one
bit each - it makes sense to use a bitmask encapsulated in a TLV. Not
to waste a TLV per bit.

cheers,
jamal
Jiri Pirko April 19, 2017, 4:17 p.m. UTC | #9
Wed, Apr 19, 2017 at 06:07:48PM CEST, jhs@mojatatu.com wrote:
>On 17-04-19 11:54 AM, Jiri Pirko wrote:
>> Wed, Apr 19, 2017 at 05:37:15PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-19 09:13 AM, Jiri Pirko wrote:
>> > > Wed, Apr 19, 2017 at 03:03:59PM CEST, jhs@mojatatu.com wrote:
>> > > > On 17-04-19 08:36 AM, Jiri Pirko wrote:
>> > > > > Wed, Apr 19, 2017 at 01:57:29PM CEST, jhs@mojatatu.com wrote:
>> > > > > > From: Jamal Hadi Salim <jhs@mojatatu.com>
>> > > > 
>> > > > > > include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
>> > > > > > net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
>> > > > > > 3 files changed, 53 insertions(+), 12 deletions(-)
>> > 
>> > > > > > +#define TCAA_MAX (__TCAA_MAX - 1)
>> > > > > > #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>> > > > > > #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>> > > > > > -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>> > > > > > -#define TCAA_MAX 1
>> > > > > > +#define TCA_ACT_TAB TCAA_ACT_TAB
>> > > > > 
>> > > > > This is mess. What does "TCAA" stand for?
>> > > > 
>> > > > TC Actions Attributes.  What would you call it? I could have
>> > > > called it TCA_ROOT etc. But maybe a comment to just call it
>> > > > TC Actions Attributes would be enough?
>> > > 
>> > > TCA_DUMP_X
>> > > 
>> > > it is only for dumping. Naming it "attribute" seems weird. Same as if
>> > > you have: int variable_something;
>> > > 
>> > 
>> > Jiri, this is not just for dumping. We are describing high level
>> > attributes for tc actions.
>> 
>> This is already present:
>> enum {
>>         TCA_ACT_UNSPEC,
>>         TCA_ACT_KIND,
>>         TCA_ACT_OPTIONS,
>>         TCA_ACT_INDEX,
>>         TCA_ACT_STATS,
>>         TCA_ACT_PAD,
>>         TCA_ACT_COOKIE,
>>         __TCA_ACT_MAX
>> };
>> 
>> This is nested inside the dump message (TCAA_MAX, TCA_ACT_TAB)
>> 
>> Looks like you are mixing these 2.
>> 
>
>No - That space is per action. The space i am defining is
>above that in the the hierarchy. There used to be only
>one attribute there (TCA_ACT_TAB) and now we are making
>it more generic.

Right. That's what I say. And that higher level is used only for
dumping. That's why I suggested TCA_ACT_DUMP prefix.


>
>> 
>> 
>
>> > It is a _lot_ of code to change! Note:
>> > This is all the UAPI visible code (the same coding style for 20 years).
>> > I am worried about this part.
>> 
>> We'll see. Lets do it in a sensitive way, in steps. But for new things,
>> I think it is good not to stick with old and outlived habits.
>> 
>> 
>
>I know you have the muscle to get it done - so fine, i will start
>with this one change.
>
>> 
>> Netlink is TLV, should be used as TLV. I don't see how you can run out
>> any space. You tend to use Netlink in some weird hybrid mode, with only
>> argument being space. I think that couple of bytes wasted is not
>> a problem at all...
>> 
>
>You are not making sense to me still.
>What you describe as "a couple of bytes" adds up when you have
>a large number of objects. I am trying to cut down on data back
>and forth from user/kernel and a bitmap is a well understood entity.

The attributes in question are per-message, not per-object


>
>Even if i did use a TLV - when i am representing flags which require one
>bit each - it makes sense to use a bitmask encapsulated in a TLV. Not
>to waste a TLV per bit.

That is the only correct way. For example, in future the kernel may
report in extended ack that it does not support some specific attribute
user passed. If you pack it all in one attr, that would not be possible.
Also, what prevents user from passing random data on bit flag positions
that you are not using? Current kernel would ignore it. Next kernel will
do something different according to the flag bit. That is UAPI break.
Essentialy the same thing what DaveM said about the struct. You have to
define this completely, at the beginning, not possible to use the unused
bits for other things in the future.
Jamal Hadi Salim April 20, 2017, 10:42 a.m. UTC | #10
On 17-04-19 12:17 PM, Jiri Pirko wrote:
> Wed, Apr 19, 2017 at 06:07:48PM CEST, jhs@mojatatu.com wrote:
>> On 17-04-19 11:54 AM, Jiri Pirko wrote:
>>> Wed, Apr 19, 2017 at 05:37:15PM CEST, jhs@mojatatu.com wrote:
>>>> On 17-04-19 09:13 AM, Jiri Pirko wrote:
>>>>> Wed, Apr 19, 2017 at 03:03:59PM CEST, jhs@mojatatu.com wrote:
>>>>>> On 17-04-19 08:36 AM, Jiri Pirko wrote:
>>>>>>> Wed, Apr 19, 2017 at 01:57:29PM CEST, jhs@mojatatu.com wrote:
>>>>>>>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>>>>>
>>>>>>>> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
>>>>>>>> net/sched/act_api.c            | 43 ++++++++++++++++++++++++++++++++----------
>>>>>>>> 3 files changed, 53 insertions(+), 12 deletions(-)
>>>>
>>>>>>>> +#define TCAA_MAX (__TCAA_MAX - 1)
>>>>>>>> #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
>>>>>>>> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
>>>>>>>> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
>>>>>>>> -#define TCAA_MAX 1
>>>>>>>> +#define TCA_ACT_TAB TCAA_ACT_TAB
>>>>>>>
>>>>>>> This is mess. What does "TCAA" stand for?
>>>>>>
>>>>>> TC Actions Attributes.  What would you call it? I could have
>>>>>> called it TCA_ROOT etc. But maybe a comment to just call it
>>>>>> TC Actions Attributes would be enough?
>>>>>
>>>>> TCA_DUMP_X
>>>>>
>>>>> it is only for dumping. Naming it "attribute" seems weird. Same as if
>>>>> you have: int variable_something;
>>>>>
>>>>
>>>> Jiri, this is not just for dumping. We are describing high level
>>>> attributes for tc actions.
>>>
>>> This is already present:
>>> enum {
>>>         TCA_ACT_UNSPEC,
>>>         TCA_ACT_KIND,
>>>         TCA_ACT_OPTIONS,
>>>         TCA_ACT_INDEX,
>>>         TCA_ACT_STATS,
>>>         TCA_ACT_PAD,
>>>         TCA_ACT_COOKIE,
>>>         __TCA_ACT_MAX
>>> };
>>>
>>> This is nested inside the dump message (TCAA_MAX, TCA_ACT_TAB)
>>>
>>> Looks like you are mixing these 2.
>>>
>>
>> No - That space is per action. The space i am defining is
>> above that in the the hierarchy. There used to be only
>> one attribute there (TCA_ACT_TAB) and now we are making
>> it more generic.
>
> Right. That's what I say. And that higher level is used only for
> dumping. That's why I suggested TCA_ACT_DUMP prefix.
>

DUMP is not right. _TAB is used for SET/DEL as well.
why dont we leave this alone so we can progress?
You can submit changes later if it bothers you still.

>
>>
>>>
>>>
>>
>>>> It is a _lot_ of code to change! Note:
>>>> This is all the UAPI visible code (the same coding style for 20 years).
>>>> I am worried about this part.
>>>
>>> We'll see. Lets do it in a sensitive way, in steps. But for new things,
>>> I think it is good not to stick with old and outlived habits.
>>>
>>>
>>
>> I know you have the muscle to get it done - so fine, i will start
>> with this one change.
>>
>>>
>>> Netlink is TLV, should be used as TLV. I don't see how you can run out
>>> any space. You tend to use Netlink in some weird hybrid mode, with only
>>> argument being space. I think that couple of bytes wasted is not
>>> a problem at all...
>>>
>>
>> You are not making sense to me still.
>> What you describe as "a couple of bytes" adds up when you have
>> a large number of objects. I am trying to cut down on data back
>> and forth from user/kernel and a bitmap is a well understood entity.
>
> The attributes in question are per-message, not per-object
>
>
>>
>> Even if i did use a TLV - when i am representing flags which require one
>> bit each - it makes sense to use a bitmask encapsulated in a TLV. Not
>> to waste a TLV per bit.
>
> That is the only correct way. For example, in future the kernel may
> report in extended ack that it does not support some specific attribute
> user passed. If you pack it all in one attr, that would not be possible.
> Also, what prevents user from passing random data on bit flag positions
> that you are not using? Current kernel would ignore it. Next kernel will
> do something different according to the flag bit. That is UAPI break.
> Essentialy the same thing what DaveM said about the struct. You have to
> define this completely, at the beginning, not possible to use the unused
> bits for other things in the future.
>


They are not the same issue Jiri. We have used bitmasks fine on netlink
message for a millenia. Nobody sets garbage on a bitmask they are not
supposed to touch. The struct padding thing is a shame the way it
turned out - now netlink can no longer have a claim to be a (good)
wire protocol.
Other thing: I know you feel strongly about this but I dont agree that
everything has to be a TLV and in no way, as a matter of principle, you 
are going to convince me  (even when the cows come home) that I have to
use 64 bits to carry a single bit just so I can use TLVs.

cheers,
jamal
Eric Dumazet April 20, 2017, 12:18 p.m. UTC | #11
On Thu, 2017-04-20 at 06:42 -0400, Jamal Hadi Salim wrote:

> 
> They are not the same issue Jiri. We have used bitmasks fine on netlink
> message for a millenia. Nobody sets garbage on a bitmask they are not
> supposed to touch. The struct padding thing is a shame the way it
> turned out - now netlink can no longer have a claim to be a (good)
> wire protocol.

Except that users wrote programs, and these programs work today.

By changing the kernel and recognizing new flags in existing padding,
you might break the programs.

This is not acceptable. Period.

Had we checked the padding being 0 in old kernels, this change would
have been possible today.

But because old kernels did not care of the padding contents, then there
is no way new kernel can suddenly trust them at all.

Please Jamal, you have to forget this nonsense.
Jamal Hadi Salim April 20, 2017, 1:27 p.m. UTC | #12
On 17-04-20 08:18 AM, Eric Dumazet wrote:
> On Thu, 2017-04-20 at 06:42 -0400, Jamal Hadi Salim wrote:
>
>>
>> They are not the same issue Jiri. We have used bitmasks fine on netlink
>> message for a millenia. Nobody sets garbage on a bitmask they are not
>> supposed to touch. The struct padding thing is a shame the way it
>> turned out - now netlink can no longer have a claim to be a (good)
>> wire protocol.
>
> Except that users wrote programs, and these programs work today.
>
> By changing the kernel and recognizing new flags in existing padding,
> you might break the programs.
>
> This is not acceptable. Period.
>
> Had we checked the padding being 0 in old kernels, this change would
> have been possible today.
>
> But because old kernels did not care of the padding contents, then there
> is no way new kernel can suddenly trust them at all.
>
> Please Jamal, you have to forget this nonsense.

That is fine. We can rule out netlink ever being able to work
across machines. That was the dream in the past. Lets close that
discussion.

The issue Jiri is bringing up is unrelated. He is talking about
a bitmap and conflating it with a data structure. They are not
the same issue.

cheers,
jamal

>
David Miller April 20, 2017, 3:23 p.m. UTC | #13
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 20 Apr 2017 05:18:14 -0700

> By changing the kernel and recognizing new flags in existing padding,
> you might break the programs.
> 
> This is not acceptable. Period.
> 
> Had we checked the padding being 0 in old kernels, this change would
> have been possible today.

I completely agree.
David Miller April 20, 2017, 3:50 p.m. UTC | #14
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Thu, 20 Apr 2017 09:27:00 -0400

> The issue Jiri is bringing up is unrelated. He is talking about
> a bitmap and conflating it with a data structure. They are not
> the same issue.

Bitmaps can have the same exact problem as padding if we didn't code
it correctly.

The issue is _purely_, "did we check unused 'fields' and enforce them
to be a certain value"

If not, we lose, and can't use those "fields" in the future.

This rule applies whether you are speaking about padding or a bitmask.
Jamal Hadi Salim April 20, 2017, 5:38 p.m. UTC | #15
On 17-04-20 11:50 AM, David Miller wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
> Date: Thu, 20 Apr 2017 09:27:00 -0400
>
>> The issue Jiri is bringing up is unrelated. He is talking about
>> a bitmap and conflating it with a data structure. They are not
>> the same issue.
>
> Bitmaps can have the same exact problem as padding if we didn't code
> it correctly.
>
> The issue is _purely_, "did we check unused 'fields' and enforce them
> to be a certain value"
>
> If not, we lose, and can't use those "fields" in the future.
>
> This rule applies whether you are speaking about padding or a bitmask.
>

There are no examples of such issues with bitmasks encapsulated in TLVs
that exist.
I grep iproute2 code and there are tons of example of bitmask flags
being sent in TLVs. They all start with:

u64/32/16 mybitflags = 0;

if i want foo then
        mybitflags |= BRIDGE_FLAGS_SELF;
if i want bar then
        mybitflags |= xxxx

addattr16/32/64(&req.n, sizeof(req), ATTR_XXX, mybitflags);

It does not make much sense to have a TLV for each of these
bits when i can fit a bunch of them in u16/32/64.

cheers,
jamal
David Miller April 20, 2017, 5:58 p.m. UTC | #16
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Thu, 20 Apr 2017 13:38:14 -0400

> On 17-04-20 11:50 AM, David Miller wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>> Date: Thu, 20 Apr 2017 09:27:00 -0400
>>
>>> The issue Jiri is bringing up is unrelated. He is talking about
>>> a bitmap and conflating it with a data structure. They are not
>>> the same issue.
>>
>> Bitmaps can have the same exact problem as padding if we didn't code
>> it correctly.
>>
>> The issue is _purely_, "did we check unused 'fields' and enforce them
>> to be a certain value"
>>
>> If not, we lose, and can't use those "fields" in the future.
>>
>> This rule applies whether you are speaking about padding or a bitmask.
>>
> 
> There are no examples of such issues with bitmasks encapsulated in
> TLVs
> that exist.
> I grep iproute2 code and there are tons of example of bitmask flags
> being sent in TLVs. They all start with:
> 
> u64/32/16 mybitflags = 0;
> 
> if i want foo then
>        mybitflags |= BRIDGE_FLAGS_SELF;
> if i want bar then
>        mybitflags |= xxxx
> 
> addattr16/32/64(&req.n, sizeof(req), ATTR_XXX, mybitflags);
> 
> It does not make much sense to have a TLV for each of these
> bits when i can fit a bunch of them in u16/32/64.

I have not ruled out bitmasks.  I'm only saying that the kernel must
properly reject bits it doesn't recognize when they are set.

Each bit must have a strict semantic, even unused ones, otherwise
unused ones may never safely be used in the future.
Jamal Hadi Salim April 21, 2017, 10:36 a.m. UTC | #17
On 17-04-20 01:58 PM, David Miller wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
> Date: Thu, 20 Apr 2017 13:38:14 -0400
>

>> There are no examples of such issues with bitmasks encapsulated in
>> TLVs

>> It does not make much sense to have a TLV for each of these
>> bits when i can fit a bunch of them in u16/32/64.
>
> I have not ruled out bitmasks.  I'm only saying that the kernel must
> properly reject bits it doesn't recognize when they are set.
>

It is the other way round from what i see: It ignores them.
This allows new bits to be added over time.
Note: It is a bug - which must be fixed - if user space sets
something the kernel doesnt want it to set. Even then, the only good
use case i can think of for something like this is the kernel
is exposing something to user space for read-only and user space
is being silly and setting read-only bits on requests to the kernel.
But even that is not a catastrophic issue; kernel should just ignore it.

> Each bit must have a strict semantic, even unused ones, otherwise
> unused ones may never safely be used in the future.
>

I think we are pretty good at this.
It would be interesting to have a fuzzer which sets random bits on a
TLV bitmask and see what bugs show up.

cheers,
jamal
David Miller April 21, 2017, 2:51 p.m. UTC | #18
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 21 Apr 2017 06:36:19 -0400

> On 17-04-20 01:58 PM, David Miller wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>> Date: Thu, 20 Apr 2017 13:38:14 -0400
>>
> 
>>> There are no examples of such issues with bitmasks encapsulated in
>>> TLVs
> 
>>> It does not make much sense to have a TLV for each of these
>>> bits when i can fit a bunch of them in u16/32/64.
>>
>> I have not ruled out bitmasks.  I'm only saying that the kernel must
>> properly reject bits it doesn't recognize when they are set.
>>
> 
> It is the other way round from what i see: It ignores them.

Which means we can never use them for anything else reliably,
there could be random crap in there.

> This allows new bits to be added over time.

No, ignoring them actually means we cannot add new bits.

> Note: It is a bug - which must be fixed - if user space sets
> something the kernel doesnt want it to set. Even then, the only good
> use case i can think of for something like this is the kernel
> is exposing something to user space for read-only and user space
> is being silly and setting read-only bits on requests to the kernel.
> But even that is not a catastrophic issue; kernel should just ignore
> it.

But since we didn't check and enforce, we can't use the bits for
settings however we like.

That's the entire point.

We can _never_ go back later and say "oops, add the checks now, it's
all good" because that doesn't work at all.
Jamal Hadi Salim April 21, 2017, 3:29 p.m. UTC | #19
On 17-04-21 10:51 AM, David Miller wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
> Date: Fri, 21 Apr 2017 06:36:19 -0400
>
>> On 17-04-20 01:58 PM, David Miller wrote:
>>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>> Date: Thu, 20 Apr 2017 13:38:14 -0400
>>>
>>

>
> Which means we can never use them for anything else reliably,
> there could be random crap in there.
>

Today: User space set them to zero. Receivers in the kernel
only look at what they are interested in. I stopped checking after a
while - but everything i looked at in iproute2 worked
like this.

>> This allows new bits to be added over time.
>
> No, ignoring them actually means we cannot add new bits.
>

Old kernels ignore them. New kernels look at the new ones.
We'll be in a lot of trouble if this was not the case
for things today;-> People add bits all the time in TLVs
and in netlink headers that are labeled as flags.

>> Note: It is a bug - which must be fixed - if user space sets
>> something the kernel doesnt want it to set. Even then, the only good
>> use case i can think of for something like this is the kernel
>> is exposing something to user space for read-only and user space
>> is being silly and setting read-only bits on requests to the kernel.
>> But even that is not a catastrophic issue; kernel should just ignore
>> it.
>
> But since we didn't check and enforce, we can't use the bits for
> settings however we like.
>
> That's the entire point.
>
> We can _never_ go back later and say "oops, add the checks now, it's
> all good" because that doesn't work at all.
>

Dave, I dont think you are suggesting we should use a TLV for every bit
we want to  send to the kernel (as Jiri is), are you?

I think you as suggesting we should from now on enforce a rule that
in the kernel we start checking that bits in a bitmap received for
things we are not interested in. So if a bit i dont understand shows
up in the kernel what should i do?
Rejecting the transaction because i received something i dont
understand is not conducive to forward compatibility. Maybe logging
it would be useful.
If i dont get a bit i am expecting (old user space), then for sure
rejecting sounds reasonable.

cheers,
jamal
David Miller April 21, 2017, 3:38 p.m. UTC | #20
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 21 Apr 2017 11:29:19 -0400

> On 17-04-21 10:51 AM, David Miller wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>> Date: Fri, 21 Apr 2017 06:36:19 -0400
>>
>>> On 17-04-20 01:58 PM, David Miller wrote:
>>>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>>> Date: Thu, 20 Apr 2017 13:38:14 -0400
>>>>
>>>
> 
>>
>> Which means we can never use them for anything else reliably,
>> there could be random crap in there.
>>
> 
> Today: User space set them to zero.

You don't know this because the kernel has never verified it.
Jamal, you cannot walk past this important point, nothing can
be argued further because of it.

> Old kernels ignore them. New kernels look at the new ones.
> We'll be in a lot of trouble if this was not the case
> for things today;-> People add bits all the time in TLVs
> and in netlink headers that are labeled as flags.

And when we do things that way it's broken, and why we have such
crappy behavior.

We made a very bad decision a long time ago to ignore unrecognized
things in netlink and it was a grave error which we must start
correcting now.

If a user says "enable X" and it just gets simply ignored by older
kernels, that can't work properly.  What if "enable X" is something
like "turn on encryption"?  Are you OK with the user getting no
feedback that their stuff is not going to be encrypted?

Even something as benign as "give melarger action dumps" _must_ still
have the same behavior because the user has no alternative action plan
possible if it cannot tell if the kernel supports the facility or not.

> Dave, I dont think you are suggesting we should use a TLV for every
> bit
> we want to  send to the kernel (as Jiri is), are you?

Jiri is not suggesting this, he is instead saying if you want to
support more bits in the future then you must check that the unused
bits are zero _now_ so that we can prove that userland clears them
properly.

And if you don't have any direct plans for more bits in the future,
use just a single attribute with the smallest integer type possible.

> I think you as suggesting we should from now on enforce a rule that
> in the kernel we start checking that bits in a bitmap received for
> things we are not interested in. So if a bit i dont understand shows
> up in the kernel what should i do?

Reject it.

> Rejecting the transaction because i received something i dont
> understand is not conducive to forward compatibility.

Not rejecting it breaks everything and gives the user no feedback
or way whatsoever to know whether the kernel supports something or
not.

I'm not letting us continue to do things so stupidly any more.

I want future applications to know if they are running on an older
kernel and that a specific netlink feature is not supported.

Ignoring not-understood bits prevents that and is the single most
fundamental mistake we've made in netlink.
Jamal Hadi Salim April 21, 2017, 3:55 p.m. UTC | #21
On 17-04-21 11:38 AM, David Miller wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
> Date: Fri, 21 Apr 2017 11:29:19 -0400
>

>
> You don't know this because the kernel has never verified it.
> Jamal, you cannot walk past this important point, nothing can
> be argued further because of it.
>
>> Old kernels ignore them. New kernels look at the new ones.
>> We'll be in a lot of trouble if this was not the case
>> for things today;-> People add bits all the time in TLVs
>> and in netlink headers that are labeled as flags.
>
> And when we do things that way it's broken, and why we have such
> crappy behavior.
>
> We made a very bad decision a long time ago to ignore unrecognized
> things in netlink and it was a grave error which we must start
> correcting now.
>

Dave, that is a different argument which i can appreciate.

> If a user says "enable X" and it just gets simply ignored by older
> kernels, that can't work properly.  What if "enable X" is something
> like "turn on encryption"?  Are you OK with the user getting no
> feedback that their stuff is not going to be encrypted?
>

For this specific use case:
Dont they need a newer kernel which supports "enable encryption"?

> Even something as benign as "give melarger action dumps" _must_ still
> have the same behavior because the user has no alternative action plan
> possible if it cannot tell if the kernel supports the facility or not.
>

So you are seeing this as feature discovery as well then.

>> Dave, I dont think you are suggesting we should use a TLV for every
>> bit
>> we want to  send to the kernel (as Jiri is), are you?
>
> Jiri is not suggesting this, he is instead saying if you want to
> support more bits in the future then you must check that the unused
> bits are zero _now_ so that we can prove that userland clears them
> properly.
>
> And if you don't have any direct plans for more bits in the future,
> use just a single attribute with the smallest integer type possible.
>

Ok, lets move with that premise then in our discussion.

>> I think you as suggesting we should from now on enforce a rule that
>> in the kernel we start checking that bits in a bitmap received for
>> things we are not interested in. So if a bit i dont understand shows
>> up in the kernel what should i do?
>
> Reject it.
>

It may be case by case basis, no?

I can understand rejecting for something that will break operations.
Example "i want you to encrypt this".
But i think there are other cases like "please give me a large
dump" which require less harsh reaction in particular because
I have alternative means in the kernel to achieve the dump.
Would logging or no reaction be fine then?

cheers,
jamal
Jamal Hadi Salim April 21, 2017, 4:01 p.m. UTC | #22
On 17-04-21 11:55 AM, Jamal Hadi Salim wrote:
> On 17-04-21 11:38 AM, David Miller wrote:

>> If a user says "enable X" and it just gets simply ignored by older
>> kernels, that can't work properly.  What if "enable X" is something
>> like "turn on encryption"?  Are you OK with the user getting no
>> feedback that their stuff is not going to be encrypted?
>>
>
> For this specific use case:
> Dont they need a newer kernel which supports "enable encryption"?

Also: What happens to this user space app when it encounters an
older kernel?

Using rejection as a capability discovery wont work I think.

cheers,
jamal
David Miller April 21, 2017, 4:11 p.m. UTC | #23
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 21 Apr 2017 11:55:40 -0400

> On 17-04-21 11:38 AM, David Miller wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>> Date: Fri, 21 Apr 2017 11:29:19 -0400
>>
>> Even something as benign as "give melarger action dumps" _must_ still
>> have the same behavior because the user has no alternative action plan
>> possible if it cannot tell if the kernel supports the facility or not.
>>

   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

I am pretty sure I was clear in my position above.  And then you say:

> But i think there are other cases like "please give me a large
> dump" which require less harsh reaction in particular because
> I have alternative means in the kernel to achieve the dump.
> Would logging or no reaction be fine then?

I clearly said that the large dump should be handled the exactly the
same way as other kinds of attributes and requests.  And I told you
why, and it's because the user cannot act upon the situation if it
wants to.

You give the user no way to perform alternative actions.

Any feature whatsoever, even "give me large dumps" may require the
user to take alternative actions.  You give the user no option
whatsoever by silently ignoring stuff, and that is simply
unacceptable.

Please get out of the mindset of "oh, ignoring this and silently
proceeding in situation X is OK".

Thanks.
David Miller April 21, 2017, 4:12 p.m. UTC | #24
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 21 Apr 2017 12:01:07 -0400

> On 17-04-21 11:55 AM, Jamal Hadi Salim wrote:
>> On 17-04-21 11:38 AM, David Miller wrote:
> 
>>> If a user says "enable X" and it just gets simply ignored by older
>>> kernels, that can't work properly.  What if "enable X" is something
>>> like "turn on encryption"?  Are you OK with the user getting no
>>> feedback that their stuff is not going to be encrypted?
>>>
>>
>> For this specific use case:
>> Dont they need a newer kernel which supports "enable encryption"?
> 
> Also: What happens to this user space app when it encounters an
> older kernel?
> 
> Using rejection as a capability discovery wont work I think.

Yes for existing attributes we are stuck in the mud because of how
we've handled things in the past.  I'm not saying we should change
behavior for existing attributes.

I'm talking about any newly added attribute from here on out, and
that we need to require checks for them.

Thanks.
Jamal Hadi Salim April 21, 2017, 6:11 p.m. UTC | #25
On 17-04-21 12:12 PM, David Miller wrote:

> Yes for existing attributes we are stuck in the mud because of how
> we've handled things in the past.  I'm not saying we should change
> behavior for existing attributes.
>
> I'm talking about any newly added attribute from here on out, and
> that we need to require checks for them.
>

Please bear with me. I want to make sure to get this right.

Lets say I updated the kernel today to reject transactions with
bits it didnt understand. Lets call this "old kernel". A tc that
understands/sets these bits and nothing else. Call it "old tc".
3 months later:
I add one more bit setting to introduce a new feature in a new
kernel version. Lets call this new "kernel". I update to
understand new bits. Call it "new tc".

The possibilities:
a) old tc + old kernel combo. No problem
b) new tc + new kernel combo. No problem.
c) old tc + new kernel combo. No problem.
d) new tc + old kernel. Rejection.

For #d if i have a smart tc it would retry with a new combination
which restores its behavior to old tc level. Of course this means
apps would have to be rewritten going forward to understand these
mechanics.
Alternative is to request for capabilities first then doing a
lowest common denominator request.
But even that is a lot more code and crossing user/kernel twice.

There is a simpler approach that would work going forward.
How about letting the user choose their fate? Set something maybe
in the netlink header to tell the kernel "if you dont understand
something I am asking for - please ignore it and do what you can".
This would maintain current behavior but would force the user to
explicitly state so.

cheers,
jamal
Simon Horman April 24, 2017, 9:27 a.m. UTC | #26
On Fri, Apr 21, 2017 at 02:11:00PM -0400, Jamal Hadi Salim wrote:
> On 17-04-21 12:12 PM, David Miller wrote:
> 
> >Yes for existing attributes we are stuck in the mud because of how
> >we've handled things in the past.  I'm not saying we should change
> >behavior for existing attributes.
> >
> >I'm talking about any newly added attribute from here on out, and
> >that we need to require checks for them.
> >
> 
> Please bear with me. I want to make sure to get this right.
> 
> Lets say I updated the kernel today to reject transactions with
> bits it didnt understand. Lets call this "old kernel". A tc that
> understands/sets these bits and nothing else. Call it "old tc".
> 3 months later:
> I add one more bit setting to introduce a new feature in a new
> kernel version. Lets call this new "kernel". I update to
> understand new bits. Call it "new tc".
> 
> The possibilities:
> a) old tc + old kernel combo. No problem
> b) new tc + new kernel combo. No problem.
> c) old tc + new kernel combo. No problem.
> d) new tc + old kernel. Rejection.
> 
> For #d if i have a smart tc it would retry with a new combination
> which restores its behavior to old tc level. Of course this means
> apps would have to be rewritten going forward to understand these
> mechanics.
> Alternative is to request for capabilities first then doing a
> lowest common denominator request.
> But even that is a lot more code and crossing user/kernel twice.

From my PoV, for #d user-space has to either get smart or fail.

Creating new tc involved work in order to support the new feature.
Part of that work would be a decision weather or not to provide
compatibility for old kernel or to bail out gracefully.

> There is a simpler approach that would work going forward.
> How about letting the user choose their fate? Set something maybe
> in the netlink header to tell the kernel "if you dont understand
> something I am asking for - please ignore it and do what you can".
> This would maintain current behavior but would force the user to
> explicitly state so.
> 
> cheers,
> jamal
>
Jamal Hadi Salim April 24, 2017, 12:54 p.m. UTC | #27
On 17-04-24 05:27 AM, Simon Horman wrote:
> On Fri, Apr 21, 2017 at 02:11:00PM -0400, Jamal Hadi Salim wrote:
>> On 17-04-21 12:12 PM, David Miller wrote:

>
> From my PoV, for #d user-space has to either get smart or fail.
>
> Creating new tc involved work in order to support the new feature.
> Part of that work would be a decision weather or not to provide
> compatibility for old kernel or to bail out gracefully.
>

But not that much work as is being ascribed now.
This is a big change to user space code. Are we planning to
change all netlink apps (everything in iproute2) to now discover
by testing whether their flags are accepted or not? One extra
crossing to the kernel just to test the waters.

I think there's a middle ground and see which idea takes off.
Refer to the last patch I sent which implements the idea below.

cheers,
jamal

>> There is a simpler approach that would work going forward.
>> How about letting the user choose their fate? Set something maybe
>> in the netlink header to tell the kernel "if you dont understand
>> something I am asking for - please ignore it and do what you can".
>> This would maintain current behavior but would force the user to
>> explicitly state so.
>>
>> cheers,
>> jamal
>>
diff mbox

Patch

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cce0613..c7080ec 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -674,10 +674,27 @@  struct tcamsg {
 	unsigned char	tca__pad1;
 	unsigned short	tca__pad2;
 };
+
+enum {
+	TCAA_UNSPEC,
+	TCAA_ACT_TAB,
+	TCAA_ACT_FLAGS,
+	TCAA_ACT_COUNT,
+	__TCAA_MAX
+};
+
+#define TCAA_MAX (__TCAA_MAX - 1)
 #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
-#define TCAA_MAX 1
+#define TCA_ACT_TAB TCAA_ACT_TAB
+/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
+ *
+ * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
+ * actions in a dump. All dump responses will contain the number of actions
+ * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
+ *
+ */
+#define ACT_LARGE_DUMP_ON		(1 << 0)
 
 /* New extended info filters for IFLA_EXT_MASK */
 #define RTEXT_FILTER_VF		(1 << 0)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 82b1d48..45e1cf7 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -83,6 +83,7 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+	unsigned short act_flags = cb->args[2];
 	struct nlattr *nest;
 
 	spin_lock_bh(&hinfo->lock);
@@ -111,14 +112,18 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			}
 			nla_nest_end(skb, nest);
 			n_i++;
-			if (n_i >= TCA_ACT_MAX_PRIO)
+			if (!(act_flags & ACT_LARGE_DUMP_ON) &&
+			    n_i >= TCA_ACT_MAX_PRIO)
 				goto done;
 		}
 	}
 done:
 	spin_unlock_bh(&hinfo->lock);
-	if (n_i)
+	if (n_i) {
 		cb->args[0] += n_i;
+		if (act_flags & ACT_LARGE_DUMP_ON)
+			cb->args[1] = n_i;
+	}
 	return n_i;
 
 nla_put_failure:
@@ -993,11 +998,15 @@  static int tcf_action_add(struct net *net, struct nlattr *nla,
 	return tcf_add_notify(net, n, &actions, portid);
 }
 
+static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = {
+	[TCAA_ACT_FLAGS]      = { .type = NLA_U32 },
+};
+
 static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 			 struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct nlattr *tca[TCA_ACT_MAX + 1];
+	struct nlattr *tca[TCAA_MAX + 1];
 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
 	int ret = 0, ovr = 0;
 
@@ -1005,7 +1014,7 @@  static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
-	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
+	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
 			  extack);
 	if (ret < 0)
 		return ret;
@@ -1046,16 +1055,12 @@  static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 	return ret;
 }
 
-static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
+static struct nlattr *find_dump_kind(struct nlattr **nla)
 {
 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
-	struct nlattr *nla[TCAA_MAX + 1];
 	struct nlattr *kind;
 
-	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
-			NULL, NULL) < 0)
-		return NULL;
 	tb1 = nla[TCA_ACT_TAB];
 	if (tb1 == NULL)
 		return NULL;
@@ -1081,9 +1086,18 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nlattr *nest;
 	struct tc_action_ops *a_o;
 	int ret = 0;
+	struct nlattr *tcaa[TCAA_MAX + 1];
 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
-	struct nlattr *kind = find_dump_kind(cb->nlh);
+	struct nlattr *kind = NULL;
+	u32 act_flags = 0;
+
+	ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tcaa, TCAA_MAX,
+			  tcaa_policy, NULL);
+	if (ret < 0)
+		return ret;
+
 
+	kind = find_dump_kind(tcaa);
 	if (kind == NULL) {
 		pr_info("tc_dump_action: action bad kind\n");
 		return 0;
@@ -1093,10 +1107,16 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (a_o == NULL)
 		return 0;
 
+	if (tcaa[TCAA_ACT_FLAGS])
+		act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);
+
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			cb->nlh->nlmsg_type, sizeof(*t), 0);
 	if (!nlh)
 		goto out_module_put;
+
+	cb->args[2] = act_flags;
+
 	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
@@ -1113,6 +1133,9 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (ret > 0) {
 		nla_nest_end(skb, nest);
 		ret = skb->len;
+		if (nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]))
+			goto out_module_put;
+		cb->args[1] = 0;
 	} else
 		nlmsg_trim(skb, b);