Message ID | 1492693582-26810-2-git-send-email-jhs@emojatatu.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote: >From: Jamal Hadi Salim <jhs@mojatatu.com> > >When you dump hundreds of thousands of actions, getting only 32 per >dump batch even when the socket buffer and memory allocations allow >is inefficient. > >With this change, the user will get as many as possibly fitting >within the given constraints available to the kernel. > >A new top level TLV space is introduced. An attribute >TCAA_ACT_FLAGS is used to carry the flags indicating the user >is capable of processing these large dumps. Older user space which >doesn't set this flag doesn't get the large (than 32) batches. >The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many >actions are put in a single batch. As such user space app knows how long >to iterate (independent of the type of action being dumped) >instead of hardcoded maximum of 32. > >Some results dumping 1.5M actions, first unpatched tc which the >kernel doesn't help: > >prompt$ time -p tc actions ls action gact | grep index | wc -l >1500000 >real 1388.43 >user 2.07 >sys 1386.79 > >Now lets see a patched tc which sets the correct flags when requesting >a dump: > >prompt$ time -p updatedtc actions ls action gact | grep index | wc -l >1500000 >real 178.13 >user 2.02 >sys 176.96 > >Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> >--- > include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++-- > net/sched/act_api.c | 46 +++++++++++++++++++++++++++++++++--------- > 2 files changed, 55 insertions(+), 12 deletions(-) > >diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h >index cce0613..d7d28ec 100644 >--- a/include/uapi/linux/rtnetlink.h >+++ b/include/uapi/linux/rtnetlink.h >@@ -674,10 +674,27 @@ struct tcamsg { > unsigned char tca__pad1; > unsigned short tca__pad2; > }; >+ >+enum { >+ TCAA_UNSPEC, TCAA stands for "traffic control action action". I don't get it :( Prefix still sounds wrong to me, sorry :/ Should be: TCA_SOMETHING_* >+ TCAA_ACT_TAB, >+#define TCA_ACT_TAB TCAA_ACT_TAB >+ TCAA_ACT_FLAGS, >+ TCAA_ACT_COUNT, >+ __TCAA_MAX, >+#define TCAA_MAX (__TCAA_MAX - 1) >+}; >+ > #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) > #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) >-#define TCA_ACT_TAB 1 /* attr type must be >=1 */ >-#define TCAA_MAX 1 >+/* tcamsg flags stored in attribute TCAA_ACT_FLAGS >+ * >+ * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO >+ * actions in a dump. All dump responses will contain the number of actions >+ * being dumped stored in for user app's consumption in TCAA_ACT_COUNT >+ * >+ */ >+#define ACT_LARGE_DUMP_ON BIT(0) Please put some prefix to the name, as I asked for in the previous version. > > /* New extended info filters for IFLA_EXT_MASK */ > #define RTEXT_FILTER_VF (1 << 0) >diff --git a/net/sched/act_api.c b/net/sched/act_api.c >index 82b1d48..f85b8fd 100644 >--- a/net/sched/act_api.c >+++ b/net/sched/act_api.c >@@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, > struct netlink_callback *cb) > { > int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; >+ unsigned short act_flags = cb->args[2]; > struct nlattr *nest; > > spin_lock_bh(&hinfo->lock); >@@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, > } > nla_nest_end(skb, nest); > n_i++; >- if (n_i >= TCA_ACT_MAX_PRIO) >+ if (!(act_flags & ACT_LARGE_DUMP_ON) && >+ n_i >= TCA_ACT_MAX_PRIO) > goto done; > } > } > done: > spin_unlock_bh(&hinfo->lock); >- if (n_i) >+ if (n_i) { > cb->args[0] += n_i; >+ if (act_flags & ACT_LARGE_DUMP_ON) >+ cb->args[1] = n_i; >+ } > return n_i; > > nla_put_failure: >@@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, > return tcf_add_notify(net, n, &actions, portid); > } > >+static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = { >+ [TCAA_ACT_FLAGS] = { .type = NLA_U32 }, >+}; >+ > static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, > struct netlink_ext_ack *extack) > { > struct net *net = sock_net(skb->sk); >- struct nlattr *tca[TCA_ACT_MAX + 1]; >+ struct nlattr *tca[TCAA_MAX + 1]; > u32 portid = skb ? NETLINK_CB(skb).portid : 0; > int ret = 0, ovr = 0; > >@@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, > !netlink_capable(skb, CAP_NET_ADMIN)) > return -EPERM; > >- ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL, >+ ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy, Please do this in a separate patch. It is an unrelated bug fix. > extack); > if (ret < 0) > return ret; >@@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, > return ret; > } > >-static struct nlattr *find_dump_kind(const struct nlmsghdr *n) >+static struct nlattr *find_dump_kind(struct nlattr **nla) > { > struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; > struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; >- struct nlattr *nla[TCAA_MAX + 1]; > struct nlattr *kind; > >- if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, >- NULL, NULL) < 0) >- return NULL; > tb1 = nla[TCA_ACT_TAB]; > if (tb1 == NULL) > return NULL; >@@ -1081,9 +1086,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) > struct nlattr *nest; > struct tc_action_ops *a_o; > int ret = 0; >+ struct nlattr *tcaa[TCAA_MAX + 1]; "tcaa" again, now as a variable :/ Just use "tb" as the rest of the code does. > struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); >- struct nlattr *kind = find_dump_kind(cb->nlh); >+ struct nlattr *kind = NULL; >+ struct nlattr *count_attr = NULL; >+ u32 act_flags = 0; >+ >+ ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tcaa, TCAA_MAX, >+ tcaa_policy, NULL); >+ if (ret < 0) >+ return ret; > >+ kind = find_dump_kind(tcaa); > if (kind == NULL) { > pr_info("tc_dump_action: action bad kind\n"); > return 0; >@@ -1093,14 +1107,23 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) > if (a_o == NULL) > return 0; > >+ if (tcaa[TCAA_ACT_FLAGS]) >+ act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]); I still believe this is wrong. Should be a separate attr per flag. For user experience breakage reasons: 2 kernels should not behave differently on the exact same value passed from userspace: User passes 0x2. Now the kernel will ignore the set bit, the next kernel will recognize it as a valid flag and do something. Please let the discussion reach a consensus before pushing this again. >+ > nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > cb->nlh->nlmsg_type, sizeof(*t), 0); > if (!nlh) > goto out_module_put; >+ >+ cb->args[2] = act_flags; >+ > t = nlmsg_data(nlh); > t->tca_family = AF_UNSPEC; > t->tca__pad1 = 0; > t->tca__pad2 = 0; >+ count_attr = nla_reserve(skb, TCAA_ACT_COUNT, sizeof(u32)); >+ if (!count_attr) >+ goto out_module_put; > > nest = nla_nest_start(skb, TCA_ACT_TAB); > if (nest == NULL) >@@ -1113,6 +1136,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) > if (ret > 0) { > nla_nest_end(skb, nest); > ret = skb->len; >+ memcpy(nla_data(count_attr), &cb->args[1], sizeof(u32)); >+ cb->args[1] = 0; > } else > nlmsg_trim(skb, b); > >-- >1.9.1 >
On 17-04-20 09:59 AM, Jiri Pirko wrote: > Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote: >> From: Jamal Hadi Salim <jhs@mojatatu.com> >> >> When you dump hundreds of thousands of actions, getting only 32 per >> dump batch even when the socket buffer and memory allocations allow >> is inefficient. >> >> With this change, the user will get as many as possibly fitting >> within the given constraints available to the kernel. >> >> A new top level TLV space is introduced. An attribute >> TCAA_ACT_FLAGS is used to carry the flags indicating the user >> is capable of processing these large dumps. Older user space which >> doesn't set this flag doesn't get the large (than 32) batches. >> The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many >> actions are put in a single batch. As such user space app knows how long >> to iterate (independent of the type of action being dumped) >> instead of hardcoded maximum of 32. >> >> Some results dumping 1.5M actions, first unpatched tc which the >> kernel doesn't help: >> >> prompt$ time -p tc actions ls action gact | grep index | wc -l >> 1500000 >> real 1388.43 >> user 2.07 >> sys 1386.79 >> >> Now lets see a patched tc which sets the correct flags when requesting >> a dump: >> >> prompt$ time -p updatedtc actions ls action gact | grep index | wc -l >> 1500000 >> real 178.13 >> user 2.02 >> sys 176.96 >> >> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> >> --- >> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++-- >> net/sched/act_api.c | 46 +++++++++++++++++++++++++++++++++--------- >> 2 files changed, 55 insertions(+), 12 deletions(-) >> >> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h >> index cce0613..d7d28ec 100644 >> --- a/include/uapi/linux/rtnetlink.h >> +++ b/include/uapi/linux/rtnetlink.h >> @@ -674,10 +674,27 @@ struct tcamsg { >> unsigned char tca__pad1; >> unsigned short tca__pad2; >> }; >> + >> +enum { >> + TCAA_UNSPEC, > > TCAA stands for "traffic control action action". I don't get it :( > Prefix still sounds wrong to me, sorry :/ > Should be: > TCA_SOMETHING_* > > >> + TCAA_ACT_TAB, >> +#define TCA_ACT_TAB TCAA_ACT_TAB >> + TCAA_ACT_FLAGS, >> + TCAA_ACT_COUNT, >> + __TCAA_MAX, >> +#define TCAA_MAX (__TCAA_MAX - 1) >> +}; >> + >> #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) >> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) >> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */ >> -#define TCAA_MAX 1 >> +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS >> + * >> + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO >> + * actions in a dump. All dump responses will contain the number of actions >> + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT >> + * >> + */ >> +#define ACT_LARGE_DUMP_ON BIT(0) > > Please put some prefix to the name, as I asked for in the previous > version. > Didnt mean to leave out but: I cant seem to find it. Do you recall what you said it should be? > >> >> /* New extended info filters for IFLA_EXT_MASK */ >> #define RTEXT_FILTER_VF (1 << 0) >> diff --git a/net/sched/act_api.c b/net/sched/act_api.c >> index 82b1d48..f85b8fd 100644 >> --- a/net/sched/act_api.c >> +++ b/net/sched/act_api.c >> @@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, >> struct netlink_callback *cb) >> { >> int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; >> + unsigned short act_flags = cb->args[2]; >> struct nlattr *nest; >> >> spin_lock_bh(&hinfo->lock); >> @@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, >> } >> nla_nest_end(skb, nest); >> n_i++; >> - if (n_i >= TCA_ACT_MAX_PRIO) >> + if (!(act_flags & ACT_LARGE_DUMP_ON) && >> + n_i >= TCA_ACT_MAX_PRIO) >> goto done; >> } >> } >> done: >> spin_unlock_bh(&hinfo->lock); >> - if (n_i) >> + if (n_i) { >> cb->args[0] += n_i; >> + if (act_flags & ACT_LARGE_DUMP_ON) >> + cb->args[1] = n_i; >> + } >> return n_i; >> >> nla_put_failure: >> @@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, >> return tcf_add_notify(net, n, &actions, portid); >> } >> >> +static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = { >> + [TCAA_ACT_FLAGS] = { .type = NLA_U32 }, >> +}; >> + >> static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, >> struct netlink_ext_ack *extack) >> { >> struct net *net = sock_net(skb->sk); >> - struct nlattr *tca[TCA_ACT_MAX + 1]; >> + struct nlattr *tca[TCAA_MAX + 1]; >> u32 portid = skb ? NETLINK_CB(skb).portid : 0; >> int ret = 0, ovr = 0; >> >> @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, >> !netlink_capable(skb, CAP_NET_ADMIN)) >> return -EPERM; >> >> - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL, >> + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy, > > Please do this in a separate patch. It is an unrelated bug fix. > Ok, that is fair. Thanks. Ok, if this was patch 1/3 in next update. > > > > "tcaa" again, now as a variable :/ > Just use "tb" as the rest of the code does. > Sure. > >> >> + if (tcaa[TCAA_ACT_FLAGS]) >> + act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]); > > I still believe this is wrong. Should be a separate attr per flag. > For user experience breakage reasons: > 2 kernels should not behave differently on the exact same value passed > from userspace: > User passes 0x2. Now the kernel will ignore the set bit, the next kernel > will recognize it as a valid flag and do something. > Please let the discussion reach a consensus before pushing this again. > > Jiri - I dont agree. There is no such breakage. Refer to my previous email. Lets just move on. cheers, jamal
Thu, Apr 20, 2017 at 04:18:50PM CEST, jhs@mojatatu.com wrote: >On 17-04-20 09:59 AM, Jiri Pirko wrote: >> Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote: >> > From: Jamal Hadi Salim <jhs@mojatatu.com> >> > >> > When you dump hundreds of thousands of actions, getting only 32 per >> > dump batch even when the socket buffer and memory allocations allow >> > is inefficient. >> > >> > With this change, the user will get as many as possibly fitting >> > within the given constraints available to the kernel. >> > >> > A new top level TLV space is introduced. An attribute >> > TCAA_ACT_FLAGS is used to carry the flags indicating the user >> > is capable of processing these large dumps. Older user space which >> > doesn't set this flag doesn't get the large (than 32) batches. >> > The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many >> > actions are put in a single batch. As such user space app knows how long >> > to iterate (independent of the type of action being dumped) >> > instead of hardcoded maximum of 32. >> > >> > Some results dumping 1.5M actions, first unpatched tc which the >> > kernel doesn't help: >> > >> > prompt$ time -p tc actions ls action gact | grep index | wc -l >> > 1500000 >> > real 1388.43 >> > user 2.07 >> > sys 1386.79 >> > >> > Now lets see a patched tc which sets the correct flags when requesting >> > a dump: >> > >> > prompt$ time -p updatedtc actions ls action gact | grep index | wc -l >> > 1500000 >> > real 178.13 >> > user 2.02 >> > sys 176.96 >> > >> > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> >> > --- >> > include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++-- >> > net/sched/act_api.c | 46 +++++++++++++++++++++++++++++++++--------- >> > 2 files changed, 55 insertions(+), 12 deletions(-) >> > >> > diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h >> > index cce0613..d7d28ec 100644 >> > --- a/include/uapi/linux/rtnetlink.h >> > +++ b/include/uapi/linux/rtnetlink.h >> > @@ -674,10 +674,27 @@ struct tcamsg { >> > unsigned char tca__pad1; >> > unsigned short tca__pad2; >> > }; >> > + >> > +enum { >> > + TCAA_UNSPEC, >> >> TCAA stands for "traffic control action action". I don't get it :( >> Prefix still sounds wrong to me, sorry :/ >> Should be: >> TCA_SOMETHING_* >> >> >> > + TCAA_ACT_TAB, >> > +#define TCA_ACT_TAB TCAA_ACT_TAB >> > + TCAA_ACT_FLAGS, >> > + TCAA_ACT_COUNT, >> > + __TCAA_MAX, >> > +#define TCAA_MAX (__TCAA_MAX - 1) >> > +}; >> > + >> > #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) >> > #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) >> > -#define TCA_ACT_TAB 1 /* attr type must be >=1 */ >> > -#define TCAA_MAX 1 >> > +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS >> > + * >> > + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO >> > + * actions in a dump. All dump responses will contain the number of actions >> > + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT >> > + * >> > + */ >> > +#define ACT_LARGE_DUMP_ON BIT(0) >> >> Please put some prefix to the name, as I asked for in the previous >> version. >> > >Didnt mean to leave out but: >I cant seem to find it. Do you recall what you said it should be? TCA_SOMETHING_FLAGS_LARGE_DUMP_ON > >> >> > >> > /* New extended info filters for IFLA_EXT_MASK */ >> > #define RTEXT_FILTER_VF (1 << 0) >> > diff --git a/net/sched/act_api.c b/net/sched/act_api.c >> > index 82b1d48..f85b8fd 100644 >> > --- a/net/sched/act_api.c >> > +++ b/net/sched/act_api.c >> > @@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, >> > struct netlink_callback *cb) >> > { >> > int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; >> > + unsigned short act_flags = cb->args[2]; >> > struct nlattr *nest; >> > >> > spin_lock_bh(&hinfo->lock); >> > @@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, >> > } >> > nla_nest_end(skb, nest); >> > n_i++; >> > - if (n_i >= TCA_ACT_MAX_PRIO) >> > + if (!(act_flags & ACT_LARGE_DUMP_ON) && >> > + n_i >= TCA_ACT_MAX_PRIO) >> > goto done; >> > } >> > } >> > done: >> > spin_unlock_bh(&hinfo->lock); >> > - if (n_i) >> > + if (n_i) { >> > cb->args[0] += n_i; >> > + if (act_flags & ACT_LARGE_DUMP_ON) >> > + cb->args[1] = n_i; >> > + } >> > return n_i; >> > >> > nla_put_failure: >> > @@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, >> > return tcf_add_notify(net, n, &actions, portid); >> > } >> > >> > +static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = { >> > + [TCAA_ACT_FLAGS] = { .type = NLA_U32 }, >> > +}; >> > + >> > static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, >> > struct netlink_ext_ack *extack) >> > { >> > struct net *net = sock_net(skb->sk); >> > - struct nlattr *tca[TCA_ACT_MAX + 1]; >> > + struct nlattr *tca[TCAA_MAX + 1]; >> > u32 portid = skb ? NETLINK_CB(skb).portid : 0; >> > int ret = 0, ovr = 0; >> > >> > @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, >> > !netlink_capable(skb, CAP_NET_ADMIN)) >> > return -EPERM; >> > >> > - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL, >> > + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy, >> >> Please do this in a separate patch. It is an unrelated bug fix. >> > >Ok, that is fair. Thanks. Ok, if this was patch 1/3 in next update. Good. > >> >> > >> >> "tcaa" again, now as a variable :/ >> Just use "tb" as the rest of the code does. >> > >Sure. > >> >> > >> > + if (tcaa[TCAA_ACT_FLAGS]) >> > + act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]); >> >> I still believe this is wrong. Should be a separate attr per flag. >> For user experience breakage reasons: >> 2 kernels should not behave differently on the exact same value passed >> from userspace: >> User passes 0x2. Now the kernel will ignore the set bit, the next kernel >> will recognize it as a valid flag and do something. >> Please let the discussion reach a consensus before pushing this again. >> >> > >Jiri - I dont agree. There is no such breakage. Refer to my previous >email. Lets just move on. Anyone else has opinion on this?
On 17-04-20 09:59 AM, Jiri Pirko wrote: > Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote: >> From: Jamal Hadi Salim <jhs@mojatatu.com> >> >> When you dump hundreds of thousands of actions, getting only 32 per >> dump batch even when the socket buffer and memory allocations allow >> is inefficient. >> >> With this change, the user will get as many as possibly fitting >> within the given constraints available to the kernel. >> >> A new top level TLV space is introduced. An attribute >> TCAA_ACT_FLAGS is used to carry the flags indicating the user >> is capable of processing these large dumps. Older user space which >> doesn't set this flag doesn't get the large (than 32) batches. >> The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many >> actions are put in a single batch. As such user space app knows how long >> to iterate (independent of the type of action being dumped) >> instead of hardcoded maximum of 32. >> >> Some results dumping 1.5M actions, first unpatched tc which the >> kernel doesn't help: >> >> prompt$ time -p tc actions ls action gact | grep index | wc -l >> 1500000 >> real 1388.43 >> user 2.07 >> sys 1386.79 >> >> Now lets see a patched tc which sets the correct flags when requesting >> a dump: >> >> prompt$ time -p updatedtc actions ls action gact | grep index | wc -l >> 1500000 >> real 178.13 >> user 2.02 >> sys 176.96 >> >> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> >> --- >> include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++-- >> net/sched/act_api.c | 46 +++++++++++++++++++++++++++++++++--------- >> 2 files changed, 55 insertions(+), 12 deletions(-) >> >> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h >> index cce0613..d7d28ec 100644 >> --- a/include/uapi/linux/rtnetlink.h >> +++ b/include/uapi/linux/rtnetlink.h >> @@ -674,10 +674,27 @@ struct tcamsg { >> unsigned char tca__pad1; >> unsigned short tca__pad2; >> }; >> + >> +enum { >> + TCAA_UNSPEC, > > TCAA stands for "traffic control action action". I don't get it :( TC Action Attributes == TCAA. > Prefix still sounds wrong to me, sorry :/ > Should be: > TCA_SOMETHING_* > TCA_ATTR_XXX ? I have another opportunity to make a change where we should close the discussion on this. We cant drag it forever. cheers, jamal
Thu, Apr 20, 2017 at 04:25:11PM CEST, jhs@mojatatu.com wrote: >On 17-04-20 09:59 AM, Jiri Pirko wrote: >> Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote: >> > From: Jamal Hadi Salim <jhs@mojatatu.com> >> > >> > When you dump hundreds of thousands of actions, getting only 32 per >> > dump batch even when the socket buffer and memory allocations allow >> > is inefficient. >> > >> > With this change, the user will get as many as possibly fitting >> > within the given constraints available to the kernel. >> > >> > A new top level TLV space is introduced. An attribute >> > TCAA_ACT_FLAGS is used to carry the flags indicating the user >> > is capable of processing these large dumps. Older user space which >> > doesn't set this flag doesn't get the large (than 32) batches. >> > The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many >> > actions are put in a single batch. As such user space app knows how long >> > to iterate (independent of the type of action being dumped) >> > instead of hardcoded maximum of 32. >> > >> > Some results dumping 1.5M actions, first unpatched tc which the >> > kernel doesn't help: >> > >> > prompt$ time -p tc actions ls action gact | grep index | wc -l >> > 1500000 >> > real 1388.43 >> > user 2.07 >> > sys 1386.79 >> > >> > Now lets see a patched tc which sets the correct flags when requesting >> > a dump: >> > >> > prompt$ time -p updatedtc actions ls action gact | grep index | wc -l >> > 1500000 >> > real 178.13 >> > user 2.02 >> > sys 176.96 >> > >> > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> >> > --- >> > include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++-- >> > net/sched/act_api.c | 46 +++++++++++++++++++++++++++++++++--------- >> > 2 files changed, 55 insertions(+), 12 deletions(-) >> > >> > diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h >> > index cce0613..d7d28ec 100644 >> > --- a/include/uapi/linux/rtnetlink.h >> > +++ b/include/uapi/linux/rtnetlink.h >> > @@ -674,10 +674,27 @@ struct tcamsg { >> > unsigned char tca__pad1; >> > unsigned short tca__pad2; >> > }; >> > + >> > +enum { >> > + TCAA_UNSPEC, >> >> TCAA stands for "traffic control action action". I don't get it :( > >TC Action Attributes == TCAA. > >> Prefix still sounds wrong to me, sorry :/ >> Should be: >> TCA_SOMETHING_* >> > >TCA_ATTR_XXX ? Of course it is an attribute. No need to have "attr" in the name. No other enum has it. Does not make sense. The name should contain the group name. What group this enum defines? > >I have another opportunity to make a change where we should close >the discussion on this. We cant drag it forever. > >cheers, >jamal >
On 17-04-20 10:33 AM, Jiri Pirko wrote: > Thu, Apr 20, 2017 at 04:25:11PM CEST, jhs@mojatatu.com wrote: >>> TCAA stands for "traffic control action action". I don't get it :( >> >> TC Action Attributes == TCAA. >> >>> Prefix still sounds wrong to me, sorry :/ >>> Should be: >>> TCA_SOMETHING_* >>> >> >> TCA_ATTR_XXX ? > > Of course it is an attribute. No need to have "attr" in the name. No > other enum has it. Does not make sense. > > The name should contain the group name. What group this enum defines? > It defines tc root level Action attributes. Would TCRLAA_XXX be more exciting? ;-> cheers, jamal
Thu, Apr 20, 2017 at 05:08:20PM CEST, jhs@mojatatu.com wrote: >On 17-04-20 10:33 AM, Jiri Pirko wrote: >> Thu, Apr 20, 2017 at 04:25:11PM CEST, jhs@mojatatu.com wrote: > >> > > TCAA stands for "traffic control action action". I don't get it :( >> > >> > TC Action Attributes == TCAA. >> > >> > > Prefix still sounds wrong to me, sorry :/ >> > > Should be: >> > > TCA_SOMETHING_* >> > > >> > >> > TCA_ATTR_XXX ? >> >> Of course it is an attribute. No need to have "attr" in the name. No >> other enum has it. Does not make sense. >> >> The name should contain the group name. What group this enum defines? >> > >It defines tc root level Action attributes. >Would TCRLAA_XXX be more exciting? ;-> How about TCA_ROOT_XXX > >cheers, >jamal >
On Thu, 2017-04-20 at 09:06 -0400, Jamal Hadi Salim wrote: > nest = nla_nest_start(skb, TCA_ACT_TAB); > if (nest == NULL) > @@ -1113,6 +1136,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) > if (ret > 0) { > nla_nest_end(skb, nest); > ret = skb->len; > + memcpy(nla_data(count_attr), &cb->args[1], sizeof(u32)); This will not work on BigEndian 64bit hosts, since cb->args[1] is bigger than 32bit. > + cb->args[1] = 0; > } else > nlmsg_trim(skb, b); >
On 17-04-20 12:09 PM, Eric Dumazet wrote: > On Thu, 2017-04-20 at 09:06 -0400, Jamal Hadi Salim wrote: > >> nest = nla_nest_start(skb, TCA_ACT_TAB); >> if (nest == NULL) >> @@ -1113,6 +1136,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) >> if (ret > 0) { >> nla_nest_end(skb, nest); >> ret = skb->len; >> + memcpy(nla_data(count_attr), &cb->args[1], sizeof(u32)); > > This will not work on BigEndian 64bit hosts, since cb->args[1] is bigger > than 32bit. > Ok, thanks. I will assign to a 32 bit var first then memcpy in the next iteration (tomorrow). cheers, jamal
On Thu, Apr 20, 2017 at 04:24:53PM +0200, Jiri Pirko wrote: > Thu, Apr 20, 2017 at 04:18:50PM CEST, jhs@mojatatu.com wrote: > >On 17-04-20 09:59 AM, Jiri Pirko wrote: > >> Thu, Apr 20, 2017 at 03:06:21PM CEST, jhs@mojatatu.com wrote: > >> > From: Jamal Hadi Salim <jhs@mojatatu.com> ... > >> > + if (tcaa[TCAA_ACT_FLAGS]) > >> > + act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]); > >> > >> I still believe this is wrong. Should be a separate attr per flag. > >> For user experience breakage reasons: > >> 2 kernels should not behave differently on the exact same value passed > >> from userspace: > >> User passes 0x2. Now the kernel will ignore the set bit, the next kernel > >> will recognize it as a valid flag and do something. > >> Please let the discussion reach a consensus before pushing this again. > >> > >> > > > >Jiri - I dont agree. There is no such breakage. Refer to my previous > >email. Lets just move on. > > Anyone else has opinion on this? At the risk of jumping into a hornets nest, yes, I have an opinion: * A agree with Jiri that a separate attribute per flag seems to be the cleanest option, however; * I think it would be reasonable from a UABI PoV to permit currently unused bits of TCAA_ACT_FLAGS to be re-uses so long as the kernel checks that they are zero until they are designated to have some use. I believe this implies that the default value for any future uses of these bits would be zero. Jamal, I am confused about why are you so concerned about the space consumed by this attribute, it's per-message, right? Is it the bigger picture you are worried about - a similar per-entry flag at some point in the future?
On 17-04-24 05:14 AM, Simon Horman wrote: [..] > Jamal, I am confused about why are you so concerned about the space > consumed by this attribute, it's per-message, right? Is it the bigger > picture you are worried about - a similar per-entry flag at some point in > the future? To me the two worries are one and the same. Jiri strongly believes (from a big picture view) we must use TLVs for extensibility. While I agree with him in general i have strong reservations in this case because i can get both extensibility and build for performance with using a flag bitmask as the content of the TLV. A TLV consumes 64 bits minimum. It doesnt matter if we decide to use a u8 or a u16, we are still sending 64 bits on that TLV with the rest being PADding. Not to be melodramatic, but the worst case scenario of putting everything in a TLV for 32 flags is using about 30x more space than using a bitmask. Yes, space is important and if i can express upto 32 flags with one TLV rather than 32 TLVs i choose one TLV. I am always looking for ways to filter out crap i dont need when i do stats collection. I have numerous wounds from fdb entries which decided to use a TLV per flag. The design approach we have used in netlink is: flags start as a bitmap (whether they are on main headers or TLVs); they may be complemented with a bitmask/selector (refer to IFLINK messages). Lets look at this specific patch I have sending. I have already changed it 3 times and involved a churn of 3 different flags. If you asked me in the beggining i wouldve scratched my head thinking for a near term use for bit #3, #4 etc, I am fine with the counter-Postel view of having the kernel validate that appropriate bits are set as long as we dont make user space to now start learning how to play acrobatics. cheers, jamal
On Mon, Apr 24, 2017 at 08:49:00AM -0400, Jamal Hadi Salim wrote: > On 17-04-24 05:14 AM, Simon Horman wrote: > [..] > > >Jamal, I am confused about why are you so concerned about the space > >consumed by this attribute, it's per-message, right? Is it the bigger > >picture you are worried about - a similar per-entry flag at some point in > >the future? > > > To me the two worries are one and the same. > > Jiri strongly believes (from a big picture view) we must use > TLVs for extensibility. > While I agree with him in general i have strong reservations > in this case because i can get both extensibility and > build for performance with using a flag bitmask as the > content of the TLV. > > A TLV consumes 64 bits minimum. It doesnt matter if we decide > to use a u8 or a u16, we are still sending 64 bits on that > TLV with the rest being PADding. Not to be melodramatic, but > the worst case scenario of putting everything in a TLV for 32 > flags is using about 30x more space than using a bitmask. > > Yes, space is important and if i can express upto 32 flags > with one TLV rather than 32 TLVs i choose one TLV. > I am always looking for ways to filter out crap i dont need > when i do stats collection. I have numerous wounds from fdb > entries which decided to use a TLV per flag. > > The design approach we have used in netlink is: flags start > as a bitmap (whether they are on main headers or TLVs); they may be > complemented with a bitmask/selector (refer to IFLINK messages). > > Lets look at this specific patch I have sending. I have already > changed it 3 times and involved a churn of 3 different flags. > If you asked me in the beggining i wouldve scratched my head > thinking for a near term use for bit #3, #4 etc, > > I am fine with the counter-Postel view of having the kernel > validate that appropriate bits are set as long as we dont make > user space to now start learning how to play acrobatics. jamal, what performance concern you have in building this error message? TLVs is the most flexible way. And this is error path, so we should build this message rarely, only if the user sends us something incorrect, why bother...
On 17-04-24 10:20 AM, Pablo Neira Ayuso wrote: > On Mon, Apr 24, 2017 at 08:49:00AM -0400, Jamal Hadi Salim wrote: >> >> I am fine with the counter-Postel view of having the kernel >> validate that appropriate bits are set as long as we dont make >> user space to now start learning how to play acrobatics. > > jamal, what performance concern you have in building this error > message? TLVs is the most flexible way. And this is error path, so we > should build this message rarely, only if the user sends us something > incorrect, why bother... I have a feeling we are reffering to 2 different things. Which error message? Are you talking about extended ACK? I have no problem with that. Let me sumarize for you the discussion. My concern was was the double request needed now which was unneeded before. Before: You send a msg and say the kernel didnt understand. Kernel ignores what it didnt understand and does things you asked it to. i.e Part of Postel principle which says "Be liberal in what you expect of others" But the new concern is user space not abiding to the other half of Postel principle "Be conservative in what you send". It may set some random flags which the kernel doesnt understand. One idea is to have the kernel totally reject anytime it sees such flags. I am sure such a message could be conveyed back to the user. Then the user sends the correct one back. The challenge i have is to enforce this trial by fire approach to all user space apps. It is a large change. My suggestion is for user to set flag to request the old behavior of sending only one message. cheers, jamal
From: Jamal Hadi Salim <jhs@mojatatu.com> Date: Mon, 24 Apr 2017 08:49:00 -0400 > Yes, space is important and if i can express upto 32 flags > with one TLV rather than 32 TLVs i choose one TLV. Which is fine. But two things: 1) Again, bits you aren't using now, make sure userspace doesn't set them. And if it does, reject. 2) If you are worried about performance, we're talking about a TLV in the request here not the dump response itself so performance isn't a real issue as Pablo mentioned.
On 17-04-24 04:30 PM, David Miller wrote: > Which is fine. But two things: > > 1) Again, bits you aren't using now, make sure userspace doesn't > set them. And if it does, reject. > I meet those goals on the bit checks but i went a slightly different path with a patch I posted[1] With the posted patch: unknow bits set will result in a kernel rejection unless the user space explicitly ask the kernel to ignore flags it doesnt understand and just handles what it knows. This reduces the amount of work in tc. If this ok I will resend tomorrow. > 2) If you are worried about performance, we're talking about a TLV in > the request here not the dump response itself so performance isn't > a real issue as Pablo mentioned. doesnt make much of a difference for a simple request, true; i was more worried about how we pack similar things for dumps or for large set requests in general. And note it makes no difference if i make the bitmask u32 or u16 - the TLV still eats 32 + 32 bits. So using a u32 is sensible. cheers, jamal [1] This is because i worry about making large changes to the behavior of user space apps like tc. If I reject I will need to change tc to detect this rejection and retry (and I dont think extended ACKs in their current shape are ready to provide any meaningful detail).
From: Jamal Hadi Salim <jhs@mojatatu.com> Date: Mon, 24 Apr 2017 18:18:42 -0400 > With the posted patch: unknow bits set will result in a kernel > rejection unless the user space explicitly ask the kernel to ignore > flags it doesnt understand and just handles what it knows. This > reduces the amount of work in tc. ... > [1] > This is because i worry about making large changes to > the behavior of user space apps like tc. If I reject I will need to > change tc to detect this rejection and retry (and I dont think > extended ACKs in their current shape are ready to provide any > meaningful detail). I think we should eat the pain now and use the strict checks for all new features like this. I'm sorry if this makes more work for you, but this is really how we have to proceed moving forward. Thanks.
On 17-04-24 06:24 PM, David Miller wrote: > > I think we should eat the pain now and use the strict checks > for all new features like this. > > I'm sorry if this makes more work for you, but this is really > how we have to proceed moving forward. > There is no work for me to do in tc if i push this in without the liberal flag i was suggesting. This is more about the next flag change that will fail with older kernels unless the large tc change happens i.e future ones after this kernel patch. cheers, jamal
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index cce0613..d7d28ec 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -674,10 +674,27 @@ struct tcamsg { unsigned char tca__pad1; unsigned short tca__pad2; }; + +enum { + TCAA_UNSPEC, + TCAA_ACT_TAB, +#define TCA_ACT_TAB TCAA_ACT_TAB + TCAA_ACT_FLAGS, + TCAA_ACT_COUNT, + __TCAA_MAX, +#define TCAA_MAX (__TCAA_MAX - 1) +}; + #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) -#define TCA_ACT_TAB 1 /* attr type must be >=1 */ -#define TCAA_MAX 1 +/* tcamsg flags stored in attribute TCAA_ACT_FLAGS + * + * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO + * actions in a dump. All dump responses will contain the number of actions + * being dumped stored in for user app's consumption in TCAA_ACT_COUNT + * + */ +#define ACT_LARGE_DUMP_ON BIT(0) /* New extended info filters for IFLA_EXT_MASK */ #define RTEXT_FILTER_VF (1 << 0) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 82b1d48..f85b8fd 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, struct netlink_callback *cb) { int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + unsigned short act_flags = cb->args[2]; struct nlattr *nest; spin_lock_bh(&hinfo->lock); @@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, } nla_nest_end(skb, nest); n_i++; - if (n_i >= TCA_ACT_MAX_PRIO) + if (!(act_flags & ACT_LARGE_DUMP_ON) && + n_i >= TCA_ACT_MAX_PRIO) goto done; } } done: spin_unlock_bh(&hinfo->lock); - if (n_i) + if (n_i) { cb->args[0] += n_i; + if (act_flags & ACT_LARGE_DUMP_ON) + cb->args[1] = n_i; + } return n_i; nla_put_failure: @@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return tcf_add_notify(net, n, &actions, portid); } +static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = { + [TCAA_ACT_FLAGS] = { .type = NLA_U32 }, +}; + static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct nlattr *tca[TCA_ACT_MAX + 1]; + struct nlattr *tca[TCAA_MAX + 1]; u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL, + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy, extack); if (ret < 0) return ret; @@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, return ret; } -static struct nlattr *find_dump_kind(const struct nlmsghdr *n) +static struct nlattr *find_dump_kind(struct nlattr **nla) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct nlattr *nla[TCAA_MAX + 1]; struct nlattr *kind; - if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, - NULL, NULL) < 0) - return NULL; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; @@ -1081,9 +1086,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) struct nlattr *nest; struct tc_action_ops *a_o; int ret = 0; + struct nlattr *tcaa[TCAA_MAX + 1]; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); - struct nlattr *kind = find_dump_kind(cb->nlh); + struct nlattr *kind = NULL; + struct nlattr *count_attr = NULL; + u32 act_flags = 0; + + ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tcaa, TCAA_MAX, + tcaa_policy, NULL); + if (ret < 0) + return ret; + kind = find_dump_kind(tcaa); if (kind == NULL) { pr_info("tc_dump_action: action bad kind\n"); return 0; @@ -1093,14 +1107,23 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (a_o == NULL) return 0; + if (tcaa[TCAA_ACT_FLAGS]) + act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; + + cb->args[2] = act_flags; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; + count_attr = nla_reserve(skb, TCAA_ACT_COUNT, sizeof(u32)); + if (!count_attr) + goto out_module_put; nest = nla_nest_start(skb, TCA_ACT_TAB); if (nest == NULL) @@ -1113,6 +1136,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (ret > 0) { nla_nest_end(skb, nest); ret = skb->len; + memcpy(nla_data(count_attr), &cb->args[1], sizeof(u32)); + cb->args[1] = 0; } else nlmsg_trim(skb, b);