diff mbox

[net-next,1/1] net sched actions: add time filter for action dumping

Message ID 1492350973-6846-1-git-send-email-jhs@emojatatu.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Jamal Hadi Salim April 16, 2017, 1:56 p.m. UTC
From: Jamal Hadi Salim <jhs@mojatatu.com>

This adds support for filtering based on time since last used.
When we are dumping a large number of actions it is useful to
have the option of filtering based on when the action was last
used to reduce the amount of data crossing to user space.

With this patch the user space app sets the ACT_FILTER_TIME_ACCESS flag
(in the pad1 flags area) and the "time of interest since now", in seconds
(in the pad2 area), i.e. how recently the action must have been used.  The
kernel converts this to jiffies and does the filtering comparison, matching
entries that have seen activity since then, and returns them to user space.
Old kernels and old tc continue to work in legacy mode.

An example (we have 400 actions bound to 400 filters); at installation
time, using a hacked tc which sets the time of interest to 120 seconds:

prompt$ hackedtc actions ls action gact | grep index | wc -l
400

Go get some coffee, wait for > 120 seconds, and try again:

prompt$ hackedtc actions ls action gact | grep index | wc -l
0

Let's see a filter bound to one of these actions:
..
filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10  (rule hit 2 success 1)
  match 7f000002/ffffffff at 12 (success 1 )
	action order 1: gact action pass
	 random type none pass val 0
	 index 23 ref 2 bind 1 installed 1145 sec used 802 sec
 	Action statistics:
	Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0
....

Now let's ping -c 1 127.0.0.2, then run the actions again:

prompt$ hackedtc actions ls action gact | grep index | wc -l
1

More details please:

prompt$ hackedtc -s actions ls action gact
total acts 1 flags 0x3

	action order 0: gact action pass
	 random type none pass val 0
	 index 23 ref 2 bind 1 installed 1270 sec used 30 sec
 	Action statistics:
	Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0

And the filter?

filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10  (rule hit 4 success 2)
  match 7f000002/ffffffff at 12 (success 2 )
	action order 1: gact action pass
	 random type none pass val 0
	 index 23 ref 2 bind 1 installed 1324 sec used 84 sec
 	Action statistics:
	Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 net/sched/act_api.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

Comments

Jamal Hadi Salim April 16, 2017, 2 p.m. UTC | #1
I should say this is dependent on the earlier patch I posted.
Made them separate because I think this one in particular
may generate some discussions.

cheers,
jamal

On 17-04-16 09:56 AM, Jamal Hadi Salim wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
>
> This adds support for filtering based on time since last used.
> When we are dumping a large number of actions it is useful to
> have the option of filtering based on when the action was last
> used to reduce the amount of data crossing to user space.
>
> With this patch the user space app sets the FILTER_ACCESS_TIME flag
> (in the pad1 flags area) and the "time of interest since now" in seconds
> when the action was last used (in the pad2 area).  The kernel converts
> this to jiffies and does the filtering comparison matching entries that
> have seen activity since then and returns them to user space.
> Old kernels and old tc continue to work in legacy mode.
>
> Some example (we have 400 actions bound to 400 filters); at installation
> time using  hacked tc which sets the time of interest to 120 seconds:
>
> prompt$ hackedtc actions ls action gact | grep index | wc -l
> 400
>
> go get some coffee and  wait for > 120 seconds and try again:
>
> prompt$ hackedtc actions ls action gact | grep index | wc -l
> 0
>
> Lets see a filter bound to one of these actions:
> ..
> filter pref 10 u32
> filter pref 10 u32 fh 800: ht divisor 1
> filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10  (rule hit 2 success 1)
>   match 7f000002/ffffffff at 12 (success 1 )
> 	action order 1: gact action pass
> 	 random type none pass val 0
> 	 index 23 ref 2 bind 1 installed 1145 sec used 802 sec
>  	Action statistics:
> 	Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0)
> 	backlog 0b 0p requeues 0
> ....
>
> Now lets ping -c 1 127.0.0.2, then run the actions again:
>
> prompt$ hackedtc actions ls action gact | grep index | wc -l
> 1
>
> More details please:
>
> prompt$ hackedtc -s actions ls action gact
> total acts 1 flags 0x3
>
> 	action order 0: gact action pass
> 	 random type none pass val 0
> 	 index 23 ref 2 bind 1 installed 1270 sec used 30 sec
>  	Action statistics:
> 	Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
> 	backlog 0b 0p requeues 0
>
> And the filter?
>
> filter pref 10 u32
> filter pref 10 u32 fh 800: ht divisor 1
> filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10  (rule hit 4 success 2)
>   match 7f000002/ffffffff at 12 (success 2 )
> 	action order 1: gact action pass
> 	 random type none pass val 0
> 	 index 23 ref 2 bind 1 installed 1324 sec used 84 sec
>  	Action statistics:
> 	Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
> 	backlog 0b 0p requeues 0
>
> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> ---
>  net/sched/act_api.c | 24 ++++++++++++++++++++++--
>  1 file changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 90cc774..4dd55f2 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -84,11 +84,13 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>  {
>  	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>  	unsigned short act_flags = cb->args[2];
> +	unsigned long jiffy_filter = cb->args[3];
> +
>  	struct nlattr *nest;
>
>  	spin_lock_bh(&hinfo->lock);
>
> -	s_i = cb->args[0];
> +	s_i = cb->args[4];
>
>  	for (i = 0; i < (hinfo->hmask + 1); i++) {
>  		struct hlist_head *head;
> @@ -101,6 +103,12 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>  			if (index < s_i)
>  				continue;
>
> +			if (jiffy_filter &&
> +			    time_after(jiffy_filter,
> +				       (unsigned long)p->tcfa_tm.lastuse)) {
> +				continue;
> +			}
> +
>  			nest = nla_nest_start(skb, n_i);
>  			if (nest == NULL)
>  				goto nla_put_failure;
> @@ -118,6 +126,8 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>  		}
>  	}
>  done:
> +	if (index > 0)
> +		cb->args[4] = index + 1;
>  	spin_unlock_bh(&hinfo->lock);
>  	if (n_i) {
>  		cb->args[0] += n_i;
> @@ -1086,8 +1096,10 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
>  	struct tc_action_ops *a_o;
>  	int ret = 0;
>  	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
> -	unsigned char act_flags = t->tca__pad1;
>  	struct nlattr *kind = find_dump_kind(cb->nlh);
> +	unsigned char act_flags = t->tca__pad1;
> +	unsigned short secs = t->tca__pad2;
> +	unsigned long jiffy_wanted = 0;
>
>  	if (kind == NULL) {
>  		pr_info("tc_dump_action: action bad kind\n");
> @@ -1103,7 +1115,15 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
>  	if (!nlh)
>  		goto out_module_put;
>
> +	if (act_flags & ACT_FILTER_TIME_ACCESS) {
> +		const unsigned int m = secs * 1000L;
> +		unsigned long jiffy_msecs = msecs_to_jiffies(m);
> +
> +		jiffy_wanted = jiffies - jiffy_msecs;
> +	}
> +
>  	cb->args[2] = act_flags;
> +	cb->args[3] = jiffy_wanted;
>
>  	t = nlmsg_data(nlh);
>  	t->tca_family = AF_UNSPEC;
>
Jiri Pirko April 17, 2017, 8:21 a.m. UTC | #2
Sun, Apr 16, 2017 at 03:56:13PM CEST, jhs@mojatatu.com wrote:
>From: Jamal Hadi Salim <jhs@mojatatu.com>
>
>This adds support for filtering based on time since last used.
>When we are dumping a large number of actions it is useful to
>have the option of filtering based on when the action was last
>used to reduce the amount of data crossing to user space.
>
>With this patch the user space app sets the FILTER_ACCESS_TIME flag
>(in the pad1 flags area) and the "time of interest since now" in seconds
>when the action was last used (in the pad2 area).  The kernel converts

Again with the pads. Please don't make the TC uapi uglier than it
already is. Just use new attrs.



>this to jiffies and does the filtering comparison matching entries that
>have seen activity since then and returns them to user space.
>Old kernels and old tc continue to work in legacy mode.
>
>Some example (we have 400 actions bound to 400 filters); at installation
>time using  hacked tc which sets the time of interest to 120 seconds:
>
>prompt$ hackedtc actions ls action gact | grep index | wc -l
>400
>
>go get some coffee and  wait for > 120 seconds and try again:
>
>prompt$ hackedtc actions ls action gact | grep index | wc -l
>0
>
>Lets see a filter bound to one of these actions:
>..
>filter pref 10 u32
>filter pref 10 u32 fh 800: ht divisor 1
>filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10  (rule hit 2 success 1)
>  match 7f000002/ffffffff at 12 (success 1 )
>	action order 1: gact action pass
>	 random type none pass val 0
>	 index 23 ref 2 bind 1 installed 1145 sec used 802 sec
> 	Action statistics:
>	Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0)
>	backlog 0b 0p requeues 0
>....
>
>Now lets ping -c 1 127.0.0.2, then run the actions again:
>
>prompt$ hackedtc actions ls action gact | grep index | wc -l
>1
>
>More details please:
>
>prompt$ hackedtc -s actions ls action gact
>total acts 1 flags 0x3
>
>	action order 0: gact action pass
>	 random type none pass val 0
>	 index 23 ref 2 bind 1 installed 1270 sec used 30 sec
> 	Action statistics:
>	Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>	backlog 0b 0p requeues 0
>
>And the filter?
>
>filter pref 10 u32
>filter pref 10 u32 fh 800: ht divisor 1
>filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10  (rule hit 4 success 2)
>  match 7f000002/ffffffff at 12 (success 2 )
>	action order 1: gact action pass
>	 random type none pass val 0
>	 index 23 ref 2 bind 1 installed 1324 sec used 84 sec
> 	Action statistics:
>	Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
>	backlog 0b 0p requeues 0
>
>Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>---
> net/sched/act_api.c | 24 ++++++++++++++++++++++--
> 1 file changed, 22 insertions(+), 2 deletions(-)
>
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index 90cc774..4dd55f2 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -84,11 +84,13 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> {
> 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
> 	unsigned short act_flags = cb->args[2];
>+	unsigned long jiffy_filter = cb->args[3];
>+
> 	struct nlattr *nest;
> 
> 	spin_lock_bh(&hinfo->lock);
> 
>-	s_i = cb->args[0];
>+	s_i = cb->args[4];
> 
> 	for (i = 0; i < (hinfo->hmask + 1); i++) {
> 		struct hlist_head *head;
>@@ -101,6 +103,12 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> 			if (index < s_i)
> 				continue;
> 
>+			if (jiffy_filter &&
>+			    time_after(jiffy_filter,
>+				       (unsigned long)p->tcfa_tm.lastuse)) {
>+				continue;
>+			}
>+
> 			nest = nla_nest_start(skb, n_i);
> 			if (nest == NULL)
> 				goto nla_put_failure;
>@@ -118,6 +126,8 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
> 		}
> 	}
> done:
>+	if (index > 0)
>+		cb->args[4] = index + 1;
> 	spin_unlock_bh(&hinfo->lock);
> 	if (n_i) {
> 		cb->args[0] += n_i;
>@@ -1086,8 +1096,10 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	struct tc_action_ops *a_o;
> 	int ret = 0;
> 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
>-	unsigned char act_flags = t->tca__pad1;
> 	struct nlattr *kind = find_dump_kind(cb->nlh);
>+	unsigned char act_flags = t->tca__pad1;
>+	unsigned short secs = t->tca__pad2;
>+	unsigned long jiffy_wanted = 0;
> 
> 	if (kind == NULL) {
> 		pr_info("tc_dump_action: action bad kind\n");
>@@ -1103,7 +1115,15 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> 	if (!nlh)
> 		goto out_module_put;
> 
>+	if (act_flags & ACT_FILTER_TIME_ACCESS) {
>+		const unsigned int m = secs * 1000L;
>+		unsigned long jiffy_msecs = msecs_to_jiffies(m);
>+
>+		jiffy_wanted = jiffies - jiffy_msecs;
>+	}
>+
> 	cb->args[2] = act_flags;
>+	cb->args[3] = jiffy_wanted;
> 
> 	t = nlmsg_data(nlh);
> 	t->tca_family = AF_UNSPEC;
>-- 
>1.9.1
>
Jakub Kicinski April 18, 2017, 10:12 p.m. UTC | #3
On Sun, 16 Apr 2017 09:56:13 -0400, Jamal Hadi Salim wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
> 
> This adds support for filtering based on time since last used.
> When we are dumping a large number of actions it is useful to
> have the option of filtering based on when the action was last
> used to reduce the amount of data crossing to user space.

Noob question - would it be an option to establish some form of
streaming statistic updates out of the kernel?  The notion of user
space trying to keep track of the time here seems a bit shaky.  Would
it be an option with netlink to open a socket and request specific
set of statistic to be dumped on it periodically if they changed?
I guess it would be way more code than the patch you present...
Jamal Hadi Salim April 19, 2017, 1:37 a.m. UTC | #4
On 17-04-18 06:12 PM, Jakub Kicinski wrote:

> Noob question - would it be an option to establish some form of
> streaming statistic updates out of the kernel?

It would be more efficient, certainly. Current polling approach is not
as bad though (you send one extra kernel message) and is more reliable
(events generated by the kernel are not reliably delivered to user
space).

> The notion of user
> space trying to keep track of the time here seems a bit shaky.

You mean the patch I sent or the suggestion you are making would be
more shaky?
I see both being fine from that perspective - you don't need 100%
accuracy. Just something that is within reason of a small delta
of time.

>Would
> it be an option with netlink to open a socket and request specific
> set of statistic to be dumped on it periodically if they changed?
> I guess it would be way more code that the patch you present...

Sensible, but like you said, more code.
For efficiency you will need to have some tweakables.
Possibly something that could specify "send me X events or whatever
you've accumulated in Y seconds" etc.

Note: Here we open one socket;
and every X seconds we ask the kernel to give us whatever changed
in the last X seconds or so.

There are some drawbacks. If it takes you longer than X seconds to
walk over everything then you are in trouble. The earlier patch
speeds us up a bit. And I have some other rough patches I am
working on where we could have multiple threads asking for both time
and specific index ranges to make sure we got everything in time.
For a couple of million actions, I haven't found a big need for
those yet;->

cheers,
jamal
Jakub Kicinski April 19, 2017, 3:55 a.m. UTC | #5
On Tue, 18 Apr 2017 21:37:12 -0400, Jamal Hadi Salim wrote:
> On 17-04-18 06:12 PM, Jakub Kicinski wrote:
> 
> > Noob question - would it be an option to establish some form of
> > streaming statistic updates out of the kernel?  
> 
> It would be more efficient, certainly. Current polling approach is not
> as bad though (you send one extra kernel message) and is more reliable
> (events generated by the kernel are not reliably delivered to user
> space).

I see.

> > The notion of user
> > space trying to keep track of the time here seems a bit shaky.  
> 
> You mean the patch i sent or suggestion you are making would be
> more shaky?
> I see both being fine from that perspective - you dont need 100%
> accuracy. Just something that is within reason of a small delta
> of time.

I'm just referring to the theoretical possibility that if the dumping
process gets preempted for long enough you may lose samples.  Just
because the dumping process cannot control when the kernel executes this
line:

	jiffy_wanted = jiffies - jiffy_msecs;

It could in theory be a few seconds after the request was made.  Perhaps
using timestamps from a proper time source instead of the notion of
"last X seconds" would solve that?
Jamal Hadi Salim April 19, 2017, 10:44 a.m. UTC | #6
On 17-04-18 11:55 PM, Jakub Kicinski wrote:
> On Tue, 18 Apr 2017 21:37:12 -0400, Jamal Hadi Salim wrote:
>> On 17-04-18 06:12 PM, Jakub Kicinski wrote:

[..]
>> I see both being fine from that perspective - you dont need 100%
>> accuracy. Just something that is within reason of a small delta
>> of time.
>
> I'm just referring to the theoretical possibility that if the dumping
> process gets preempted for long enough you may loose samples.  Just
> because the dumping process cannot control when kernel executes this
> line:
>
> 	jiffy_wanted = jiffies - jiffy_msecs;
>
> It could in theory be few seconds after the request was made.  Perhaps
> using timestamps from a proper time sources instead of the notion of
> "last X seconds" would solve that?


Good point, which I didn't mention as part of the drawbacks. This
is a tradeoff. We don't need to be 100% accurate[1].
The timestamps on the action entries in the kernel are in jiffies;
it is a lot simpler to do a jiffy comparison. If you used a different
timestamp source you'd need to convert for every comparison you
make (I am not sure how costly that is when you have many actions).

In use cases I am familiar with, there is a user process app which
opens the socket once and issues dumps every X seconds (ranging from
5-120 seconds). So we will re-issue the dump regardless. Yes,
it would be an issue if said application keeps getting pre-empted
and that jiffy computation was always off - but I am not sure under
what circumstances that could be a common scenario.

cheers,
jamal

[1] As an example, dumps are never 100% accurate: you could iterate
  over something that then changes while you are in the middle of dumping,
  which then renders an already dumped entity obsolete.
diff mbox

Patch

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 90cc774..4dd55f2 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -84,11 +84,13 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 {
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
 	unsigned short act_flags = cb->args[2];
+	unsigned long jiffy_filter = cb->args[3];
+
 	struct nlattr *nest;
 
 	spin_lock_bh(&hinfo->lock);
 
-	s_i = cb->args[0];
+	s_i = cb->args[4];
 
 	for (i = 0; i < (hinfo->hmask + 1); i++) {
 		struct hlist_head *head;
@@ -101,6 +103,12 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			if (index < s_i)
 				continue;
 
+			if (jiffy_filter &&
+			    time_after(jiffy_filter,
+				       (unsigned long)p->tcfa_tm.lastuse)) {
+				continue;
+			}
+
 			nest = nla_nest_start(skb, n_i);
 			if (nest == NULL)
 				goto nla_put_failure;
@@ -118,6 +126,8 @@  static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 		}
 	}
 done:
+	if (index > 0)
+		cb->args[4] = index + 1;
 	spin_unlock_bh(&hinfo->lock);
 	if (n_i) {
 		cb->args[0] += n_i;
@@ -1086,8 +1096,10 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	struct tc_action_ops *a_o;
 	int ret = 0;
 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
-	unsigned char act_flags = t->tca__pad1;
 	struct nlattr *kind = find_dump_kind(cb->nlh);
+	unsigned char act_flags = t->tca__pad1;
+	unsigned short secs = t->tca__pad2;
+	unsigned long jiffy_wanted = 0;
 
 	if (kind == NULL) {
 		pr_info("tc_dump_action: action bad kind\n");
@@ -1103,7 +1115,15 @@  static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (!nlh)
 		goto out_module_put;
 
+	if (act_flags & ACT_FILTER_TIME_ACCESS) {
+		const unsigned int m = secs * 1000L;
+		unsigned long jiffy_msecs = msecs_to_jiffies(m);
+
+		jiffy_wanted = jiffies - jiffy_msecs;
+	}
+
 	cb->args[2] = act_flags;
+	cb->args[3] = jiffy_wanted;
 
 	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;