diff mbox

[net-next] net/sched: cls_flower: Add user specified data

Message ID 6b671aaf-d35d-70a5-65e0-40308baeb471@mojatatu.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Jamal Hadi Salim Jan. 2, 2017, 10:21 p.m. UTC
On 17-01-02 01:23 PM, John Fastabend wrote:

>
> Additionally I would like to point out this is an arbitrary length binary
> blob (for undefined use, without even a specified encoding) that gets pushed
> between user space and hardware ;) This seemed to get folks fairly excited in
> the past.
>

The binary blob size is a little strange - but i think there is value
in storing some "cookie" field. The challenge is whether the kernel
gets to intepret it; in which case encoding must be specified. Or
whether we should leave it up to user space - in which something
like tc could standardize its own encodings.

> Some questions, exactly what do you mean by "port mappings" above? In
> general the 'tc' API uses the netdev the netlink msg is processed on as
> the port mapping. If you mean OVS port to netdev port I think this is
> a OVS problem and nothing to do with 'tc'. For what its worth there is an
> existing problem with 'tc' where rules only apply to a single ingress or
> egress port which is limiting on hardware.
>

In our case the desire is to be able to correlate for a system wide
mostly identity/key mapping.

> The UFID in my ovs code base is defined as best I can tell here,
>
>         [OVS_FLOW_ATTR_UFID] = { .type = NL_A_UNSPEC, .optional = true,
>                                  .min_len = sizeof(ovs_u128) },
>
> So you need 128 bits if you want a 1:1 mapping onto 'tc'. So rather
> than an arbitrary blob why not make the case that 'tc' ids need to be
> 128 bits long? Even if its just initially done in flower call it
> flower_flow_id and define it so its not opaque and at least at the code
> level it isn't an arbitrary blob of data.
>

I dont know what this UFID is, but do note:
The idea is not new - the FIB for example has some such cookie
(albeit a tiny one) which will typically be populated to tell
you who/what installed the entry.
I could see f.e use for this cookie to simplify and pretty print in
a human language for the u32 classifier (i.e user space tc sets
some fields in the cookie when updating kernel and when user space
invokes get/dump it uses the cookie to intepret how to pretty print).

I have attached a compile tested version of the cookies on actions
(flat 64 bit; now that we have experienced the use when we have a
large number of counters - I would not mind a 128 bit field).


cheers,
jamal

> And what are the "next" uses of this besides OVS. It would be really
> valuable to see how this generalizes to other usage models. To avoid
> embedding OVS syntax into 'tc'.
>
> Finally if you want to see an example of binary data encodings look at
> how drivers/hardware/users are currently using the user defined bits in
> ethtools ntuple API. Also track down out of tree drivers to see other
> interesting uses. And that was capped at 64bits :/
>
> Thanks,
> John
>
>
>
>
>

commit 0a6fd6b024db77e3a460c22ab8a496a714bc71b7
Author: Jamal Hadi Salim <hadi@mojatatu.com>
Date:   Fri Aug 12 06:10:46 2016 -0400

    actions: add support for cookies
    
    Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>

diff --git a/tc/m_action.c b/tc/m_action.c
index c416d98..75d1a5a 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -137,8 +137,7 @@ noexist:
 	return a;
 }
 
-static int
-new_cmd(char **argv)
+static int new_cmd(char **argv)
 {
 	if ((matches(*argv, "change") == 0) ||
 		(matches(*argv, "replace") == 0) ||
@@ -151,8 +150,7 @@ new_cmd(char **argv)
 
 }
 
-int
-parse_action(int *argc_p, char ***argv_p, int tca_id, struct nlmsghdr *n)
+int parse_action(int *argc_p, char ***argv_p, int tca_id, struct nlmsghdr *n)
 {
 	int argc = *argc_p;
 	char **argv = *argv_p;
@@ -160,6 +158,7 @@ parse_action(int *argc_p, char ***argv_p, int tca_id, struct nlmsghdr *n)
 	char k[16];
 	int ok = 0;
 	int eap = 0; /* expect action parameters */
+	__u64 act_ck = 0;
 
 	int ret = 0;
 	int prio = 0;
@@ -215,13 +214,28 @@ done0:
 			tail = NLMSG_TAIL(n);
 			addattr_l(n, MAX_MSG, ++prio, NULL, 0);
 			addattr_l(n, MAX_MSG, TCA_ACT_KIND, k, strlen(k) + 1);
-
-			ret = a->parse_aopt(a, &argc, &argv, TCA_ACT_OPTIONS, n);
+			ret = a->parse_aopt(a, &argc, &argv, TCA_ACT_OPTIONS,
+					    n);
 
 			if (ret < 0) {
-				fprintf(stderr, "bad action parsing\n");
+				fprintf(stderr, "bad action option parsing\n");
 				goto bad_val;
 			}
+
+			if (*argv && strcmp(*argv, "cookie") == 0) {
+				NEXT_ARG();
+				ret = get_u64(&act_ck, *argv, 0);
+				if (ret) {
+					fprintf(stderr, "bad cookie <%s>\n",
+						*argv);
+					goto bad_val;
+				}
+				argc--;
+				argv++;
+			}
+
+			if (act_ck)
+				addattr64(n, MAX_MSG, TCA_ACT_COOKIE, act_ck);
 			tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
 			ok++;
 		}
@@ -246,8 +260,7 @@ bad_val:
 	return -1;
 }
 
-static int
-tc_print_one_action(FILE *f, struct rtattr *arg)
+static int tc_print_one_action(FILE *f, struct rtattr *arg)
 {
 
 	struct rtattr *tb[TCA_ACT_MAX + 1];
@@ -277,6 +290,10 @@ tc_print_one_action(FILE *f, struct rtattr *arg)
 	if (show_stats && tb[TCA_ACT_STATS]) {
 		fprintf(f, "\tAction statistics:\n");
 		print_tcstats2_attr(f, tb[TCA_ACT_STATS], "\t", NULL);
+		if (tb[TCA_ACT_COOKIE]) {
+			__u64 acookie = rta_getattr_u64(tb[TCA_ACT_COOKIE]);
+			fprintf(f, "cookie 0x%llx ",  acookie);
+		}
 		fprintf(f, "\n");
 	}

Comments

John Fastabend Jan. 2, 2017, 10:58 p.m. UTC | #1
On 17-01-02 02:21 PM, Jamal Hadi Salim wrote:
> On 17-01-02 01:23 PM, John Fastabend wrote:
> 
>>
>> Additionally I would like to point out this is an arbitrary length binary
>> blob (for undefined use, without even a specified encoding) that gets pushed
>> between user space and hardware ;) This seemed to get folks fairly excited in
>> the past.
>>
> 
> The binary blob size is a little strange - but i think there is value
> in storing some "cookie" field. The challenge is whether the kernel
> gets to intepret it; in which case encoding must be specified. Or
> whether we should leave it up to user space - in which something
> like tc could standardize its own encodings.
> 

Well having the length value avoids ending up with cookie1, cookie2, ...
values as folks push more and more data into the cookie.

I don't see any use in the kernel interpreting it. It has no use
for it as far as I can see. It doesn't appear to be metadata which
we use skb->mark for at the moment.

>> Some questions, exactly what do you mean by "port mappings" above? In
>> general the 'tc' API uses the netdev the netlink msg is processed on as
>> the port mapping. If you mean OVS port to netdev port I think this is
>> a OVS problem and nothing to do with 'tc'. For what its worth there is an
>> existing problem with 'tc' where rules only apply to a single ingress or
>> egress port which is limiting on hardware.
>>
> 
> In our case the desire is to be able to correlate for a system wide
> mostly identity/key mapping.
> 

The tuple <ifindex:qdisc:prio:handle> really should be unique why
not use this for system wide mappings?

The only thing I can think to do with this that I can't do with
the above tuple and a simple userspace lookup is stick hardware specific
"hints" in the cookie for the firmware to consume. Which would be
very helpful for what its worth.

>> The UFID in my ovs code base is defined as best I can tell here,
>>
>>         [OVS_FLOW_ATTR_UFID] = { .type = NL_A_UNSPEC, .optional = true,
>>                                  .min_len = sizeof(ovs_u128) },
>>
>> So you need 128 bits if you want a 1:1 mapping onto 'tc'. So rather
>> than an arbitrary blob why not make the case that 'tc' ids need to be
>> 128 bits long? Even if its just initially done in flower call it
>> flower_flow_id and define it so its not opaque and at least at the code
>> level it isn't an arbitrary blob of data.
>>
> 
> I dont know what this UFID is, but do note:
> The idea is not new - the FIB for example has some such cookie
> (albeit a tiny one) which will typically be populated to tell
> you who/what installed the entry.
> I could see f.e use for this cookie to simplify and pretty print in
> a human language for the u32 classifier (i.e user space tc sets
> some fields in the cookie when updating kernel and when user space
> invokes get/dump it uses the cookie to intepret how to pretty print).
> 
> I have attached a compile tested version of the cookies on actions
> (flat 64 bit; now that we have experienced the use when we have a
> large number of counters - I would not mind a 128 bit field).
> 

Its a bit strange to push it as an action when its not really an
action in the traditional datapath.

I suspect the OVS usage is a simple 1:1 lookup from OVS id to TC id to
avoid a userspace map lookup.

> 
> cheers,
> jamal
> 
>> And what are the "next" uses of this besides OVS. It would be really
>> valuable to see how this generalizes to other usage models. To avoid
>> embedding OVS syntax into 'tc'.
>>
>> Finally if you want to see an example of binary data encodings look at
>> how drivers/hardware/users are currently using the user defined bits in
>> ethtools ntuple API. Also track down out of tree drivers to see other
>> interesting uses. And that was capped at 64bits :/
>>
>> Thanks,
>> John
>>
>>
>>
>>
>>
>
Jamal Hadi Salim Jan. 3, 2017, 1:22 a.m. UTC | #2
On 17-01-02 05:58 PM, John Fastabend wrote:
> On 17-01-02 02:21 PM, Jamal Hadi Salim wrote:


> Well having the length value avoids ending up with cookie1, cookie2, ...
> values as folks push more and more data into the cookie.
>

Unless there is good reason I dont see why it shouldnt be a fixed length
value. u64/128 should be plenty.

> I don't see any use in the kernel interpreting it. It has no use
> for it as far as I can see. It doesn't appear to be metadata which
> we use skb->mark for at the moment.
>

Like all cookie semantics it is for storing state. The receiver (kernel)
is not just store it and not intepret it. The user when reading it back
simplifies what they have to do for their processing.

>
> The tuple <ifindex:qdisc:prio:handle> really should be unique why
> not use this for system wide mappings?
>

I think on a single machine should be enough, however:
typically the user wants to define the value in a manner that
in a distributed system it is unique. It would be trickier to
do so with well defined values such as above.


> The only thing I can think to do with this that I can't do with
> the above tuple and a simple userspace lookup is stick hardware specific
> "hints" in the cookie for the firmware to consume. Which would be
> very helpful for what its worth.
>

Ok, very different from our use case with actions.
We just use those values to map to stats info without needing to
know what flow or action (graph) it is associated with.

> Its a bit strange to push it as an action when its not really an
> action in the traditional datapath.
>

The action is part of a graph pointed to by a filter.

> I suspect the OVS usage is a simple 1:1 lookup from OVS id to TC id to
> avoid a userspace map lookup.

Not that i care about OVS but it sounds like a good use case (even for
tc), no?

cheers,
jamal
John Fastabend Jan. 3, 2017, 4:33 a.m. UTC | #3
On 17-01-02 05:22 PM, Jamal Hadi Salim wrote:
> On 17-01-02 05:58 PM, John Fastabend wrote:
>> On 17-01-02 02:21 PM, Jamal Hadi Salim wrote:
> 
> 
>> Well having the length value avoids ending up with cookie1, cookie2, ...
>> values as folks push more and more data into the cookie.
>>
> 
> Unless there is good reason I dont see why it shouldnt be a fixed length
> value. u64/128 should be plenty.
> 
>> I don't see any use in the kernel interpreting it. It has no use
>> for it as far as I can see. It doesn't appear to be metadata which
>> we use skb->mark for at the moment.
>>
> 
> Like all cookie semantics it is for storing state. The receiver (kernel)
> is not just store it and not intepret it. The user when reading it back
> simplifies what they have to do for their processing.
> 
>>
>> The tuple <ifindex:qdisc:prio:handle> really should be unique why
>> not use this for system wide mappings?
>>
> 
> I think on a single machine should be enough, however:
> typically the user wants to define the value in a manner that
> in a distributed system it is unique. It would be trickier to
> do so with well defined values such as above.
> 

Just extend the tuple <hostname:ifindex:qdisc:prio:handle> that
should be unique in the domain of hostname's, or use some other domain
wide machine identifier.

> 
>> The only thing I can think to do with this that I can't do with
>> the above tuple and a simple userspace lookup is stick hardware specific
>> "hints" in the cookie for the firmware to consume. Which would be
>> very helpful for what its worth.
>>
> 
> Ok, very different from our use case with actions.
> We just use those values to map to stats info without needing to
> know what flow or action (graph) it is associated with.
> 

Sure.

>> Its a bit strange to push it as an action when its not really an
>> action in the traditional datapath.
>>
> 
> The action is part of a graph pointed to by a filter.

Although actions can be shared so the cookie can be shared across
filters. Maybe its useful but it doesn't uniquely identify a filter
in the shared case but the user would have to specify that case
so maybe its not important.

> 
>> I suspect the OVS usage is a simple 1:1 lookup from OVS id to TC id to
>> avoid a userspace map lookup.
> 
> Not that i care about OVS but it sounds like a good use case (even for
> tc), no?

I'm not opposed to it. Just pushing on the use case a bit to understand.

> 
> cheers,
> jamal
Jamal Hadi Salim Jan. 3, 2017, 11:44 a.m. UTC | #4
On 17-01-02 11:33 PM, John Fastabend wrote:
> On 17-01-02 05:22 PM, Jamal Hadi Salim wrote:

[..]
>> Like all cookie semantics it is for storing state. The receiver (kernel)
>> is not just store it and not intepret it. The user when reading it back
>> simplifies what they have to do for their processing.
>>
>>>
>>> The tuple <ifindex:qdisc:prio:handle> really should be unique why
>>> not use this for system wide mappings?
>>>
>>
>> I think on a single machine should be enough, however:
>> typically the user wants to define the value in a manner that
>> in a distributed system it is unique. It would be trickier to
>> do so with well defined values such as above.
>>
>
> Just extend the tuple <hostname:ifindex:qdisc:prio:handle> that
> should be unique in the domain of hostname's, or use some other domain
> wide machine identifier.
>

May work for the case of filter identification. The nice thing for
allowing cookies is you can let the user define it define their
own scheme.

> Although actions can be shared so the cookie can be shared across
> filters. Maybe its useful but it doesn't uniquely identify a filter
> in the shared case but the user would have to specify that case
> so maybe its not important.
>

Note: the action cookies and filter cookies are unrelated/orthogonal.
Their basic concept of stashing something in the cookie to help improve
what user space does (in our case millions of actions of which some are
used for accounting) is similar.
I have no objections to the flow cookies; my main concern was it should
be applicable to all classifiers not just flower. And the arbitrary size
of the cookie that you pointed out is questionable.

cheers,
jamal
Paul Blakey Jan. 3, 2017, 12:22 p.m. UTC | #5
On 03/01/2017 13:44, Jamal Hadi Salim wrote:
> On 17-01-02 11:33 PM, John Fastabend wrote:
>> On 17-01-02 05:22 PM, Jamal Hadi Salim wrote:
>
> [..]
>>> Like all cookie semantics it is for storing state. The receiver 
>>> (kernel)
>>> is not just store it and not intepret it. The user when reading it back
>>> simplifies what they have to do for their processing.
>>>
>>>>
>>>> The tuple <ifindex:qdisc:prio:handle> really should be unique why
>>>> not use this for system wide mappings?
>>>>
>>>
>>> I think on a single machine should be enough, however:
>>> typically the user wants to define the value in a manner that
>>> in a distributed system it is unique. It would be trickier to
>>> do so with well defined values such as above.
>>>
>>
>> Just extend the tuple <hostname:ifindex:qdisc:prio:handle> that
>> should be unique in the domain of hostname's, or use some other domain
>> wide machine identifier.
>>
>
> May work for the case of filter identification. The nice thing for
> allowing cookies is you can let the user define it define their
> own scheme.
>
>> Although actions can be shared so the cookie can be shared across
>> filters. Maybe its useful but it doesn't uniquely identify a filter
>> in the shared case but the user would have to specify that case
>> so maybe its not important.
>>
>
> Note: the action cookies and filter cookies are unrelated/orthogonal.
> Their basic concept of stashing something in the cookie to help improve
> what user space does (in our case millions of actions of which some are
> used for accounting) is similar.
> I have no objections to the flow cookies; my main concern was it should
> be applicable to all classifiers not just flower. And the arbitrary size
> of the cookie that you pointed out is questionable.
>
> cheers,
> jamal


Hi all,
Our use case is replacing OVS rules with TC filters for HW offload, and 
you're are right the cookie would
have saved us the mapping from OVS rule ufid to the tc filter 
handle/prio... that was generated for it.
It also was going to be used to store other info like which OVS output 
port corresponds to the ifindex,
so we need 128+32 for now. It helps us with dumping the the flows back, 
when we lose data on crash
or restarting the user space daemon.
HW hints is another thing that might be helpful.
Its binary blob because user/app specifc and its usage might change in 
the future and its and that's why there
is some headroom with size as well.


I don't mind having it on TC level but I didn't want to intervene with 
all filters/TC.
Simon Horman Jan. 4, 2017, 10:14 a.m. UTC | #6
On Tue, Jan 03, 2017 at 02:22:05PM +0200, Paul Blakey wrote:
> 
> 
> On 03/01/2017 13:44, Jamal Hadi Salim wrote:
> >On 17-01-02 11:33 PM, John Fastabend wrote:
> >>On 17-01-02 05:22 PM, Jamal Hadi Salim wrote:
> >
> >[..]
> >>>Like all cookie semantics it is for storing state. The receiver
> >>>(kernel)
> >>>is not just store it and not intepret it. The user when reading it back
> >>>simplifies what they have to do for their processing.
> >>>
> >>>>
> >>>>The tuple <ifindex:qdisc:prio:handle> really should be unique why
> >>>>not use this for system wide mappings?
> >>>>
> >>>
> >>>I think on a single machine should be enough, however:
> >>>typically the user wants to define the value in a manner that
> >>>in a distributed system it is unique. It would be trickier to
> >>>do so with well defined values such as above.
> >>>
> >>
> >>Just extend the tuple <hostname:ifindex:qdisc:prio:handle> that
> >>should be unique in the domain of hostname's, or use some other domain
> >>wide machine identifier.
> >>
> >
> >May work for the case of filter identification. The nice thing for
> >allowing cookies is you can let the user define it define their
> >own scheme.
> >
> >>Although actions can be shared so the cookie can be shared across
> >>filters. Maybe its useful but it doesn't uniquely identify a filter
> >>in the shared case but the user would have to specify that case
> >>so maybe its not important.
> >>
> >
> >Note: the action cookies and filter cookies are unrelated/orthogonal.
> >Their basic concept of stashing something in the cookie to help improve
> >what user space does (in our case millions of actions of which some are
> >used for accounting) is similar.
> >I have no objections to the flow cookies; my main concern was it should
> >be applicable to all classifiers not just flower. And the arbitrary size
> >of the cookie that you pointed out is questionable.
> >
> >cheers,
> >jamal
> 
> 
> Hi all,
> Our use case is replacing OVS rules with TC filters for HW offload, and
> you're are right the cookie would
> have saved us the mapping from OVS rule ufid to the tc filter handle/prio...
> that was generated for it.
> It also was going to be used to store other info like which OVS output port
> corresponds to the ifindex,

Possibly off-topic but I am curious to know why you need to store the port.
My possibly naïve assumption is that a filter is attached to the netdev
corresponding to the input port and mirred or other actions are used to output
to netdevs corresponding to output ports.

> so we need 128+32 for now. It helps us with dumping the the flows back, when
> we lose data on crash
> or restarting the user space daemon.
> HW hints is another thing that might be helpful.
> Its binary blob because user/app specifc and its usage might change in the
> future and its and that's why there
> is some headroom with size as well.
Paul Blakey Jan. 4, 2017, 11:45 a.m. UTC | #7
On 04/01/2017 12:14, Simon Horman wrote:
> On Tue, Jan 03, 2017 at 02:22:05PM +0200, Paul Blakey wrote:
>>
>> On 03/01/2017 13:44, Jamal Hadi Salim wrote:
>>> On 17-01-02 11:33 PM, John Fastabend wrote:
>>>> On 17-01-02 05:22 PM, Jamal Hadi Salim wrote:
>>> [..]
>>>>> Like all cookie semantics it is for storing state. The receiver
>>>>> (kernel)
>>>>> is not just store it and not intepret it. The user when reading it back
>>>>> simplifies what they have to do for their processing.
>>>>>
>>>>>> The tuple <ifindex:qdisc:prio:handle> really should be unique why
>>>>>> not use this for system wide mappings?
>>>>>>
>>>>> I think on a single machine should be enough, however:
>>>>> typically the user wants to define the value in a manner that
>>>>> in a distributed system it is unique. It would be trickier to
>>>>> do so with well defined values such as above.
>>>>>
>>>> Just extend the tuple <hostname:ifindex:qdisc:prio:handle> that
>>>> should be unique in the domain of hostname's, or use some other domain
>>>> wide machine identifier.
>>>>
>>> May work for the case of filter identification. The nice thing for
>>> allowing cookies is you can let the user define it define their
>>> own scheme.
>>>
>>>> Although actions can be shared so the cookie can be shared across
>>>> filters. Maybe its useful but it doesn't uniquely identify a filter
>>>> in the shared case but the user would have to specify that case
>>>> so maybe its not important.
>>>>
>>> Note: the action cookies and filter cookies are unrelated/orthogonal.
>>> Their basic concept of stashing something in the cookie to help improve
>>> what user space does (in our case millions of actions of which some are
>>> used for accounting) is similar.
>>> I have no objections to the flow cookies; my main concern was it should
>>> be applicable to all classifiers not just flower. And the arbitrary size
>>> of the cookie that you pointed out is questionable.
>>>
>>> cheers,
>>> jamal
>>
>> Hi all,
>> Our use case is replacing OVS rules with TC filters for HW offload, and
>> you're are right the cookie would
>> have saved us the mapping from OVS rule ufid to the tc filter handle/prio...
>> that was generated for it.
>> It also was going to be used to store other info like which OVS output port
>> corresponds to the ifindex,
> Possibly off-topic but I am curious to know why you need to store the port.
> My possibly naïve assumption is that a filter is attached to the netdev
> corresponding to the input port and mirred or other actions are used to output
> to netdevs corresponding to output ports.

Right, its for the output ports, OVS uses ovs port numbers and mirred 
action uses the device ifindex, so there is need
to translate it back to OVS port on dump.

>
>> so we need 128+32 for now. It helps us with dumping the the flows back, when
>> we lose data on crash
>> or restarting the user space daemon.
>> HW hints is another thing that might be helpful.
>> Its binary blob because user/app specifc and its usage might change in the
>> future and its and that's why there
>> is some headroom with size as well.
Simon Horman Jan. 5, 2017, 8:03 a.m. UTC | #8
On Wed, Jan 04, 2017 at 01:45:28PM +0200, Paul Blakey wrote:
> On 04/01/2017 12:14, Simon Horman wrote:
> >On Tue, Jan 03, 2017 at 02:22:05PM +0200, Paul Blakey wrote:
> >>
> >>On 03/01/2017 13:44, Jamal Hadi Salim wrote:
> >>>On 17-01-02 11:33 PM, John Fastabend wrote:
> >>>>On 17-01-02 05:22 PM, Jamal Hadi Salim wrote:
> >>>[..]
> >>>>>Like all cookie semantics it is for storing state. The receiver
> >>>>>(kernel)
> >>>>>is not just store it and not intepret it. The user when reading it back
> >>>>>simplifies what they have to do for their processing.
> >>>>>
> >>>>>>The tuple <ifindex:qdisc:prio:handle> really should be unique why
> >>>>>>not use this for system wide mappings?
> >>>>>>
> >>>>>I think on a single machine should be enough, however:
> >>>>>typically the user wants to define the value in a manner that
> >>>>>in a distributed system it is unique. It would be trickier to
> >>>>>do so with well defined values such as above.
> >>>>>
> >>>>Just extend the tuple <hostname:ifindex:qdisc:prio:handle> that
> >>>>should be unique in the domain of hostname's, or use some other domain
> >>>>wide machine identifier.
> >>>>
> >>>May work for the case of filter identification. The nice thing for
> >>>allowing cookies is you can let the user define it define their
> >>>own scheme.
> >>>
> >>>>Although actions can be shared so the cookie can be shared across
> >>>>filters. Maybe its useful but it doesn't uniquely identify a filter
> >>>>in the shared case but the user would have to specify that case
> >>>>so maybe its not important.
> >>>>
> >>>Note: the action cookies and filter cookies are unrelated/orthogonal.
> >>>Their basic concept of stashing something in the cookie to help improve
> >>>what user space does (in our case millions of actions of which some are
> >>>used for accounting) is similar.
> >>>I have no objections to the flow cookies; my main concern was it should
> >>>be applicable to all classifiers not just flower. And the arbitrary size
> >>>of the cookie that you pointed out is questionable.
> >>>
> >>>cheers,
> >>>jamal
> >>
> >>Hi all,
> >>Our use case is replacing OVS rules with TC filters for HW offload, and
> >>you're are right the cookie would
> >>have saved us the mapping from OVS rule ufid to the tc filter handle/prio...
> >>that was generated for it.
> >>It also was going to be used to store other info like which OVS output port
> >>corresponds to the ifindex,
> >Possibly off-topic but I am curious to know why you need to store the port.
> >My possibly naïve assumption is that a filter is attached to the netdev
> >corresponding to the input port and mirred or other actions are used to output
> >to netdevs corresponding to output ports.
> 
> Right, its for the output ports, OVS uses ovs port numbers and mirred action
> uses the device ifindex, so there is need
> to translate it back to OVS port on dump.

Understood, that is a tedious abstraction to support.
But I don't see an easy way around it at this time.

If I read Jamal's emails correctly he is working on per-action cookies.
They may be better than per-flow cookies for storing the OvS port number
(though not the UUID of the flow).

...
Jiri Pirko Jan. 8, 2017, 5:19 p.m. UTC | #9
Mon, Jan 02, 2017 at 11:21:41PM CET, jhs@mojatatu.com wrote:
>On 17-01-02 01:23 PM, John Fastabend wrote:
>
>> 
>> Additionally I would like to point out this is an arbitrary length binary
>> blob (for undefined use, without even a specified encoding) that gets pushed
>> between user space and hardware ;) This seemed to get folks fairly excited in
>> the past.
>> 
>
>The binary blob size is a little strange - but i think there is value
>in storing some "cookie" field. The challenge is whether the kernel
>gets to intepret it; in which case encoding must be specified. Or
>whether we should leave it up to user space - in which something
>like tc could standardize its own encodings.

This should never be interpreted by kernel. I think this would be good
to make clear in the comment in the code.


>
>> Some questions, exactly what do you mean by "port mappings" above? In
>> general the 'tc' API uses the netdev the netlink msg is processed on as
>> the port mapping. If you mean OVS port to netdev port I think this is
>> a OVS problem and nothing to do with 'tc'. For what its worth there is an
>> existing problem with 'tc' where rules only apply to a single ingress or
>> egress port which is limiting on hardware.
>> 
>
>In our case the desire is to be able to correlate for a system wide
>mostly identity/key mapping.
>
>> The UFID in my ovs code base is defined as best I can tell here,
>> 
>>         [OVS_FLOW_ATTR_UFID] = { .type = NL_A_UNSPEC, .optional = true,
>>                                  .min_len = sizeof(ovs_u128) },
>> 
>> So you need 128 bits if you want a 1:1 mapping onto 'tc'. So rather
>> than an arbitrary blob why not make the case that 'tc' ids need to be
>> 128 bits long? Even if its just initially done in flower call it
>> flower_flow_id and define it so its not opaque and at least at the code
>> level it isn't an arbitrary blob of data.
>> 
>
>I dont know what this UFID is, but do note:
>The idea is not new - the FIB for example has some such cookie
>(albeit a tiny one) which will typically be populated to tell
>you who/what installed the entry.
>I could see f.e use for this cookie to simplify and pretty print in
>a human language for the u32 classifier (i.e user space tc sets
>some fields in the cookie when updating kernel and when user space
>invokes get/dump it uses the cookie to intepret how to pretty print).
>
>I have attached a compile tested version of the cookies on actions
>(flat 64 bit; now that we have experienced the use when we have a
>large number of counters - I would not mind a 128 bit field).
>
>
>cheers,
>jamal
>
>> And what are the "next" uses of this besides OVS. It would be really
>> valuable to see how this generalizes to other usage models. To avoid
>> embedding OVS syntax into 'tc'.
>> 
>> Finally if you want to see an example of binary data encodings look at
>> how drivers/hardware/users are currently using the user defined bits in
>> ethtools ntuple API. Also track down out of tree drivers to see other
>> interesting uses. And that was capped at 64bits :/
>> 
>> Thanks,
>> John
>> 
>> 
>> 
>> 
>> 
>

>diff --git a/include/net/act_api.h b/include/net/act_api.h
>index 1d71644..f299ed3 100644
>--- a/include/net/act_api.h
>+++ b/include/net/act_api.h
>@@ -41,6 +41,7 @@ struct tc_action {
> 	struct rcu_head			tcfa_rcu;
> 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
> 	struct gnet_stats_queue __percpu *cpu_qstats;
>+	u64	cookie;
> };
> #define tcf_head	common.tcfa_head
> #define tcf_index	common.tcfa_index
>diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
>index cb4bcdc..2e968ee 100644
>--- a/include/uapi/linux/pkt_cls.h
>+++ b/include/uapi/linux/pkt_cls.h
>@@ -67,6 +67,7 @@ enum {
> 	TCA_ACT_INDEX,
> 	TCA_ACT_STATS,
> 	TCA_ACT_PAD,
>+	TCA_ACT_COOKIE,
> 	__TCA_ACT_MAX
> };
> 
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index 2095c83..97eae6b 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -26,6 +26,7 @@
> #include <net/sch_generic.h>
> #include <net/act_api.h>
> #include <net/netlink.h>
>+#include <net/tc_act/tc_gact.h>
> 
> static void free_tcf(struct rcu_head *head)
> {
>@@ -467,17 +468,21 @@ int tcf_action_destroy(struct list_head *actions, int bind)
> 	return a->ops->dump(skb, a, bind, ref);
> }
> 
>-int
>-tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
>+int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind,
>+		      int ref)
> {
> 	int err = -EINVAL;
> 	unsigned char *b = skb_tail_pointer(skb);
> 	struct nlattr *nest;
>+	u64 cookie = a->cookie;
> 
> 	if (nla_put_string(skb, TCA_KIND, a->ops->kind))
> 		goto nla_put_failure;
> 	if (tcf_action_copy_stats(skb, a, 0))
> 		goto nla_put_failure;
>+	if (nla_put_u64_64bit(skb, TCA_ACT_COOKIE, cookie, TCA_ACT_PAD))
>+		goto nla_put_failure;
>+
> 	nest = nla_nest_start(skb, TCA_OPTIONS);
> 	if (nest == NULL)
> 		goto nla_put_failure;
>@@ -578,6 +583,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
> 	if (err < 0)
> 		goto err_mod;
> 
>+	if (tb[TCA_ACT_COOKIE])
>+		a->cookie = nla_get_u64(tb[TCA_ACT_COOKIE]);
>+	else
>+		a->cookie = 0; /* kernel uses 0 */
>+
> 	/* module count goes up only when brand new policy is created
> 	 * if it exists and is only bound to in a_o->init() then
> 	 * ACT_P_CREATED is not returned (a zero is).

>commit 0a6fd6b024db77e3a460c22ab8a496a714bc71b7
>Author: Jamal Hadi Salim <hadi@mojatatu.com>
>Date:   Fri Aug 12 06:10:46 2016 -0400
>
>    actions: add support for cookies
>    
>    Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>
>diff --git a/tc/m_action.c b/tc/m_action.c
>index c416d98..75d1a5a 100644
>--- a/tc/m_action.c
>+++ b/tc/m_action.c
>@@ -137,8 +137,7 @@ noexist:
> 	return a;
> }
> 
>-static int
>-new_cmd(char **argv)
>+static int new_cmd(char **argv)
> {
> 	if ((matches(*argv, "change") == 0) ||
> 		(matches(*argv, "replace") == 0) ||
>@@ -151,8 +150,7 @@ new_cmd(char **argv)
> 
> }
> 
>-int
>-parse_action(int *argc_p, char ***argv_p, int tca_id, struct nlmsghdr *n)
>+int parse_action(int *argc_p, char ***argv_p, int tca_id, struct nlmsghdr *n)
> {
> 	int argc = *argc_p;
> 	char **argv = *argv_p;
>@@ -160,6 +158,7 @@ parse_action(int *argc_p, char ***argv_p, int tca_id, struct nlmsghdr *n)
> 	char k[16];
> 	int ok = 0;
> 	int eap = 0; /* expect action parameters */
>+	__u64 act_ck = 0;
> 
> 	int ret = 0;
> 	int prio = 0;
>@@ -215,13 +214,28 @@ done0:
> 			tail = NLMSG_TAIL(n);
> 			addattr_l(n, MAX_MSG, ++prio, NULL, 0);
> 			addattr_l(n, MAX_MSG, TCA_ACT_KIND, k, strlen(k) + 1);
>-
>-			ret = a->parse_aopt(a, &argc, &argv, TCA_ACT_OPTIONS, n);
>+			ret = a->parse_aopt(a, &argc, &argv, TCA_ACT_OPTIONS,
>+					    n);
> 
> 			if (ret < 0) {
>-				fprintf(stderr, "bad action parsing\n");
>+				fprintf(stderr, "bad action option parsing\n");
> 				goto bad_val;
> 			}
>+
>+			if (*argv && strcmp(*argv, "cookie") == 0) {
>+				NEXT_ARG();
>+				ret = get_u64(&act_ck, *argv, 0);
>+				if (ret) {
>+					fprintf(stderr, "bad cookie <%s>\n",
>+						*argv);
>+					goto bad_val;
>+				}
>+				argc--;
>+				argv++;
>+			}
>+
>+			if (act_ck)
>+				addattr64(n, MAX_MSG, TCA_ACT_COOKIE, act_ck);
> 			tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
> 			ok++;
> 		}
>@@ -246,8 +260,7 @@ bad_val:
> 	return -1;
> }
> 
>-static int
>-tc_print_one_action(FILE *f, struct rtattr *arg)
>+static int tc_print_one_action(FILE *f, struct rtattr *arg)
> {
> 
> 	struct rtattr *tb[TCA_ACT_MAX + 1];
>@@ -277,6 +290,10 @@ tc_print_one_action(FILE *f, struct rtattr *arg)
> 	if (show_stats && tb[TCA_ACT_STATS]) {
> 		fprintf(f, "\tAction statistics:\n");
> 		print_tcstats2_attr(f, tb[TCA_ACT_STATS], "\t", NULL);
>+		if (tb[TCA_ACT_COOKIE]) {
>+			__u64 acookie = rta_getattr_u64(tb[TCA_ACT_COOKIE]);
>+			fprintf(f, "cookie 0x%llx ",  acookie);
>+		}
> 		fprintf(f, "\n");
> 	}
John Fastabend Jan. 9, 2017, 6:23 p.m. UTC | #10
On 17-01-08 09:19 AM, Jiri Pirko wrote:
> Mon, Jan 02, 2017 at 11:21:41PM CET, jhs@mojatatu.com wrote:
>> On 17-01-02 01:23 PM, John Fastabend wrote:
>>
>>>
>>> Additionally I would like to point out this is an arbitrary length binary
>>> blob (for undefined use, without even a specified encoding) that gets pushed
>>> between user space and hardware ;) This seemed to get folks fairly excited in
>>> the past.
>>>
>>
>> The binary blob size is a little strange - but i think there is value
>> in storing some "cookie" field. The challenge is whether the kernel
>> gets to intepret it; in which case encoding must be specified. Or
>> whether we should leave it up to user space - in which something
>> like tc could standardize its own encodings.
> 
> This should never be interpreted by kernel. I think this would be good
> to make clear in the comment in the code.
> 

Ah OK I had assumed you would be pushing this via tc_cls_flower_offload into
the driver in a follow up patch. But if it lives in kernel space as opaque
cookie guess its no different then other id fields order/prio/cookie.

Thanks for clarifying.
Jamal Hadi Salim Jan. 14, 2017, 12:56 p.m. UTC | #11
On 17-01-09 01:23 PM, John Fastabend wrote:
> On 17-01-08 09:19 AM, Jiri Pirko wrote:

[..]
>> This should never be interpreted by kernel. I think this would be good
>> to make clear in the comment in the code.
>>
>
> Ah OK I had assumed you would be pushing this via tc_cls_flower_offload into
> the driver in a follow up patch. But if it lives in kernel space as opaque
> cookie guess its no different then other id fields order/prio/cookie.
>
> Thanks for clarifying.


I think the feature makes a lot of sense (as is the action variant).
But can we make it:
a) fixed size
b) apply to all classifiers
c) please post a usage example via iproute2/tc

I am going to post the action variant in the next while - will do some 
more testing first.

cheers,
jamal
Jamal Hadi Salim Jan. 14, 2017, 1:06 p.m. UTC | #12
On 17-01-03 07:22 AM, Paul Blakey wrote:
>
> I don't mind having it on TC level but I didn't want to intervene with
> all filters/TC.
>

Please do make it for all classifiers. I described a use case for u32
where the cookie could be used to pretty print the output on a
dump/get.

cheers,
jamal
Jiri Pirko Jan. 14, 2017, 2:48 p.m. UTC | #13
Sat, Jan 14, 2017 at 01:56:35PM CET, jhs@mojatatu.com wrote:
>On 17-01-09 01:23 PM, John Fastabend wrote:
>> On 17-01-08 09:19 AM, Jiri Pirko wrote:
>
>[..]
>> > This should never be interpreted by kernel. I think this would be good
>> > to make clear in the comment in the code.
>> > 
>> 
>> Ah OK I had assumed you would be pushing this via tc_cls_flower_offload into
>> the driver in a follow up patch. But if it lives in kernel space as opaque
>> cookie guess its no different then other id fields order/prio/cookie.
>> 
>> Thanks for clarifying.
>
>
>I think the feature makes a lot of sense (as is the action variant).
>But can we make it:
>a) fixed size

Can you elaborate on why is this needed?


>b) apply to all classifiers
>c) please post a usage example via iproute2/tc
>
>I am going to post the action variant in the next while - will do some more
>testing first.

I believe we have to make the cls and ats cookies exactly the same.

>
>cheers,
>jamal
Jamal Hadi Salim Jan. 14, 2017, 3:03 p.m. UTC | #14
On 17-01-14 09:48 AM, Jiri Pirko wrote:
> Sat, Jan 14, 2017 at 01:56:35PM CET, jhs@mojatatu.com wrote:


>> I think the feature makes a lot of sense (as is the action variant).
>> But can we make it:
>> a) fixed size
>
> Can you elaborate on why is this needed?
>

My experience with the action bits its easier to debug
and enforces some discipline to not abuse the amount of RAM used.
If you have 1M rules, one extra 128M is easier on the system than
a few Gigs.

>
>> b) apply to all classifiers
>> c) please post a usage example via iproute2/tc
>>
>> I am going to post the action variant in the next while - will do some more
>> testing first.
>
> I believe we have to make the cls and ats cookies exactly the same.
>

Probably - they are both needed. See the patch I just posted.

cheers,
jamal
Jiri Pirko Jan. 14, 2017, 3:29 p.m. UTC | #15
Sat, Jan 14, 2017 at 04:03:17PM CET, jhs@mojatatu.com wrote:
>On 17-01-14 09:48 AM, Jiri Pirko wrote:
>> Sat, Jan 14, 2017 at 01:56:35PM CET, jhs@mojatatu.com wrote:
>
>
>> > I think the feature makes a lot of sense (as is the action variant).
>> > But can we make it:
>> > a) fixed size
>> 
>> Can you elaborate on why is this needed?
>> 
>
>My experience with the action bits its easier to debug
>and enforces some discipline to not abuse the amount of RAM used.
>If you have 1M rules, one extra 128M is easier on the system than
>a few Gigs.

Fair. So could this be done like IFLA_PHYS_SWITCH_ID and
IFLA_PHYS_PORT_ID. They can have variable size, max is MAX_PHYS_ITEM_ID_LEN

We can let user to pass arbitrary len up to 16 bytes. This has benefit in
fact that if in future this needs to be extended to say 32 bytes, it is
backward compatible. We just change the check in kernel.


>
>> 
>> > b) apply to all classifiers
>> > c) please post a usage example via iproute2/tc
>> > 
>> > I am going to post the action variant in the next while - will do some more
>> > testing first.
>> 
>> I believe we have to make the cls and ats cookies exactly the same.
>> 
>
>Probably - they are both needed. See the patch I just posted.
>
>cheers,
>jamal
>
Jamal Hadi Salim Jan. 14, 2017, 5:46 p.m. UTC | #16
On 17-01-14 10:29 AM, Jiri Pirko wrote:
> Sat, Jan 14, 2017 at 04:03:17PM CET, jhs@mojatatu.com wrote:

> Fair. So could this be done like IFLA_PHYS_SWITCH_ID and
> IFLA_PHYS_PORT_ID. They can have variable size, max is MAX_PHYS_ITEM_ID_LEN
>
> We can let user to pass arbitrary len up to 16 bytes. This has benefit in
> fact that if in future this needs to be extended to say 32 bytes, it is
> backward compatible. We just change the check in kernel.
>

Sure. Ive run out of time for now; but will work on a new version later.

cheers,
jamal
diff mbox

Patch

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 1d71644..f299ed3 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -41,6 +41,7 @@  struct tc_action {
 	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue __percpu *cpu_qstats;
+	u64	cookie;
 };
 #define tcf_head	common.tcfa_head
 #define tcf_index	common.tcfa_index
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index cb4bcdc..2e968ee 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -67,6 +67,7 @@  enum {
 	TCA_ACT_INDEX,
 	TCA_ACT_STATS,
 	TCA_ACT_PAD,
+	TCA_ACT_COOKIE,
 	__TCA_ACT_MAX
 };
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 2095c83..97eae6b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -26,6 +26,7 @@ 
 #include <net/sch_generic.h>
 #include <net/act_api.h>
 #include <net/netlink.h>
+#include <net/tc_act/tc_gact.h>
 
 static void free_tcf(struct rcu_head *head)
 {
@@ -467,17 +468,21 @@  int tcf_action_destroy(struct list_head *actions, int bind)
 	return a->ops->dump(skb, a, bind, ref);
 }
 
-int
-tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind,
+		      int ref)
 {
 	int err = -EINVAL;
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
+	u64 cookie = a->cookie;
 
 	if (nla_put_string(skb, TCA_KIND, a->ops->kind))
 		goto nla_put_failure;
 	if (tcf_action_copy_stats(skb, a, 0))
 		goto nla_put_failure;
+	if (nla_put_u64_64bit(skb, TCA_ACT_COOKIE, cookie, TCA_ACT_PAD))
+		goto nla_put_failure;
+
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
@@ -578,6 +583,11 @@  struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 	if (err < 0)
 		goto err_mod;
 
+	if (tb[TCA_ACT_COOKIE])
+		a->cookie = nla_get_u64(tb[TCA_ACT_COOKIE]);
+	else
+		a->cookie = 0; /* kernel uses 0 */
+
 	/* module count goes up only when brand new policy is created
 	 * if it exists and is only bound to in a_o->init() then
 	 * ACT_P_CREATED is not returned (a zero is).