[ovs-dev,3/4] tc: Add header rewrite using tc pedit action

Message ID 1502202114-57266-4-git-send-email-roid@mellanox.com
State Changes Requested
Headers show

Commit Message

Roi Dayan Aug. 8, 2017, 2:21 p.m.
From: Paul Blakey <paulb@mellanox.com>

To be later used to implement ovs action set offloading.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
---
 lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/tc.h |  12 +++
 2 files changed, 381 insertions(+), 3 deletions(-)

Comments

Simon Horman Aug. 14, 2017, 1 p.m. | #1
On Tue, Aug 08, 2017 at 05:21:53PM +0300, Roi Dayan wrote:
> From: Paul Blakey <paulb@mellanox.com>
> 
> To be later used to implement ovs action set offloading.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
> ---
>  lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  lib/tc.h |  12 +++
>  2 files changed, 381 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/tc.c b/lib/tc.c

...

> @@ -589,6 +758,10 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
>          nl_parse_act_vlan(act_options, flower);
>      } else if (!strcmp(act_kind, "tunnel_key")) {
>          nl_parse_act_tunnel_key(act_options, flower);
> +    } else if (!strcmp(act_kind, "pedit")) {
> +        nl_parse_act_pedit(act_options, flower);
> +    } else if (!strcmp(act_kind, "csum")) {
> +        /* not doing anything for now */
>      } else {
>          VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
>          return EINVAL;
> @@ -790,6 +963,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
>  }
>  
>  static void
> +nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
> +{
> +    size_t offset;
> +
> +    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> +    {
> +        struct tc_csum parm = { .action = TC_ACT_PIPE,
> +                                .update_flags = flags };

I think it would be a bit nicer to move param to the top of the function
and remove the extra { } scope it is currently inside.

> +
> +        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
> +    }
> +    nl_msg_end_nested(request, offset);
> +}
> +
> +static void
> +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
> +                     struct tc_pedit_key_ex *ex)
> +{
> +    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
> +    size_t offset, offset_keys_ex, offset_key;
> +    int i;
> +
> +    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> +    {

The extra { } scope here seems unnecessary.

> +        parm->action = TC_ACT_PIPE;
> +
> +        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
> +        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
> +        for (i = 0; i < parm->nkeys; i++, ex++) {
> +            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
> +            nl_msg_end_nested(request, offset_key);
> +        }
> +        nl_msg_end_nested(request, offset_keys_ex);
> +    }
> +    nl_msg_end_nested(request, offset);
> +}
> +
> +static void
>  nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
>  {
>      size_t offset;

...
Joe Stringer Aug. 14, 2017, 9:56 p.m. | #2
On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
> From: Paul Blakey <paulb@mellanox.com>
>
> To be later used to implement ovs action set offloading.
>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
> ---

Hi Paul and folks, some questions inline.

utilities/checkpatch.py says:

$ utilities/checkpatch.py -4
== Checking 00ecb482f5d1 ("compat: Add act_pedit compatibility for old
kernels") ==
Lines checked: 127, no obvious problems found

== Checking 7503746718f6 ("odp-util: Expose ovs flow key attr len
table for reuse") ==
Lines checked: 195, no obvious problems found

== Checking 1e8ab68aefe1 ("tc: Add header rewrite using tc pedit action") ==
ERROR: Improper whitespace around control block
#359 FILE: lib/tc.c:480:
   NL_ATTR_FOR_EACH(nla, left, keys_ex, keys_ex_size) {

Lines checked: 721, Warnings: 0, Errors: 1

== Checking f6b52ce504c2 ("netdev-tc-offloads: Add support for action set") ==
ERROR: Improper whitespace around control block
#908 FILE: lib/netdev-tc-offloads.c:590:
   NL_ATTR_FOR_EACH_UNSAFE(set_attr, set_left, set, set_len) {

Lines checked: 981, Warnings: 0, Errors: 1

<snip>

> +static struct flower_key_to_pedit flower_pedit_map[] = {
> +    [offsetof(struct tc_flower_key, ipv4.ipv4_src)] = {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> +        12,
> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
> +    },

Why are these items indexed by the offset, and not just an array we
iterate? It seems like the only places where we iterate this, we use
"for (i=0; ... i++)", which means that we iterate several times across
empty items, and this array is sparsely populated.

> +    [offsetof(struct tc_flower_key, ipv4.ipv4_dst)] = {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> +        16,
> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
> +    },
> +    [offsetof(struct tc_flower_key, ipv4.rewrite_ttl)] = {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> +        8,
> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
> +    },

<snip>

> @@ -346,6 +425,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
>      }
>  }
>
> +static const struct nl_policy pedit_policy[] = {
> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
> +                                     .min_len = sizeof(struct tc_pedit),
> +                                     .optional = false, },
> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
> +                                      .optional = false, },
> +};
> +
> +static int
> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
> +{
> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
> +    const struct tc_pedit *pe;
> +    const struct tc_pedit_key *keys;
> +    const struct nlattr *nla, *keys_ex, *ex_type;
> +    const void *keys_attr;
> +    char *rewrite_key = (void *) &flower->rewrite.key;
> +    char *rewrite_mask = (void *) &flower->rewrite.mask;

These are actually 'struct tc_flower_key', shouldn't we use the actual
types? (I realise there is pointer arithmetic below, but usually we
restrict the usage of void casting and pointer arithmetic as much as
possible).

> +    size_t keys_ex_size, left;
> +    int type, i = 0;
> +
> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
> +                         ARRAY_SIZE(pedit_policy))) {
> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
> +        return EPROTO;
> +    }
> +
> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
> +    keys = pe->keys;
> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
> +    keys_ex = nl_attr_get(keys_attr);
> +    keys_ex_size = nl_attr_get_size(keys_attr);
> +
> +    NL_ATTR_FOR_EACH(nla, left, keys_ex, keys_ex_size) {
> +        if (i >= pe->nkeys) {
> +            break;
> +        }
> +
> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
> +            type = nl_attr_get_u16(ex_type);
> +
> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
> +                int flower_off = j;
> +                int sz = m->size;
> +                int mf = m->offset;
> +
> +                if (!sz || m->htype != type) {
> +                   continue;
> +                }

If flower_pedit_map was just a regular array and the offset was
included in 'struct flower_key_to_pedit', then I don't think we need
the above check?

> +
> +                /* check overlap between current pedit key, which is always
> +                 * 4 bytes (range [off, off + 3]), and a map entry in
> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
> +                if ((keys->off >= mf && keys->off < mf + sz)
> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
> +                    int diff = flower_off + (keys->off - mf);
> +                    uint32_t *dst = (void *) (rewrite_key + diff);
> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
> +                    uint32_t mask = ~(keys->mask);
> +                    uint32_t zero_bits;
> +
> +                    if (keys->off < mf) {
> +                        zero_bits = 8 * (mf - keys->off);
> +                        mask &= UINT32_MAX << zero_bits;
> +                    } else if (keys->off + 4 > mf + m->size) {
> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
> +                        mask &= UINT32_MAX >> zero_bits;
> +                    }

I think this is all trying to avoid having a giant switch case here
which would handle all of the possible flower key attributes, similar
to somewhere else where this kind of iteration occurs. Is there any
overlap between this code and calc_offsets()?

Could we somehow trigger an assert or a strong warning message if the
offsets of our flower_pedit_map and the flower key don't align, given
that there's a bunch of pointer arithmetic happening here?

<snip>

> @@ -589,6 +758,10 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
>          nl_parse_act_vlan(act_options, flower);
>      } else if (!strcmp(act_kind, "tunnel_key")) {
>          nl_parse_act_tunnel_key(act_options, flower);
> +    } else if (!strcmp(act_kind, "pedit")) {
> +        nl_parse_act_pedit(act_options, flower);
> +    } else if (!strcmp(act_kind, "csum")) {
> +        /* not doing anything for now */

Should we log a message that we're not handling csums?

<snip>

> +static int
> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
> +                                 struct tc_flower *flower)
> +{
> +    struct {
> +        struct tc_pedit sel;
> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
> +    } sel = {
> +        .sel = {
> +            .nkeys = 0
> +        }
> +    };
> +    int i, j;
> +
> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
> +        struct tc_pedit_key *pedit_key = NULL;
> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
> +        int cnt = 0, cur_offset = 0;
> +
> +        if (!m->size) {
> +            continue;
> +        }
> +
> +        calc_offsets(flower, m, i, &cur_offset, &cnt, &last_word_mask,
> +                     &first_word_mask, &mask, &data);
> +
> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
> +            uint32_t mask_word = *mask;
> +
> +            if (j == 0) {
> +                mask_word &= first_word_mask;
> +            }
> +            if (j == cnt - 1) {
> +                mask_word &= last_word_mask;
> +            }
> +            if (!mask_word) {
> +                continue;
> +            }
> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
> +                VLOG_ERR_RL(&error_rl, "reached too many pedit offsets: %d",
> +                            MAX_PEDIT_OFFSETS);
> +                return EOPNOTSUPP;
> +            }

Just wondering if this should be err or warn, since presumably the
result is that such flows avoid TC and should end up being correctly
handled by the openvswitch kernel datapath?

> +
> +            pedit_key = &sel.keys[sel.sel.nkeys];
> +            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
> +            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
> +            pedit_key_ex->htype = m->htype;
> +            pedit_key->off = cur_offset;
> +            pedit_key->mask = ~mask_word;
> +            pedit_key->val = *data & mask_word;
> +            sel.sel.nkeys++;
> +            csum_update_flag(flower, m->htype);
> +        }
> +    }
> +    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
> +
> +    return 0;
> +}
> +
> +static int
>  nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>  {
>      size_t offset;
> @@ -1002,11 +1354,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
>      nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
>                              &flower->mask.member, sizeof flower->key.member)
>
> -static void
> +static int
>  nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>  {
> +
>      uint16_t host_eth_type = ntohs(flower->key.eth_type);
>      bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
> +    int err;
> +
> +    /* need to parse acts first as some acts require changing the matching */
> +    err  = nl_msg_put_flower_acts(request, flower);
> +    if (err) {
> +        return err;
> +    }
>
>      if (is_vlan) {
>          host_eth_type = ntohs(flower->key.encap_eth_type);
> @@ -1062,7 +1422,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>          nl_msg_put_flower_tunnel(request, flower);
>      }
>
> -    nl_msg_put_flower_acts(request, flower);
> +    return 0;
>  }

The above snippet seems like it could be logically separate and kept
as a different patch, as it shouldn't affect behaviour at all but it
does actually introduce additional error checking. This could assist
git bisect if necessary.

>
>  int
> @@ -1085,11 +1445,17 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>      nl_msg_put_string(&request, TCA_KIND, "flower");
>      basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>      {
> -        nl_msg_put_flower_options(&request, flower);
> +        error = nl_msg_put_flower_options(&request, flower);
> +
> +        if (error) {
> +            ofpbuf_uninit(&request);
> +            return error;
> +        }
>      }
>      nl_msg_end_nested(&request, basic_offset);
>
>      error = tc_transact(&request, &reply);
> +

Random extra newline?

>      if (!error) {
>          struct tcmsg *tc =
>              ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
> diff --git a/lib/tc.h b/lib/tc.h
> index 5f363d0..2269a22 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -93,6 +93,7 @@ struct tc_flower_key {
>      struct {
>          ovs_be32 ipv4_src;
>          ovs_be32 ipv4_dst;
> +        uint8_t rewrite_ttl;
>      } ipv4;
>      struct {
>          struct in6_addr ipv6_src;
> @@ -117,6 +118,17 @@ struct tc_flower {
>      uint64_t lastused;
>
>      struct {
> +        bool rewrite;
> +        uint8_t pad1[3];
> +        struct tc_flower_key key;
> +        uint8_t pad2[3];
> +        struct tc_flower_key mask;
> +        uint8_t pad3[3];
> +    } rewrite;
> +
> +    uint32_t csum_update_flags;
> +
> +    struct {
>          bool set;
>          ovs_be64 id;
>          ovs_be16 tp_src;

This is maybe an aside, but it seems like 'rewrite_ttl' already exists
in 'struct tc_flower_key'. Should it also be in 'struct tc_flower'
like this?

Cheers,
Joe
Paul Blakey Aug. 15, 2017, 8:19 a.m. | #3
On 15/08/2017 00:56, Joe Stringer wrote:
> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>> From: Paul Blakey <paulb@mellanox.com>
>>
>> To be later used to implement ovs action set offloading.
>>
>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>> ---
> 
> Hi Paul and folks, some questions inline.
> 
> utilities/checkpatch.py says:
> 
> $ utilities/checkpatch.py -4
> == Checking 00ecb482f5d1 ("compat: Add act_pedit compatibility for old
> kernels") ==
> Lines checked: 127, no obvious problems found
> 
> == Checking 7503746718f6 ("odp-util: Expose ovs flow key attr len
> table for reuse") ==
> Lines checked: 195, no obvious problems found
> 
> == Checking 1e8ab68aefe1 ("tc: Add header rewrite using tc pedit action") ==
> ERROR: Improper whitespace around control block
> #359 FILE: lib/tc.c:480:
>     NL_ATTR_FOR_EACH(nla, left, keys_ex, keys_ex_size) {
> 
> Lines checked: 721, Warnings: 0, Errors: 1
> 
> == Checking f6b52ce504c2 ("netdev-tc-offloads: Add support for action set") ==
> ERROR: Improper whitespace around control block
> #908 FILE: lib/netdev-tc-offloads.c:590:
>     NL_ATTR_FOR_EACH_UNSAFE(set_attr, set_left, set, set_len) {
> 
> Lines checked: 981, Warnings: 0, Errors: 1
> 
> <snip>
> 
>> +static struct flower_key_to_pedit flower_pedit_map[] = {
>> +    [offsetof(struct tc_flower_key, ipv4.ipv4_src)] = {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>> +        12,
>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
>> +    },
> 
> Why are these items indexed by the offset, and not just an array we
> iterate? It seems like the only places where we iterate this, we use
> "for (i=0; ... i++)", which means that we iterate several times across
> empty items, and this array is sparsely populated.
Right we can make this one not sparse as there is no direct access to 
some index. Or some of the suggestions on the other patch's map (like 
creating a fast map for both put and dump paths).

> 
>> +    [offsetof(struct tc_flower_key, ipv4.ipv4_dst)] = {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>> +        16,
>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
>> +    },
>> +    [offsetof(struct tc_flower_key, ipv4.rewrite_ttl)] = {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>> +        8,
>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
>> +    },
> 
> <snip>
> 
>> @@ -346,6 +425,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
>>       }
>>   }
>>
>> +static const struct nl_policy pedit_policy[] = {
>> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
>> +                                     .min_len = sizeof(struct tc_pedit),
>> +                                     .optional = false, },
>> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
>> +                                      .optional = false, },
>> +};
>> +
>> +static int
>> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
>> +{
>> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
>> +    const struct tc_pedit *pe;
>> +    const struct tc_pedit_key *keys;
>> +    const struct nlattr *nla, *keys_ex, *ex_type;
>> +    const void *keys_attr;
>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
> 
> These are actually 'struct tc_flower_key', shouldn't we use the actual
> types? (I realise there is pointer arithmetic below, but usually we
> restrict the usage of void casting and pointer arithmetic as much as
> possible).
> 

Yes, It's for the pointer arithmetic. (void *) cast was for the
clangs warning "cast increases required alignment of target type"
which we couldn't find another way to suppress.
I don't think alignments an issue here.


>> +    size_t keys_ex_size, left;
>> +    int type, i = 0;
>> +
>> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
>> +                         ARRAY_SIZE(pedit_policy))) {
>> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
>> +        return EPROTO;
>> +    }
>> +
>> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
>> +    keys = pe->keys;
>> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
>> +    keys_ex = nl_attr_get(keys_attr);
>> +    keys_ex_size = nl_attr_get_size(keys_attr);
>> +
>> +    NL_ATTR_FOR_EACH(nla, left, keys_ex, keys_ex_size) {
>> +        if (i >= pe->nkeys) {
>> +            break;
>> +        }
>> +
>> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
>> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
>> +            type = nl_attr_get_u16(ex_type);
>> +
>> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
>> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
>> +                int flower_off = j;
>> +                int sz = m->size;
>> +                int mf = m->offset;
>> +
>> +                if (!sz || m->htype != type) {
>> +                   continue;
>> +                }
> 
> If flower_pedit_map was just a regular array and the offset was
> included in 'struct flower_key_to_pedit', then I don't think we need
> the above check?
> 
>> +
>> +                /* check overlap between current pedit key, which is always
>> +                 * 4 bytes (range [off, off + 3]), and a map entry in
>> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
>> +                if ((keys->off >= mf && keys->off < mf + sz)
>> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
>> +                    int diff = flower_off + (keys->off - mf);
>> +                    uint32_t *dst = (void *) (rewrite_key + diff);
>> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
>> +                    uint32_t mask = ~(keys->mask);
>> +                    uint32_t zero_bits;
>> +
>> +                    if (keys->off < mf) {
>> +                        zero_bits = 8 * (mf - keys->off);
>> +                        mask &= UINT32_MAX << zero_bits;
>> +                    } else if (keys->off + 4 > mf + m->size) {
>> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
>> +                        mask &= UINT32_MAX >> zero_bits;
>> +                    }
> 
> I think this is all trying to avoid having a giant switch case here
> which would handle all of the possible flower key attributes, similar
> to somewhere else where this kind of iteration occurs.
Right that was the objective.

  Is there any
> overlap between this code and calc_offsets()calc_offsets was to calculate the word offsets and masks for the first 
and last word to write, here its the reverse so its already in word size 
with correct masks, and easier to write back.

> 
> Could we somehow trigger an assert or a strong warning message if the
> offsets of our flower_pedit_map and the flower key don't align, given
> that there's a bunch of pointer arithmetic happening here?
> 

Not sure what you mean here, we are aligned to words (4 bytes) and
"cut" the words if they overflow the target. we also padded 
flower.rewrite key and mask so we won't overflow to flower member we 
don't mean to write.


> <snip>
> 
>> @@ -589,6 +758,10 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
>>           nl_parse_act_vlan(act_options, flower);
>>       } else if (!strcmp(act_kind, "tunnel_key")) {
>>           nl_parse_act_tunnel_key(act_options, flower);
>> +    } else if (!strcmp(act_kind, "pedit")) {
>> +        nl_parse_act_pedit(act_options, flower);
>> +    } else if (!strcmp(act_kind, "csum")) {
>> +        /* not doing anything for now */
> 
> Should we log a message that we're not handling csums?
> 

We aren't handling them because we don't need to, OVS has an implicit
csum recalculation of rewritten packet headers.

> <snip>
> 
>> +static int
>> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
>> +                                 struct tc_flower *flower)
>> +{
>> +    struct {
>> +        struct tc_pedit sel;
>> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
>> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
>> +    } sel = {
>> +        .sel = {
>> +            .nkeys = 0
>> +        }
>> +    };
>> +    int i, j;
>> +
>> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
>> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
>> +        struct tc_pedit_key *pedit_key = NULL;
>> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
>> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
>> +        int cnt = 0, cur_offset = 0;
>> +
>> +        if (!m->size) {
>> +            continue;
>> +        }
>> +
>> +        calc_offsets(flower, m, i, &cur_offset, &cnt, &last_word_mask,
>> +                     &first_word_mask, &mask, &data);
>> +
>> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
>> +            uint32_t mask_word = *mask;
>> +
>> +            if (j == 0) {
>> +                mask_word &= first_word_mask;
>> +            }
>> +            if (j == cnt - 1) {
>> +                mask_word &= last_word_mask;
>> +            }
>> +            if (!mask_word) {
>> +                continue;
>> +            }
>> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
>> +                VLOG_ERR_RL(&error_rl, "reached too many pedit offsets: %d",
>> +                            MAX_PEDIT_OFFSETS);
>> +                return EOPNOTSUPP;
>> +            }
> 
> Just wondering if this should be err or warn, since presumably the
> result is that such flows avoid TC and should end up being correctly
> handled by the openvswitch kernel datapath?

Right, we'll change that.

> 
>> +
>> +            pedit_key = &sel.keys[sel.sel.nkeys];
>> +            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
>> +            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
>> +            pedit_key_ex->htype = m->htype;
>> +            pedit_key->off = cur_offset;
>> +            pedit_key->mask = ~mask_word;
>> +            pedit_key->val = *data & mask_word;
>> +            sel.sel.nkeys++;
>> +            csum_update_flag(flower, m->htype);
>> +        }
>> +    }
>> +    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
>> +
>> +    return 0;
>> +}
>> +
>> +static int
>>   nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>   {
>>       size_t offset;
>> @@ -1002,11 +1354,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
>>       nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
>>                               &flower->mask.member, sizeof flower->key.member)
>>
>> -static void
>> +static int
>>   nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>>   {
>> +
>>       uint16_t host_eth_type = ntohs(flower->key.eth_type);
>>       bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
>> +    int err;
>> +
>> +    /* need to parse acts first as some acts require changing the matching */
>> +    err  = nl_msg_put_flower_acts(request, flower);
>> +    if (err) {
>> +        return err;
>> +    }
>>
>>       if (is_vlan) {
>>           host_eth_type = ntohs(flower->key.encap_eth_type);
>> @@ -1062,7 +1422,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>>           nl_msg_put_flower_tunnel(request, flower);
>>       }
>>
>> -    nl_msg_put_flower_acts(request, flower);
>> +    return 0;
>>   }
> 
> The above snippet seems like it could be logically separate and kept
> as a different patch, as it shouldn't affect behaviour at all but it
> does actually introduce additional error checking. This could assist
> git bisect if necessary.

We'll do that, thanks.

> 
>>
>>   int
>> @@ -1085,11 +1445,17 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>>       nl_msg_put_string(&request, TCA_KIND, "flower");
>>       basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>>       {
>> -        nl_msg_put_flower_options(&request, flower);
>> +        error = nl_msg_put_flower_options(&request, flower);
>> +
>> +        if (error) {
>> +            ofpbuf_uninit(&request);
>> +            return error;
>> +        }
>>       }
>>       nl_msg_end_nested(&request, basic_offset);
>>
>>       error = tc_transact(&request, &reply);
>> +
> 
> Random extra newline?

Will be removed, thanks.

> 
>>       if (!error) {
>>           struct tcmsg *tc =
>>               ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
>> diff --git a/lib/tc.h b/lib/tc.h
>> index 5f363d0..2269a22 100644
>> --- a/lib/tc.h
>> +++ b/lib/tc.h
>> @@ -93,6 +93,7 @@ struct tc_flower_key {
>>       struct {
>>           ovs_be32 ipv4_src;
>>           ovs_be32 ipv4_dst;
>> +        uint8_t rewrite_ttl;
>>       } ipv4;
>>       struct {
>>           struct in6_addr ipv6_src;
>> @@ -117,6 +118,17 @@ struct tc_flower {
>>       uint64_t lastused;
>>
>>       struct {
>> +        bool rewrite;
>> +        uint8_t pad1[3];
>> +        struct tc_flower_key key;
>> +        uint8_t pad2[3];
>> +        struct tc_flower_key mask;
>> +        uint8_t pad3[3];
>> +    } rewrite;
>> +
>> +    uint32_t csum_update_flags;
>> +
>> +    struct {
>>           bool set;
>>           ovs_be64 id;
>>           ovs_be16 tp_src;
> 
> This is maybe an aside, but it seems like 'rewrite_ttl' already exists
> in 'struct tc_flower_key'. Should it also be in 'struct tc_flower'
> like this?
> 

On latest master, we have ip_ttl in struct tc_flower, which is for 
matching on ipv6/ipv4 ttl/tos.
This one is for rewriting of ttl, we need to distinguish between
ipv4 ttl and ipv6 ttl to translate them to different action pedit 
offsets/layers, we didn't want to add ip protocol dependencies in the map.

> Cheers,
> Joe
>
Joe Stringer Aug. 15, 2017, 5:04 p.m. | #4
On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 15/08/2017 00:56, Joe Stringer wrote:
>>
>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>
>>> From: Paul Blakey <paulb@mellanox.com>
>>>
>>> @@ -346,6 +425,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct
>>> tc_flower *flower) {
>>>       }
>>>   }
>>>
>>> +static const struct nl_policy pedit_policy[] = {
>>> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
>>> +                                     .min_len = sizeof(struct tc_pedit),
>>> +                                     .optional = false, },
>>> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
>>> +                                      .optional = false, },
>>> +};
>>> +
>>> +static int
>>> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
>>> +{
>>> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
>>> +    const struct tc_pedit *pe;
>>> +    const struct tc_pedit_key *keys;
>>> +    const struct nlattr *nla, *keys_ex, *ex_type;
>>> +    const void *keys_attr;
>>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>>
>>
>> These are actually 'struct tc_flower_key', shouldn't we use the actual
>> types? (I realise there is pointer arithmetic below, but usually we
>> restrict the usage of void casting and pointer arithmetic as much as
>> possible).
>>
>
> Yes, It's for the pointer arithmetic. (void *) cast was for the
> clangs warning "cast increases required alignment of target type"
> which we couldn't find another way to suppress.
> I don't think alignments an issue here.

Ah. I don't have particularly much experience with pointer alignment.

>
>>> +    size_t keys_ex_size, left;
>>> +    int type, i = 0;
>>> +
>>> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
>>> +                         ARRAY_SIZE(pedit_policy))) {
>>> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
>>> +        return EPROTO;
>>> +    }
>>> +
>>> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
>>> +    keys = pe->keys;
>>> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
>>> +    keys_ex = nl_attr_get(keys_attr);
>>> +    keys_ex_size = nl_attr_get_size(keys_attr);
>>> +
>>> +    NL_ATTR_FOR_EACH(nla, left, keys_ex, keys_ex_size) {
>>> +        if (i >= pe->nkeys) {
>>> +            break;
>>> +        }
>>> +
>>> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
>>> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
>>> +            type = nl_attr_get_u16(ex_type);
>>> +
>>> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
>>> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
>>> +                int flower_off = j;
>>> +                int sz = m->size;
>>> +                int mf = m->offset;
>>> +
>>> +                if (!sz || m->htype != type) {
>>> +                   continue;
>>> +                }
>>
>>
>> If flower_pedit_map was just a regular array and the offset was
>> included in 'struct flower_key_to_pedit', then I don't think we need
>> the above check?
>>
>>> +
>>> +                /* check overlap between current pedit key, which is
>>> always
>>> +                 * 4 bytes (range [off, off + 3]), and a map entry in
>>> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
>>> +                if ((keys->off >= mf && keys->off < mf + sz)
>>> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz))
>>> {
>>> +                    int diff = flower_off + (keys->off - mf);
>>> +                    uint32_t *dst = (void *) (rewrite_key + diff);
>>> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
>>> +                    uint32_t mask = ~(keys->mask);
>>> +                    uint32_t zero_bits;
>>> +
>>> +                    if (keys->off < mf) {
>>> +                        zero_bits = 8 * (mf - keys->off);
>>> +                        mask &= UINT32_MAX << zero_bits;
>>> +                    } else if (keys->off + 4 > mf + m->size) {
>>> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
>>> +                        mask &= UINT32_MAX >> zero_bits;
>>> +                    }
>>
>>
>> I think this is all trying to avoid having a giant switch case here
>> which would handle all of the possible flower key attributes, similar
>> to somewhere else where this kind of iteration occurs.
>
> Right that was the objective.
>
>  Is there any
>>
>> overlap between this code and calc_offsets()calc_offsets was to calculate
>> the word offsets and masks for the first
>
> and last word to write, here its the reverse so its already in word size
> with correct masks, and easier to write back.

OK.

>>
>> Could we somehow trigger an assert or a strong warning message if the
>> offsets of our flower_pedit_map and the flower key don't align, given
>> that there's a bunch of pointer arithmetic happening here?
>>
>
> Not sure what you mean here, we are aligned to words (4 bytes) and
> "cut" the words if they overflow the target. we also padded flower.rewrite
> key and mask so we won't overflow to flower member we don't mean to write.

What I was trying to figure out is if there is any additional
compile-time check we could have to ensure that we copy exactly the
right bits to exactly the right place. I'm not sure that there is.

>> <snip>
>>
>>> @@ -589,6 +758,10 @@ nl_parse_single_action(struct nlattr *action, struct
>>> tc_flower *flower)
>>>           nl_parse_act_vlan(act_options, flower);
>>>       } else if (!strcmp(act_kind, "tunnel_key")) {
>>>           nl_parse_act_tunnel_key(act_options, flower);
>>> +    } else if (!strcmp(act_kind, "pedit")) {
>>> +        nl_parse_act_pedit(act_options, flower);
>>> +    } else if (!strcmp(act_kind, "csum")) {
>>> +        /* not doing anything for now */
>>
>>
>> Should we log a message that we're not handling csums?
>>
>
> We aren't handling them because we don't need to, OVS has an implicit
> csum recalculation of rewritten packet headers.

OK.

>> <snip>
>>
>>> +static int
>>> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
>>> +                                 struct tc_flower *flower)
>>> +{
>>> +    struct {
>>> +        struct tc_pedit sel;
>>> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
>>> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
>>> +    } sel = {
>>> +        .sel = {
>>> +            .nkeys = 0
>>> +        }
>>> +    };
>>> +    int i, j;
>>> +
>>> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
>>> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
>>> +        struct tc_pedit_key *pedit_key = NULL;
>>> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
>>> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
>>> +        int cnt = 0, cur_offset = 0;
>>> +
>>> +        if (!m->size) {
>>> +            continue;
>>> +        }
>>> +
>>> +        calc_offsets(flower, m, i, &cur_offset, &cnt, &last_word_mask,
>>> +                     &first_word_mask, &mask, &data);
>>> +
>>> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
>>> +            uint32_t mask_word = *mask;
>>> +
>>> +            if (j == 0) {
>>> +                mask_word &= first_word_mask;
>>> +            }
>>> +            if (j == cnt - 1) {
>>> +                mask_word &= last_word_mask;
>>> +            }
>>> +            if (!mask_word) {
>>> +                continue;
>>> +            }
>>> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
>>> +                VLOG_ERR_RL(&error_rl, "reached too many pedit offsets:
>>> %d",
>>> +                            MAX_PEDIT_OFFSETS);
>>> +                return EOPNOTSUPP;
>>> +            }
>>
>>
>> Just wondering if this should be err or warn, since presumably the
>> result is that such flows avoid TC and should end up being correctly
>> handled by the openvswitch kernel datapath?
>
>
> Right, we'll change that.

Thanks.

>>> @@ -1062,7 +1422,7 @@ nl_msg_put_flower_options(struct ofpbuf *request,
>>> struct tc_flower *flower)
>>>           nl_msg_put_flower_tunnel(request, flower);
>>>       }
>>>
>>> -    nl_msg_put_flower_acts(request, flower);
>>> +    return 0;
>>>   }
>>
>>
>> The above snippet seems like it could be logically separate and kept
>> as a different patch, as it shouldn't affect behaviour at all but it
>> does actually introduce additional error checking. This could assist
>> git bisect if necessary.
>
>
> We'll do that, thanks.

OK.

>>
>>>
>>>   int
>>> @@ -1085,11 +1445,17 @@ tc_replace_flower(int ifindex, uint16_t prio,
>>> uint32_t handle,
>>>       nl_msg_put_string(&request, TCA_KIND, "flower");
>>>       basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>>>       {
>>> -        nl_msg_put_flower_options(&request, flower);
>>> +        error = nl_msg_put_flower_options(&request, flower);
>>> +
>>> +        if (error) {
>>> +            ofpbuf_uninit(&request);
>>> +            return error;
>>> +        }
>>>       }
>>>       nl_msg_end_nested(&request, basic_offset);
>>>
>>>       error = tc_transact(&request, &reply);
>>> +
>>
>>
>> Random extra newline?
>
>
> Will be removed, thanks.
>
>
>>
>>>       if (!error) {
>>>           struct tcmsg *tc =
>>>               ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
>>> diff --git a/lib/tc.h b/lib/tc.h
>>> index 5f363d0..2269a22 100644
>>> --- a/lib/tc.h
>>> +++ b/lib/tc.h
>>> @@ -93,6 +93,7 @@ struct tc_flower_key {
>>>       struct {
>>>           ovs_be32 ipv4_src;
>>>           ovs_be32 ipv4_dst;
>>> +        uint8_t rewrite_ttl;
>>>       } ipv4;
>>>       struct {
>>>           struct in6_addr ipv6_src;
>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>       uint64_t lastused;
>>>
>>>       struct {
>>> +        bool rewrite;
>>> +        uint8_t pad1[3];
>>> +        struct tc_flower_key key;
>>> +        uint8_t pad2[3];
>>> +        struct tc_flower_key mask;
>>> +        uint8_t pad3[3];
>>> +    } rewrite;

Now that I get why the pads are here.. ;)

Is there an existing macro we can use to ensure that these pad out to
32-bit boundaries?

>>> +
>>> +    uint32_t csum_update_flags;
>>> +
>>> +    struct {
>>>           bool set;
>>>           ovs_be64 id;
>>>           ovs_be16 tp_src;
>>
>>
>> This is maybe an aside, but it seems like 'rewrite_ttl' already exists
>> in 'struct tc_flower_key'. Should it also be in 'struct tc_flower'
>> like this?
>>
>
> On latest master, we have ip_ttl in struct tc_flower, which is for matching
> on ipv6/ipv4 ttl/tos.
> This one is for rewriting of ttl, we need to distinguish between
> ipv4 ttl and ipv6 ttl to translate them to different action pedit
> offsets/layers, we didn't want to add ip protocol dependencies in the map.
Paul Blakey Aug. 17, 2017, 9:18 a.m. | #5
On 15/08/2017 20:04, Joe Stringer wrote:
> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>
>>
>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>
>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>
>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>
>>>> @@ -346,6 +425,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct
>>>> tc_flower *flower) {
>>>>        }
>>>>    }
>>>>
>>>> +static const struct nl_policy pedit_policy[] = {
>>>> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
>>>> +                                     .min_len = sizeof(struct tc_pedit),
>>>> +                                     .optional = false, },
>>>> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
>>>> +                                      .optional = false, },
>>>> +};
>>>> +
>>>> +static int
>>>> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
>>>> +{
>>>> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
>>>> +    const struct tc_pedit *pe;
>>>> +    const struct tc_pedit_key *keys;
>>>> +    const struct nlattr *nla, *keys_ex, *ex_type;
>>>> +    const void *keys_attr;
>>>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>>>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>>>
>>>
>>> These are actually 'struct tc_flower_key', shouldn't we use the actual
>>> types? (I realise there is pointer arithmetic below, but usually we
>>> restrict the usage of void casting and pointer arithmetic as much as
>>> possible).
>>>
>>
>> Yes, It's for the pointer arithmetic. (void *) cast was for the
>> clangs warning "cast increases required alignment of target type"
>> which we couldn't find another way to suppress.
>> I don't think alignments an issue here.
> 
> Ah. I don't have particularly much experience with pointer alignment.
> 
>>
>>>> +    size_t keys_ex_size, left;
>>>> +    int type, i = 0;
>>>> +
>>>> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
>>>> +                         ARRAY_SIZE(pedit_policy))) {
>>>> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
>>>> +        return EPROTO;
>>>> +    }
>>>> +
>>>> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
>>>> +    keys = pe->keys;
>>>> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
>>>> +    keys_ex = nl_attr_get(keys_attr);
>>>> +    keys_ex_size = nl_attr_get_size(keys_attr);
>>>> +
>>>> +    NL_ATTR_FOR_EACH(nla, left, keys_ex, keys_ex_size) {
>>>> +        if (i >= pe->nkeys) {
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
>>>> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
>>>> +            type = nl_attr_get_u16(ex_type);
>>>> +
>>>> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
>>>> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
>>>> +                int flower_off = j;
>>>> +                int sz = m->size;
>>>> +                int mf = m->offset;
>>>> +
>>>> +                if (!sz || m->htype != type) {
>>>> +                   continue;
>>>> +                }
>>>
>>>
>>> If flower_pedit_map was just a regular array and the offset was
>>> included in 'struct flower_key_to_pedit', then I don't think we need
>>> the above check?
>>>
>>>> +
>>>> +                /* check overlap between current pedit key, which is
>>>> always
>>>> +                 * 4 bytes (range [off, off + 3]), and a map entry in
>>>> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
>>>> +                if ((keys->off >= mf && keys->off < mf + sz)
>>>> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz))
>>>> {
>>>> +                    int diff = flower_off + (keys->off - mf);
>>>> +                    uint32_t *dst = (void *) (rewrite_key + diff);
>>>> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
>>>> +                    uint32_t mask = ~(keys->mask);
>>>> +                    uint32_t zero_bits;
>>>> +
>>>> +                    if (keys->off < mf) {
>>>> +                        zero_bits = 8 * (mf - keys->off);
>>>> +                        mask &= UINT32_MAX << zero_bits;
>>>> +                    } else if (keys->off + 4 > mf + m->size) {
>>>> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
>>>> +                        mask &= UINT32_MAX >> zero_bits;
>>>> +                    }
>>>
>>>
>>> I think this is all trying to avoid having a giant switch case here
>>> which would handle all of the possible flower key attributes, similar
>>> to somewhere else where this kind of iteration occurs.
>>
>> Right that was the objective.
>>
>>   Is there any
>>>
>>> overlap between this code and calc_offsets()calc_offsets was to calculate
>>> the word offsets and masks for the first
>>
>> and last word to write, here its the reverse so its already in word size
>> with correct masks, and easier to write back.
> 
> OK.
> 
>>>
>>> Could we somehow trigger an assert or a strong warning message if the
>>> offsets of our flower_pedit_map and the flower key don't align, given
>>> that there's a bunch of pointer arithmetic happening here?
>>>
>>
>> Not sure what you mean here, we are aligned to words (4 bytes) and
>> "cut" the words if they overflow the target. we also padded flower.rewrite
>> key and mask so we won't overflow to flower member we don't mean to write.
> 
> What I was trying to figure out is if there is any additional
> compile-time check we could have to ensure that we copy exactly the
> right bits to exactly the right place. I'm not sure that there is.
> 
>>> <snip>
>>>
>>>> @@ -589,6 +758,10 @@ nl_parse_single_action(struct nlattr *action, struct
>>>> tc_flower *flower)
>>>>            nl_parse_act_vlan(act_options, flower);
>>>>        } else if (!strcmp(act_kind, "tunnel_key")) {
>>>>            nl_parse_act_tunnel_key(act_options, flower);
>>>> +    } else if (!strcmp(act_kind, "pedit")) {
>>>> +        nl_parse_act_pedit(act_options, flower);
>>>> +    } else if (!strcmp(act_kind, "csum")) {
>>>> +        /* not doing anything for now */
>>>
>>>
>>> Should we log a message that we're not handling csums?
>>>
>>
>> We aren't handling them because we don't need to, OVS has an implicit
>> csum recalculation of rewritten packet headers.
> 
> OK.
> 
>>> <snip>
>>>
>>>> +static int
>>>> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
>>>> +                                 struct tc_flower *flower)
>>>> +{
>>>> +    struct {
>>>> +        struct tc_pedit sel;
>>>> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
>>>> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
>>>> +    } sel = {
>>>> +        .sel = {
>>>> +            .nkeys = 0
>>>> +        }
>>>> +    };
>>>> +    int i, j;
>>>> +
>>>> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
>>>> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
>>>> +        struct tc_pedit_key *pedit_key = NULL;
>>>> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
>>>> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
>>>> +        int cnt = 0, cur_offset = 0;
>>>> +
>>>> +        if (!m->size) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        calc_offsets(flower, m, i, &cur_offset, &cnt, &last_word_mask,
>>>> +                     &first_word_mask, &mask, &data);
>>>> +
>>>> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
>>>> +            uint32_t mask_word = *mask;
>>>> +
>>>> +            if (j == 0) {
>>>> +                mask_word &= first_word_mask;
>>>> +            }
>>>> +            if (j == cnt - 1) {
>>>> +                mask_word &= last_word_mask;
>>>> +            }
>>>> +            if (!mask_word) {
>>>> +                continue;
>>>> +            }
>>>> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
>>>> +                VLOG_ERR_RL(&error_rl, "reached too many pedit offsets:
>>>> %d",
>>>> +                            MAX_PEDIT_OFFSETS);
>>>> +                return EOPNOTSUPP;
>>>> +            }
>>>
>>>
>>> Just wondering if this should be err or warn, since presumably the
>>> result is that such flows avoid TC and should end up being correctly
>>> handled by the openvswitch kernel datapath?
>>
>>
>> Right, we'll change that.
> 
> Thanks.
> 
>>>> @@ -1062,7 +1422,7 @@ nl_msg_put_flower_options(struct ofpbuf *request,
>>>> struct tc_flower *flower)
>>>>            nl_msg_put_flower_tunnel(request, flower);
>>>>        }
>>>>
>>>> -    nl_msg_put_flower_acts(request, flower);
>>>> +    return 0;
>>>>    }
>>>
>>>
>>> The above snippet seems like it could be logically separate and kept
>>> as a different patch, as it shouldn't affect behaviour at all but it
>>> does actually introduce additional error checking. This could assist
>>> git bisect if necessary.
>>
>>
>> We'll do that, thanks.
> 
> OK.
> 
>>>
>>>>
>>>>    int
>>>> @@ -1085,11 +1445,17 @@ tc_replace_flower(int ifindex, uint16_t prio,
>>>> uint32_t handle,
>>>>        nl_msg_put_string(&request, TCA_KIND, "flower");
>>>>        basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>>>>        {
>>>> -        nl_msg_put_flower_options(&request, flower);
>>>> +        error = nl_msg_put_flower_options(&request, flower);
>>>> +
>>>> +        if (error) {
>>>> +            ofpbuf_uninit(&request);
>>>> +            return error;
>>>> +        }
>>>>        }
>>>>        nl_msg_end_nested(&request, basic_offset);
>>>>
>>>>        error = tc_transact(&request, &reply);
>>>> +
>>>
>>>
>>> Random extra newline?
>>
>>
>> Will be removed, thanks.
>>
>>
>>>
>>>>        if (!error) {
>>>>            struct tcmsg *tc =
>>>>                ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
>>>> diff --git a/lib/tc.h b/lib/tc.h
>>>> index 5f363d0..2269a22 100644
>>>> --- a/lib/tc.h
>>>> +++ b/lib/tc.h
>>>> @@ -93,6 +93,7 @@ struct tc_flower_key {
>>>>        struct {
>>>>            ovs_be32 ipv4_src;
>>>>            ovs_be32 ipv4_dst;
>>>> +        uint8_t rewrite_ttl;
>>>>        } ipv4;
>>>>        struct {
>>>>            struct in6_addr ipv6_src;
>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>        uint64_t lastused;
>>>>
>>>>        struct {
>>>> +        bool rewrite;
>>>> +        uint8_t pad1[3];
>>>> +        struct tc_flower_key key;
>>>> +        uint8_t pad2[3];
>>>> +        struct tc_flower_key mask;
>>>> +        uint8_t pad3[3];
>>>> +    } rewrite;
> 
> Now that I get why the pads are here.. ;)
> 
> Is there an existing macro we can use to ensure that these pad out to
> 32-bit boundaries?
> 

I'm not sure if that's possible, the size is a minimum of extra 24bits, 
so its can't overflow with writing on any address below it. The compiler
might add some extra padding but that shouldn't matter.


>>>> +
>>>> +    uint32_t csum_update_flags;
>>>> +
>>>> +    struct {
>>>>            bool set;
>>>>            ovs_be64 id;
>>>>            ovs_be16 tp_src;
>>>
>>>
>>> This is maybe an aside, but it seems like 'rewrite_ttl' already exists
>>> in 'struct tc_flower_key'. Should it also be in 'struct tc_flower'
>>> like this?
>>>
>>
>> On latest master, we have ip_ttl in struct tc_flower, which is for matching
>> on ipv6/ipv4 ttl/tos.
>> This one is for rewriting of ttl, we need to distinguish between
>> ipv4 ttl and ipv6 ttl to translate them to different action pedit
>> offsets/layers, we didn't want to add ip protocol dependencies in the map.
Paul Blakey Aug. 17, 2017, 9:21 a.m. | #6
On 14/08/2017 16:00, Simon Horman wrote:
> On Tue, Aug 08, 2017 at 05:21:53PM +0300, Roi Dayan wrote:
>> From: Paul Blakey <paulb@mellanox.com>
>>
>> To be later used to implement ovs action set offloading.
>>
>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>> ---
>>   lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   lib/tc.h |  12 +++
>>   2 files changed, 381 insertions(+), 3 deletions(-)
>>
>> diff --git a/lib/tc.c b/lib/tc.c
> 
> ...
> 
>> @@ -589,6 +758,10 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
>>           nl_parse_act_vlan(act_options, flower);
>>       } else if (!strcmp(act_kind, "tunnel_key")) {
>>           nl_parse_act_tunnel_key(act_options, flower);
>> +    } else if (!strcmp(act_kind, "pedit")) {
>> +        nl_parse_act_pedit(act_options, flower);
>> +    } else if (!strcmp(act_kind, "csum")) {
>> +        /* not doing anything for now */
>>       } else {
>>           VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
>>           return EINVAL;
>> @@ -790,6 +963,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
>>   }
>>   
>>   static void
>> +nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
>> +{
>> +    size_t offset;
>> +
>> +    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
>> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
>> +    {
>> +        struct tc_csum parm = { .action = TC_ACT_PIPE,
>> +                                .update_flags = flags };
> 
> I think it would be a bit nicer to move param to the top of the function
> and remove the extra { } scope it is currently inside.
> 
>> +
>> +        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
>> +    }
>> +    nl_msg_end_nested(request, offset);
>> +}
>> +
>> +static void
>> +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
>> +                     struct tc_pedit_key_ex *ex)
>> +{
>> +    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
>> +    size_t offset, offset_keys_ex, offset_key;
>> +    int i;
>> +
>> +    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
>> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
>> +    {
> 
> The extra { } scope here seems unnecessary.
> 
>> +        parm->action = TC_ACT_PIPE;
>> +
>> +        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
>> +        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
>> +        for (i = 0; i < parm->nkeys; i++, ex++) {
>> +            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
>> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
>> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
>> +            nl_msg_end_nested(request, offset_key);
>> +        }
>> +        nl_msg_end_nested(request, offset_keys_ex);
>> +    }
>> +    nl_msg_end_nested(request, offset);
>> +}
>> +
>> +static void
>>   nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
>>   {
>>       size_t offset;
> 
> ...
> 

Hi and thanks for the review,

The above style corresponds with the rest of the file (scope and 
nl_msg_put_act_* funcs)
Maybe we do a style patch that remove all this extra scopes in a later 
patch?
Paul Blakey Aug. 17, 2017, 12:27 p.m. | #7
On 15/08/2017 20:04, Joe Stringer wrote:
> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>
>>
>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>
>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>
>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>

<snip>


>>>> @@ -1062,7 +1422,7 @@ nl_msg_put_flower_options(struct ofpbuf *request,
>>>> struct tc_flower *flower)
>>>>            nl_msg_put_flower_tunnel(request, flower);
>>>>        }
>>>>
>>>> -    nl_msg_put_flower_acts(request, flower);
>>>> +    return 0;
>>>>    }
>>>
>>>
>>> The above snippet seems like it could be logically separate and kept
>>> as a different patch, as it shouldn't affect behaviour at all but it
>>> does actually introduce additional error checking. This could assist
>>> git bisect if necessary.
>>
>>
>> We'll do that, thanks.
> 
> OK.
> 

Looking at this again, we only changed the return from void to int 
because of this change which can fail, so we can't separate this without 
a commit that just always returns 0.
Joe Stringer Aug. 17, 2017, 9:52 p.m. | #8
On 17 August 2017 at 02:18, Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 15/08/2017 20:04, Joe Stringer wrote:
>>
>> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>>
>>>
>>>
>>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>>
>>>>
>>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>>
>>>>>
>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>
>>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>>        uint64_t lastused;
>>>>>
>>>>>        struct {
>>>>> +        bool rewrite;
>>>>> +        uint8_t pad1[3];
>>>>> +        struct tc_flower_key key;
>>>>> +        uint8_t pad2[3];
>>>>> +        struct tc_flower_key mask;
>>>>> +        uint8_t pad3[3];
>>>>> +    } rewrite;
>>
>>
>> Now that I get why the pads are here.. ;)
>>
>> Is there an existing macro we can use to ensure that these pad out to
>> 32-bit boundaries?
>>
>
> I'm not sure if that's possible, the size is a minimum of extra 24bits, so
> its can't overflow with writing on any address below it. The compiler
> might add some extra padding but that shouldn't matter.

My thought was that the main concern here is to align the fields to
32-bit boundaries, so if it already does this then I wouldn't expect
to need any padding at all? For instance, on my system with these
patches I see with pahole that 'struct tc_flower_key' has size 84, and
the 'pad2' and 'pad3' appear to be unnecessary:

        struct {
               _Bool              rewrite;              /*   216     1 */
               uint8_t            pad1[0];              /*   217     0 */
               struct tc_flower_key key;                /*   220    84 */
               /* --- cacheline 1 boundary (64 bytes) was 21 bytes ago --- */
               uint8_t            pad2[0];              /*   304     0 */
               struct tc_flower_key mask;               /*   308    84 */
               /* --- cacheline 2 boundary (128 bytes) was 41 bytes ago --- */
               uint8_t            pad3[0];              /*   392     0 */
       } rewrite;                                       /*   216   180 */

However, if in future someone adds a 1-byte member to tc_flower_key
then I'm not sure that this alignment will hold. On my system, it
seems like that would end up padding tc_flower_key out to 88B so these
extra padding would still be unnecessary (although pahole says that it
has a brain fart, so I'm not sure exactly how much confidence I should
have in this).

What I was thinking about was whether we could use something like
PADDED_MEMBERS() in this case.
Paul Blakey Aug. 20, 2017, 8:50 a.m. | #9
On 18/08/2017 00:52, Joe Stringer wrote:
> On 17 August 2017 at 02:18, Paul Blakey <paulb@mellanox.com> wrote:
>>
>>
>> On 15/08/2017 20:04, Joe Stringer wrote:
>>>
>>> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>>>
>>>>
>>>>
>>>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>>>
>>>>>
>>>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>>>
>>>>>>
>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>
>>>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>>>         uint64_t lastused;
>>>>>>
>>>>>>         struct {
>>>>>> +        bool rewrite;
>>>>>> +        uint8_t pad1[3];
>>>>>> +        struct tc_flower_key key;
>>>>>> +        uint8_t pad2[3];
>>>>>> +        struct tc_flower_key mask;
>>>>>> +        uint8_t pad3[3];
>>>>>> +    } rewrite;
>>>
>>>
>>> Now that I get why the pads are here.. ;)
>>>
>>> Is there an existing macro we can use to ensure that these pad out to
>>> 32-bit boundaries?
>>>
>>
>> I'm not sure if that's possible, the size is a minimum of extra 24bits, so
>> its can't overflow with writing on any address below it. The compiler
>> might add some extra padding but that shouldn't matter.
> 
> My thought was that the main concern here is to align the fields to
> 32-bit boundaries, so if it already does this then I wouldn't expect
> to need any padding at all? For instance, on my system with these
> patches I see with pahole that 'struct tc_flower_key' has size 84, and
> the 'pad2' and 'pad3' appear to be unnecessary:
> 
>          struct {
>                 _Bool              rewrite;              /*   216     1 */
>                 uint8_t            pad1[0];              /*   217     0 */
>                 struct tc_flower_key key;                /*   220    84 */
>                 /* --- cacheline 1 boundary (64 bytes) was 21 bytes ago --- */
>                 uint8_t            pad2[0];              /*   304     0 */
>                 struct tc_flower_key mask;               /*   308    84 */
>                 /* --- cacheline 2 boundary (128 bytes) was 41 bytes ago --- */
>                 uint8_t            pad3[0];              /*   392     0 */
>         } rewrite;                                       /*   216   180 */
> 
> However, if in future someone adds a 1-byte member to tc_flower_key
> then I'm not sure that this alignment will hold. On my system, it
> seems like that would end up padding tc_flower_key out to 88B so these
> extra padding would still be unnecessary (although pahole says that it
> has a brain fart, so I'm not sure exactly how much confidence I should
> have in this).
> 
> What I was thinking about was whether we could use something like
> PADDED_MEMBERS() in this case.
> 

Are you talking about alignment in regards to performance?
Because the padding is there for overflowing.
If someone adds a new member to struct key/mask that is smaller than a 
32bits to the end of the struct, setting it via a pointer might overflow 
the struct. I don't think PADDED_MEMBERS will work for this type of padding.

We do mask the write to the write size, and it should still be in memory 
of owned by struct flower and since the compiler can't reorder the 
struct we actually only need the last padding to cover the above case.

Maybe we can add alignment when we measure the performance of the code?
Joe Stringer Aug. 21, 2017, 5:45 p.m. | #10
On 20 August 2017 at 01:50, Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 18/08/2017 00:52, Joe Stringer wrote:
>>
>> On 17 August 2017 at 02:18, Paul Blakey <paulb@mellanox.com> wrote:
>>>
>>>
>>>
>>> On 15/08/2017 20:04, Joe Stringer wrote:
>>>>
>>>>
>>>> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>>
>>>>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>>>>         uint64_t lastused;
>>>>>>>
>>>>>>>         struct {
>>>>>>> +        bool rewrite;
>>>>>>> +        uint8_t pad1[3];
>>>>>>> +        struct tc_flower_key key;
>>>>>>> +        uint8_t pad2[3];
>>>>>>> +        struct tc_flower_key mask;
>>>>>>> +        uint8_t pad3[3];
>>>>>>> +    } rewrite;
>>>>
>>>>
>>>>
>>>> Now that I get why the pads are here.. ;)
>>>>
>>>> Is there an existing macro we can use to ensure that these pad out to
>>>> 32-bit boundaries?
>>>>
>>>
>>> I'm not sure if that's possible, the size is a minimum of extra 24bits,
>>> so
>>> its can't overflow with writing on any address below it. The compiler
>>> might add some extra padding but that shouldn't matter.
>>
>>
>> My thought was that the main concern here is to align the fields to
>> 32-bit boundaries, so if it already does this then I wouldn't expect
>> to need any padding at all? For instance, on my system with these
>> patches I see with pahole that 'struct tc_flower_key' has size 84, and
>> the 'pad2' and 'pad3' appear to be unnecessary:
>>
>>          struct {
>>                 _Bool              rewrite;              /*   216     1 */
>>                 uint8_t            pad1[0];              /*   217     0 */
>>                 struct tc_flower_key key;                /*   220    84 */
>>                 /* --- cacheline 1 boundary (64 bytes) was 21 bytes ago
>> --- */
>>                 uint8_t            pad2[0];              /*   304     0 */
>>                 struct tc_flower_key mask;               /*   308    84 */
>>                 /* --- cacheline 2 boundary (128 bytes) was 41 bytes ago
>> --- */
>>                 uint8_t            pad3[0];              /*   392     0 */
>>         } rewrite;                                       /*   216   180 */
>>
>> However, if in future someone adds a 1-byte member to tc_flower_key
>> then I'm not sure that this alignment will hold. On my system, it
>> seems like that would end up padding tc_flower_key out to 88B so these
>> extra padding would still be unnecessary (although pahole says that it
>> has a brain fart, so I'm not sure exactly how much confidence I should
>> have in this).
>>
>> What I was thinking about was whether we could use something like
>> PADDED_MEMBERS() in this case.
>>
>
> Are you talking about alignment in regards to performance?
> Because the padding is there for overflowing.
> If someone adds a new member to struct key/mask that is smaller than a
> 32bits to the end of the struct, setting it via a pointer might overflow the
> struct. I don't think PADDED_MEMBERS will work for this type of padding.
>
> We do mask the write to the write size, and it should still be in memory of
> owned by struct flower and since the compiler can't reorder the struct we
> actually only need the last padding to cover the above case.
>
> Maybe we can add alignment when we measure the performance of the code?

Ah. I wasn't concerned about performance, I was just wondering if this
forces us to add a few extra unnecessary bytes to 'rewrite' regardless
of the size of 'struct tc_flower'. For instance, if 'struct tc_flower'
is 63B long because someone adds a small field to the end of the
structure, then I'd expect to need only one extra byte of padding. If
it were a multiple of 32 bits, I wouldn't expect any need for padding
at all.
Simon Horman Aug. 22, 2017, 8:29 a.m. | #11
On Thu, Aug 17, 2017 at 12:21:43PM +0300, Paul Blakey wrote:
> 
> 
> On 14/08/2017 16:00, Simon Horman wrote:
> >On Tue, Aug 08, 2017 at 05:21:53PM +0300, Roi Dayan wrote:
> >>From: Paul Blakey <paulb@mellanox.com>
> >>
> >>To be later used to implement ovs action set offloading.
> >>
> >>Signed-off-by: Paul Blakey <paulb@mellanox.com>
> >>Reviewed-by: Roi Dayan <roid@mellanox.com>
> >>---
> >>  lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >>  lib/tc.h |  12 +++
> >>  2 files changed, 381 insertions(+), 3 deletions(-)

...

> >>+static void
> >>  nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
> >>  {
> >>      size_t offset;
> 
> Hi and thanks for the review,
> 
> The above style corresponds with the rest of the file (scope and
> nl_msg_put_act_* funcs)
> Maybe we do a style patch that remove all this extra scopes in a later
> patch?

Sure, that is fine by me.
Paul Blakey Aug. 22, 2017, 8:07 p.m. | #12
On 21/08/2017 20:45, Joe Stringer wrote:
> On 20 August 2017 at 01:50, Paul Blakey <paulb@mellanox.com> wrote:
>>
>>
>> On 18/08/2017 00:52, Joe Stringer wrote:
>>>
>>> On 17 August 2017 at 02:18, Paul Blakey <paulb@mellanox.com> wrote:
>>>>
>>>>
>>>>
>>>> On 15/08/2017 20:04, Joe Stringer wrote:
>>>>>
>>>>>
>>>>> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>>>
>>>>>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>>>>>          uint64_t lastused;
>>>>>>>>
>>>>>>>>          struct {
>>>>>>>> +        bool rewrite;
>>>>>>>> +        uint8_t pad1[3];
>>>>>>>> +        struct tc_flower_key key;
>>>>>>>> +        uint8_t pad2[3];
>>>>>>>> +        struct tc_flower_key mask;
>>>>>>>> +        uint8_t pad3[3];
>>>>>>>> +    } rewrite;
>>>>>
>>>>>
>>>>>
>>>>> Now that I get why the pads are here.. ;)
>>>>>
>>>>> Is there an existing macro we can use to ensure that these pad out to
>>>>> 32-bit boundaries?
>>>>>
>>>>
>>>> I'm not sure if that's possible, the size is a minimum of extra 24bits,
>>>> so
>>>> its can't overflow with writing on any address below it. The compiler
>>>> might add some extra padding but that shouldn't matter.
>>>
>>>
>>> My thought was that the main concern here is to align the fields to
>>> 32-bit boundaries, so if it already does this then I wouldn't expect
>>> to need any padding at all? For instance, on my system with these
>>> patches I see with pahole that 'struct tc_flower_key' has size 84, and
>>> the 'pad2' and 'pad3' appear to be unnecessary:
>>>
>>>           struct {
>>>                  _Bool              rewrite;              /*   216     1 */
>>>                  uint8_t            pad1[0];              /*   217     0 */
>>>                  struct tc_flower_key key;                /*   220    84 */
>>>                  /* --- cacheline 1 boundary (64 bytes) was 21 bytes ago
>>> --- */
>>>                  uint8_t            pad2[0];              /*   304     0 */
>>>                  struct tc_flower_key mask;               /*   308    84 */
>>>                  /* --- cacheline 2 boundary (128 bytes) was 41 bytes ago
>>> --- */
>>>                  uint8_t            pad3[0];              /*   392     0 */
>>>          } rewrite;                                       /*   216   180 */
>>>
>>> However, if in future someone adds a 1-byte member to tc_flower_key
>>> then I'm not sure that this alignment will hold. On my system, it
>>> seems like that would end up padding tc_flower_key out to 88B so these
>>> extra padding would still be unnecessary (although pahole says that it
>>> has a brain fart, so I'm not sure exactly how much confidence I should
>>> have in this).
>>>
>>> What I was thinking about was whether we could use something like
>>> PADDED_MEMBERS() in this case.
>>>
>>
>> Are you talking about alignment in regards to performance?
>> Because the padding is there for overflowing.
>> If someone adds a new member to struct key/mask that is smaller than a
>> 32bits to the end of the struct, setting it via a pointer might overflow the
>> struct. I don't think PADDED_MEMBERS will work for this type of padding.
>>
>> We do mask the write to the write size, and it should still be in memory of
>> owned by struct flower and since the compiler can't reorder the struct we
>> actually only need the last padding to cover the above case.
>>
>> Maybe we can add alignment when we measure the performance of the code?
> 
> Ah. I wasn't concerned about performance, I was just wondering if this
> forces us to add a few extra unnecessary bytes to 'rewrite' regardless
> of the size of 'struct tc_flower'. For instance, if 'struct tc_flower'
> is 63B long because someone adds a small field to the end of the
> structure, then I'd expect to need only one extra byte of padding. If
> it were a multiple of 32 bits, I wouldn't expect any need for padding
> at all.
> 

I see. You're right but keep in mind that this struct is on the stack 
and a couple of bytes shouldn't matter much. I'm not sure how to define 
a macro that adds padding  based on the last member in the struct unless 
I do define it like PADDED_MEMBERS but with the last member isolated from
the rest (e.g PAD_TO_LAST(unit, members, last_member)) so I can test 
it's size. A build time assert would be the same.

Since we mask the writes/reads, Maybe we can assert whether reading 
32bit (or actually 24bit) past the struct rewrite is within struct 
flower bounds. Basically that no one moved it to the end of struct flower.

pseudo code: BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) + 
sizeof(uint32_t) < sizeof(struct flower))


need to be careful that rewrite struct will always be inside flower.
Paul Blakey Aug. 22, 2017, 8:31 p.m. | #13
On 22/08/2017 23:07, Paul Blakey wrote:
> 
> 
> On 21/08/2017 20:45, Joe Stringer wrote:
>> On 20 August 2017 at 01:50, Paul Blakey <paulb@mellanox.com> wrote:
>>>
>>>
>>> On 18/08/2017 00:52, Joe Stringer wrote:
>>>>
>>>> On 17 August 2017 at 02:18, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 15/08/2017 20:04, Joe Stringer wrote:
>>>>>>
>>>>>>
>>>>>> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>>>>
>>>>>>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>>>>>>          uint64_t lastused;
>>>>>>>>>
>>>>>>>>>          struct {
>>>>>>>>> +        bool rewrite;
>>>>>>>>> +        uint8_t pad1[3];
>>>>>>>>> +        struct tc_flower_key key;
>>>>>>>>> +        uint8_t pad2[3];
>>>>>>>>> +        struct tc_flower_key mask;
>>>>>>>>> +        uint8_t pad3[3];
>>>>>>>>> +    } rewrite;
>>>>>>
>>>>>>
>>>>>>
>>>>>> Now that I get why the pads are here.. ;)
>>>>>>
>>>>>> Is there an existing macro we can use to ensure that these pad out to
>>>>>> 32-bit boundaries?
>>>>>>
>>>>>
>>>>> I'm not sure if that's possible, the size is a minimum of extra 
>>>>> 24bits,
>>>>> so
>>>>> its can't overflow with writing on any address below it. The compiler
>>>>> might add some extra padding but that shouldn't matter.
>>>>
>>>>
>>>> My thought was that the main concern here is to align the fields to
>>>> 32-bit boundaries, so if it already does this then I wouldn't expect
>>>> to need any padding at all? For instance, on my system with these
>>>> patches I see with pahole that 'struct tc_flower_key' has size 84, and
>>>> the 'pad2' and 'pad3' appear to be unnecessary:
>>>>
>>>>           struct {
>>>>                  _Bool              rewrite;              /*   
>>>> 216     1 */
>>>>                  uint8_t            pad1[0];              /*   
>>>> 217     0 */
>>>>                  struct tc_flower_key key;                /*   
>>>> 220    84 */
>>>>                  /* --- cacheline 1 boundary (64 bytes) was 21 bytes 
>>>> ago
>>>> --- */
>>>>                  uint8_t            pad2[0];              /*   
>>>> 304     0 */
>>>>                  struct tc_flower_key mask;               /*   
>>>> 308    84 */
>>>>                  /* --- cacheline 2 boundary (128 bytes) was 41 
>>>> bytes ago
>>>> --- */
>>>>                  uint8_t            pad3[0];              /*   
>>>> 392     0 */
>>>>          } rewrite;                                       /*   216   
>>>> 180 */
>>>>
>>>> However, if in future someone adds a 1-byte member to tc_flower_key
>>>> then I'm not sure that this alignment will hold. On my system, it
>>>> seems like that would end up padding tc_flower_key out to 88B so these
>>>> extra padding would still be unnecessary (although pahole says that it
>>>> has a brain fart, so I'm not sure exactly how much confidence I should
>>>> have in this).
>>>>
>>>> What I was thinking about was whether we could use something like
>>>> PADDED_MEMBERS() in this case.
>>>>
>>>
>>> Are you talking about alignment in regards to performance?
>>> Because the padding is there for overflowing.
>>> If someone adds a new member to struct key/mask that is smaller than a
>>> 32bits to the end of the struct, setting it via a pointer might 
>>> overflow the
>>> struct. I don't think PADDED_MEMBERS will work for this type of padding.
>>>
>>> We do mask the write to the write size, and it should still be in 
>>> memory of
>>> owned by struct flower and since the compiler can't reorder the 
>>> struct we
>>> actually only need the last padding to cover the above case.
>>>
>>> Maybe we can add alignment when we measure the performance of the code?
>>
>> Ah. I wasn't concerned about performance, I was just wondering if this
>> forces us to add a few extra unnecessary bytes to 'rewrite' regardless
>> of the size of 'struct tc_flower'. For instance, if 'struct tc_flower'
>> is 63B long because someone adds a small field to the end of the
>> structure, then I'd expect to need only one extra byte of padding. If
>> it were a multiple of 32 bits, I wouldn't expect any need for padding
>> at all.
>>
> 
> I see. You're right but keep in mind that this struct is on the stack 
> and a couple of bytes shouldn't matter much. I'm not sure how to define 
> a macro that adds padding  based on the last member in the struct unless 
> I do define it like PADDED_MEMBERS but with the last member isolated from
> the rest (e.g PAD_TO_LAST(unit, members, last_member)) so I can test 
> it's size. A build time assert would be the same.
> 
> Since we mask the writes/reads, Maybe we can assert whether reading 
> 32bit (or actually 24bit) past the struct rewrite is within struct 
> flower bounds. Basically that no one moved it to the end of struct flower.
> 
> pseudo code: BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) + 
> sizeof(uint32_t) < sizeof(struct flower))

BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) + 
sizeof(flower.rewrite) + sizeof(uint32_t) -1 < sizeof(struct flower))

> 
> 
> need to be careful that rewrite struct will always be inside flower.
> 
>
Joe Stringer Aug. 23, 2017, 12:20 a.m. | #14
On 22 August 2017 at 13:31, Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 22/08/2017 23:07, Paul Blakey wrote:
>>
>>
>>
>> On 21/08/2017 20:45, Joe Stringer wrote:
>>>
>>> On 20 August 2017 at 01:50, Paul Blakey <paulb@mellanox.com> wrote:
>>>>
>>>>
>>>>
>>>> On 18/08/2017 00:52, Joe Stringer wrote:
>>>>>
>>>>>
>>>>> On 17 August 2017 at 02:18, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 15/08/2017 20:04, Joe Stringer wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>>>>>
>>>>>>>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>>>>>>>          uint64_t lastused;
>>>>>>>>>>
>>>>>>>>>>          struct {
>>>>>>>>>> +        bool rewrite;
>>>>>>>>>> +        uint8_t pad1[3];
>>>>>>>>>> +        struct tc_flower_key key;
>>>>>>>>>> +        uint8_t pad2[3];
>>>>>>>>>> +        struct tc_flower_key mask;
>>>>>>>>>> +        uint8_t pad3[3];
>>>>>>>>>> +    } rewrite;
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Now that I get why the pads are here.. ;)
>>>>>>>
>>>>>>> Is there an existing macro we can use to ensure that these pad out to
>>>>>>> 32-bit boundaries?
>>>>>>>
>>>>>>
>>>>>> I'm not sure if that's possible, the size is a minimum of extra
>>>>>> 24bits,
>>>>>> so
>>>>>> its can't overflow with writing on any address below it. The compiler
>>>>>> might add some extra padding but that shouldn't matter.
>>>>>
>>>>>
>>>>>
>>>>> My thought was that the main concern here is to align the fields to
>>>>> 32-bit boundaries, so if it already does this then I wouldn't expect
>>>>> to need any padding at all? For instance, on my system with these
>>>>> patches I see with pahole that 'struct tc_flower_key' has size 84, and
>>>>> the 'pad2' and 'pad3' appear to be unnecessary:
>>>>>
>>>>>           struct {
>>>>>                  _Bool              rewrite;              /*   216
>>>>> 1 */
>>>>>                  uint8_t            pad1[0];              /*   217
>>>>> 0 */
>>>>>                  struct tc_flower_key key;                /*   220
>>>>> 84 */
>>>>>                  /* --- cacheline 1 boundary (64 bytes) was 21 bytes
>>>>> ago
>>>>> --- */
>>>>>                  uint8_t            pad2[0];              /*   304
>>>>> 0 */
>>>>>                  struct tc_flower_key mask;               /*   308
>>>>> 84 */
>>>>>                  /* --- cacheline 2 boundary (128 bytes) was 41 bytes
>>>>> ago
>>>>> --- */
>>>>>                  uint8_t            pad3[0];              /*   392
>>>>> 0 */
>>>>>          } rewrite;                                       /*   216
>>>>> 180 */
>>>>>
>>>>> However, if in future someone adds a 1-byte member to tc_flower_key
>>>>> then I'm not sure that this alignment will hold. On my system, it
>>>>> seems like that would end up padding tc_flower_key out to 88B so these
>>>>> extra padding would still be unnecessary (although pahole says that it
>>>>> has a brain fart, so I'm not sure exactly how much confidence I should
>>>>> have in this).
>>>>>
>>>>> What I was thinking about was whether we could use something like
>>>>> PADDED_MEMBERS() in this case.
>>>>>
>>>>
>>>> Are you talking about alignment in regards to performance?
>>>> Because the padding is there for overflowing.
>>>> If someone adds a new member to struct key/mask that is smaller than a
>>>> 32bits to the end of the struct, setting it via a pointer might overflow
>>>> the
>>>> struct. I don't think PADDED_MEMBERS will work for this type of padding.
>>>>
>>>> We do mask the write to the write size, and it should still be in memory
>>>> of
>>>> owned by struct flower and since the compiler can't reorder the struct
>>>> we
>>>> actually only need the last padding to cover the above case.
>>>>
>>>> Maybe we can add alignment when we measure the performance of the code?
>>>
>>>
>>> Ah. I wasn't concerned about performance, I was just wondering if this
>>> forces us to add a few extra unnecessary bytes to 'rewrite' regardless
>>> of the size of 'struct tc_flower'. For instance, if 'struct tc_flower'
>>> is 63B long because someone adds a small field to the end of the
>>> structure, then I'd expect to need only one extra byte of padding. If
>>> it were a multiple of 32 bits, I wouldn't expect any need for padding
>>> at all.
>>>
>>
>> I see. You're right but keep in mind that this struct is on the stack and
>> a couple of bytes shouldn't matter much. I'm not sure how to define a macro
>> that adds padding  based on the last member in the struct unless I do define
>> it like PADDED_MEMBERS but with the last member isolated from
>> the rest (e.g PAD_TO_LAST(unit, members, last_member)) so I can test it's
>> size. A build time assert would be the same.
>>
>> Since we mask the writes/reads, Maybe we can assert whether reading 32bit
>> (or actually 24bit) past the struct rewrite is within struct flower bounds.
>> Basically that no one moved it to the end of struct flower.
>>
>> pseudo code: BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) +
>> sizeof(uint32_t) < sizeof(struct flower))
>
>
> BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) + sizeof(flower.rewrite) +
> sizeof(uint32_t) -1 < sizeof(struct flower))

Could we just check that (member_offsetof(flower.rewrite) +
sizeof(flower.rewrite)) % sizeof(uint32_t) == 0 ?

Also for the flower.rewrite.key and flower.rewrite.mask ?
Paul Blakey Aug. 24, 2017, 7:03 a.m. | #15
On 23/08/2017 03:20, Joe Stringer wrote:
> On 22 August 2017 at 13:31, Paul Blakey <paulb@mellanox.com> wrote:
>>
>>
>> On 22/08/2017 23:07, Paul Blakey wrote:
>>>
>>>
>>>
>>> On 21/08/2017 20:45, Joe Stringer wrote:
>>>>
>>>> On 20 August 2017 at 01:50, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 18/08/2017 00:52, Joe Stringer wrote:
>>>>>>
>>>>>>
>>>>>> On 17 August 2017 at 02:18, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 15/08/2017 20:04, Joe Stringer wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>>>>>>
>>>>>>>>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>>>>>>>>           uint64_t lastused;
>>>>>>>>>>>
>>>>>>>>>>>           struct {
>>>>>>>>>>> +        bool rewrite;
>>>>>>>>>>> +        uint8_t pad1[3];
>>>>>>>>>>> +        struct tc_flower_key key;
>>>>>>>>>>> +        uint8_t pad2[3];
>>>>>>>>>>> +        struct tc_flower_key mask;
>>>>>>>>>>> +        uint8_t pad3[3];
>>>>>>>>>>> +    } rewrite;
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Now that I get why the pads are here.. ;)
>>>>>>>>
>>>>>>>> Is there an existing macro we can use to ensure that these pad out to
>>>>>>>> 32-bit boundaries?
>>>>>>>>
>>>>>>>
>>>>>>> I'm not sure if that's possible, the size is a minimum of extra
>>>>>>> 24bits,
>>>>>>> so
>>>>>>> its can't overflow with writing on any address below it. The compiler
>>>>>>> might add some extra padding but that shouldn't matter.
>>>>>>
>>>>>>
>>>>>>
>>>>>> My thought was that the main concern here is to align the fields to
>>>>>> 32-bit boundaries, so if it already does this then I wouldn't expect
>>>>>> to need any padding at all? For instance, on my system with these
>>>>>> patches I see with pahole that 'struct tc_flower_key' has size 84, and
>>>>>> the 'pad2' and 'pad3' appear to be unnecessary:
>>>>>>
>>>>>>            struct {
>>>>>>                   _Bool              rewrite;              /*   216
>>>>>> 1 */
>>>>>>                   uint8_t            pad1[0];              /*   217
>>>>>> 0 */
>>>>>>                   struct tc_flower_key key;                /*   220
>>>>>> 84 */
>>>>>>                   /* --- cacheline 1 boundary (64 bytes) was 21 bytes
>>>>>> ago
>>>>>> --- */
>>>>>>                   uint8_t            pad2[0];              /*   304
>>>>>> 0 */
>>>>>>                   struct tc_flower_key mask;               /*   308
>>>>>> 84 */
>>>>>>                   /* --- cacheline 2 boundary (128 bytes) was 41 bytes
>>>>>> ago
>>>>>> --- */
>>>>>>                   uint8_t            pad3[0];              /*   392
>>>>>> 0 */
>>>>>>           } rewrite;                                       /*   216
>>>>>> 180 */
>>>>>>
>>>>>> However, if in future someone adds a 1-byte member to tc_flower_key
>>>>>> then I'm not sure that this alignment will hold. On my system, it
>>>>>> seems like that would end up padding tc_flower_key out to 88B so these
>>>>>> extra padding would still be unnecessary (although pahole says that it
>>>>>> has a brain fart, so I'm not sure exactly how much confidence I should
>>>>>> have in this).
>>>>>>
>>>>>> What I was thinking about was whether we could use something like
>>>>>> PADDED_MEMBERS() in this case.
>>>>>>
>>>>>
>>>>> Are you talking about alignment in regards to performance?
>>>>> Because the padding is there for overflowing.
>>>>> If someone adds a new member to struct key/mask that is smaller than a
>>>>> 32bits to the end of the struct, setting it via a pointer might overflow
>>>>> the
>>>>> struct. I don't think PADDED_MEMBERS will work for this type of padding.
>>>>>
>>>>> We do mask the write to the write size, and it should still be in memory
>>>>> of
>>>>> owned by struct flower and since the compiler can't reorder the struct
>>>>> we
>>>>> actually only need the last padding to cover the above case.
>>>>>
>>>>> Maybe we can add alignment when we measure the performance of the code?
>>>>
>>>>
>>>> Ah. I wasn't concerned about performance, I was just wondering if this
>>>> forces us to add a few extra unnecessary bytes to 'rewrite' regardless
>>>> of the size of 'struct tc_flower'. For instance, if 'struct tc_flower'
>>>> is 63B long because someone adds a small field to the end of the
>>>> structure, then I'd expect to need only one extra byte of padding. If
>>>> it were a multiple of 32 bits, I wouldn't expect any need for padding
>>>> at all.
>>>>
>>>
>>> I see. You're right but keep in mind that this struct is on the stack and
>>> a couple of bytes shouldn't matter much. I'm not sure how to define a macro
>>> that adds padding  based on the last member in the struct unless I do define
>>> it like PADDED_MEMBERS but with the last member isolated from
>>> the rest (e.g PAD_TO_LAST(unit, members, last_member)) so I can test it's
>>> size. A build time assert would be the same.
>>>
>>> Since we mask the writes/reads, Maybe we can assert whether reading 32bit
>>> (or actually 24bit) past the struct rewrite is within struct flower bounds.
>>> Basically that no one moved it to the end of struct flower.
>>>
>>> pseudo code: BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) +
>>> sizeof(uint32_t) < sizeof(struct flower))
>>
>>
>> BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) + sizeof(flower.rewrite) +
>> sizeof(uint32_t) -1 < sizeof(struct flower))
> 
> Could we just check that (member_offsetof(flower.rewrite) +
> sizeof(flower.rewrite)) % sizeof(uint32_t) == 0 ?
> 
> Also for the flower.rewrite.key and flower.rewrite.mask ?
> 

Maybe I'm missing something but that doesn't check that we wouldn't 
spill over to outside of flower if someone defines rewrite (moves the 
rewrite struct) and flower like this:

struct flower_key {
	uint32_t other_members[5]
	uint8_t other_members[3]
	uint8_t some_member;
}

struct rewrite {
	struct flower_key key;
	struct flower_key mask;
}

struct flower {
	uint32_t other_members[10]
	struct rewrite;
}

member_offsetof(flower.rewrite) = 40
sizeof(flower_key) = 24
sizeof(rewrite) = 48
sizeof(flower) = 88

here sizeof(rewrite) % 4 == 0 and member_offsetof(flower.rewrite) % 4 == 
0 so it won't fail. but writing uint32_t to 
offsetof(rewrite.mask.some_member) will overflow by 3 bytes.


This  BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) + 
sizeof(flower.rewrite) + sizeof(uint32_t) - 2 < sizeof(struct flower))
will catch this (40 + 48 + 4 - 2 = 90 < 88)

moving rewrite back to a safe position will pass:

struct flower {
	uint32_t other_members[10]
	struct rewrite;
	uint8_t other_members[3]
}

40 + 48 + 4 - 2 = 90 < 91

any less padding from other_members and we will fail.
Joe Stringer Aug. 26, 2017, 12:57 a.m. | #16
On 24 August 2017 at 00:03, Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 23/08/2017 03:20, Joe Stringer wrote:
>>
>> On 22 August 2017 at 13:31, Paul Blakey <paulb@mellanox.com> wrote:
>>>
>>>
>>>
>>> On 22/08/2017 23:07, Paul Blakey wrote:
>>>>
>>>>
>>>>
>>>>
>>>> On 21/08/2017 20:45, Joe Stringer wrote:
>>>>>
>>>>>
>>>>> On 20 August 2017 at 01:50, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 18/08/2017 00:52, Joe Stringer wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 17 August 2017 at 02:18, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 15/08/2017 20:04, Joe Stringer wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 15 August 2017 at 01:19, Paul Blakey <paulb@mellanox.com> wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 15/08/2017 00:56, Joe Stringer wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On 8 August 2017 at 07:21, Roi Dayan <roid@mellanox.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>>>>>>>
>>>>>>>>>>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>>>>>>>>>>           uint64_t lastused;
>>>>>>>>>>>>
>>>>>>>>>>>>           struct {
>>>>>>>>>>>> +        bool rewrite;
>>>>>>>>>>>> +        uint8_t pad1[3];
>>>>>>>>>>>> +        struct tc_flower_key key;
>>>>>>>>>>>> +        uint8_t pad2[3];
>>>>>>>>>>>> +        struct tc_flower_key mask;
>>>>>>>>>>>> +        uint8_t pad3[3];
>>>>>>>>>>>> +    } rewrite;
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Now that I get why the pads are here.. ;)
>>>>>>>>>
>>>>>>>>> Is there an existing macro we can use to ensure that these pad out
>>>>>>>>> to
>>>>>>>>> 32-bit boundaries?
>>>>>>>>>
>>>>>>>>
>>>>>>>> I'm not sure if that's possible, the size is a minimum of extra
>>>>>>>> 24bits,
>>>>>>>> so
>>>>>>>> its can't overflow with writing on any address below it. The
>>>>>>>> compiler
>>>>>>>> might add some extra padding but that shouldn't matter.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> My thought was that the main concern here is to align the fields to
>>>>>>> 32-bit boundaries, so if it already does this then I wouldn't expect
>>>>>>> to need any padding at all? For instance, on my system with these
>>>>>>> patches I see with pahole that 'struct tc_flower_key' has size 84,
>>>>>>> and
>>>>>>> the 'pad2' and 'pad3' appear to be unnecessary:
>>>>>>>
>>>>>>>            struct {
>>>>>>>                   _Bool              rewrite;              /*   216
>>>>>>> 1 */
>>>>>>>                   uint8_t            pad1[0];              /*   217
>>>>>>> 0 */
>>>>>>>                   struct tc_flower_key key;                /*   220
>>>>>>> 84 */
>>>>>>>                   /* --- cacheline 1 boundary (64 bytes) was 21 bytes
>>>>>>> ago
>>>>>>> --- */
>>>>>>>                   uint8_t            pad2[0];              /*   304
>>>>>>> 0 */
>>>>>>>                   struct tc_flower_key mask;               /*   308
>>>>>>> 84 */
>>>>>>>                   /* --- cacheline 2 boundary (128 bytes) was 41
>>>>>>> bytes
>>>>>>> ago
>>>>>>> --- */
>>>>>>>                   uint8_t            pad3[0];              /*   392
>>>>>>> 0 */
>>>>>>>           } rewrite;                                       /*   216
>>>>>>> 180 */
>>>>>>>
>>>>>>> However, if in future someone adds a 1-byte member to tc_flower_key
>>>>>>> then I'm not sure that this alignment will hold. On my system, it
>>>>>>> seems like that would end up padding tc_flower_key out to 88B so
>>>>>>> these
>>>>>>> extra padding would still be unnecessary (although pahole says that
>>>>>>> it
>>>>>>> has a brain fart, so I'm not sure exactly how much confidence I
>>>>>>> should
>>>>>>> have in this).
>>>>>>>
>>>>>>> What I was thinking about was whether we could use something like
>>>>>>> PADDED_MEMBERS() in this case.
>>>>>>>
>>>>>>
>>>>>> Are you talking about alignment in regards to performance?
>>>>>> Because the padding is there for overflowing.
>>>>>> If someone adds a new member to struct key/mask that is smaller than a
>>>>>> 32bits to the end of the struct, setting it via a pointer might
>>>>>> overflow
>>>>>> the
>>>>>> struct. I don't think PADDED_MEMBERS will work for this type of
>>>>>> padding.
>>>>>>
>>>>>> We do mask the write to the write size, and it should still be in
>>>>>> memory
>>>>>> of
>>>>>> owned by struct flower and since the compiler can't reorder the struct
>>>>>> we
>>>>>> actually only need the last padding to cover the above case.
>>>>>>
>>>>>> Maybe we can add alignment when we measure the performance of the
>>>>>> code?
>>>>>
>>>>>
>>>>>
>>>>> Ah. I wasn't concerned about performance, I was just wondering if this
>>>>> forces us to add a few extra unnecessary bytes to 'rewrite' regardless
>>>>> of the size of 'struct tc_flower'. For instance, if 'struct tc_flower'
>>>>> is 63B long because someone adds a small field to the end of the
>>>>> structure, then I'd expect to need only one extra byte of padding. If
>>>>> it were a multiple of 32 bits, I wouldn't expect any need for padding
>>>>> at all.
>>>>>
>>>>
>>>> I see. You're right but keep in mind that this struct is on the stack
>>>> and
>>>> a couple of bytes shouldn't matter much. I'm not sure how to define a
>>>> macro
>>>> that adds padding  based on the last member in the struct unless I do
>>>> define
>>>> it like PADDED_MEMBERS but with the last member isolated from
>>>> the rest (e.g PAD_TO_LAST(unit, members, last_member)) so I can test
>>>> it's
>>>> size. A build time assert would be the same.
>>>>
>>>> Since we mask the writes/reads, Maybe we can assert whether reading
>>>> 32bit
>>>> (or actually 24bit) past the struct rewrite is within struct flower
>>>> bounds.
>>>> Basically that no one moved it to the end of struct flower.
>>>>
>>>> pseudo code: BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) +
>>>> sizeof(uint32_t) < sizeof(struct flower))
>>>
>>>
>>>
>>> BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) +
>>> sizeof(flower.rewrite) +
>>> sizeof(uint32_t) -1 < sizeof(struct flower))
>>
>>
>> Could we just check that (member_offsetof(flower.rewrite) +
>> sizeof(flower.rewrite)) % sizeof(uint32_t) == 0 ?
>>
>> Also for the flower.rewrite.key and flower.rewrite.mask ?
>>
>
> Maybe I'm missing something but that doesn't check that we wouldn't spill
> over to outside of flower if someone defines rewrite (moves the rewrite
> struct) and flower like this:
>
> struct flower_key {
>         uint32_t other_members[5]
>         uint8_t other_members[3]
>         uint8_t some_member;
> }
>
> struct rewrite {
>         struct flower_key key;
>         struct flower_key mask;
> }
>
> struct flower {
>         uint32_t other_members[10]
>         struct rewrite;
> }
>
> member_offsetof(flower.rewrite) = 40
> sizeof(flower_key) = 24
> sizeof(rewrite) = 48
> sizeof(flower) = 88
>
> here sizeof(rewrite) % 4 == 0 and member_offsetof(flower.rewrite) % 4 == 0
> so it won't fail. but writing uint32_t to offsetof(rewrite.mask.some_member)
> will overflow by 3 bytes.
>
>
> This  BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) +
> sizeof(flower.rewrite) + sizeof(uint32_t) - 2 < sizeof(struct flower))
> will catch this (40 + 48 + 4 - 2 = 90 < 88)
>
> moving rewrite back to a safe position will pass:
>
> struct flower {
>         uint32_t other_members[10]
>         struct rewrite;
>         uint8_t other_members[3]
> }
>
> 40 + 48 + 4 - 2 = 90 < 91
>
> any less padding from other_members and we will fail.

OK, thanks for the thorough explanation. I was under the assumption
that the writes would only ever access on 32-bit boundaries.

Patch

diff --git a/lib/tc.c b/lib/tc.c
index 5c36d0d..4cece17 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -21,8 +21,10 @@ 
 #include <errno.h>
 #include <linux/if_ether.h>
 #include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_csum.h>
 #include <linux/tc_act/tc_gact.h>
 #include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_pedit.h>
 #include <linux/tc_act/tc_tunnel_key.h>
 #include <linux/tc_act/tc_vlan.h>
 #include <linux/gen_stats.h>
@@ -33,11 +35,14 @@ 
 #include "netlink-socket.h"
 #include "netlink.h"
 #include "openvswitch/ofpbuf.h"
+#include "openvswitch/util.h"
 #include "openvswitch/vlog.h"
 #include "packets.h"
 #include "timeval.h"
 #include "unaligned.h"
 
+#define MAX_PEDIT_OFFSETS 8
+
 VLOG_DEFINE_THIS_MODULE(tc);
 
 static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
@@ -50,6 +55,80 @@  enum tc_offload_policy {
 
 static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
 
+struct tc_pedit_key_ex {
+    enum pedit_header_type htype;
+    enum pedit_cmd cmd;
+};
+
+struct flower_key_to_pedit {
+    enum pedit_header_type htype;
+    int offset;
+    int size;
+};
+
+static struct flower_key_to_pedit flower_pedit_map[] = {
+    [offsetof(struct tc_flower_key, ipv4.ipv4_src)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        12,
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
+    },
+    [offsetof(struct tc_flower_key, ipv4.ipv4_dst)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        16,
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
+    },
+    [offsetof(struct tc_flower_key, ipv4.rewrite_ttl)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        8,
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
+    },
+    [offsetof(struct tc_flower_key, ipv6.ipv6_src)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
+        8,
+        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
+    },
+    [offsetof(struct tc_flower_key, ipv6.ipv6_dst)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
+        24,
+        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
+    },
+    [offsetof(struct tc_flower_key, src_mac)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        6,
+        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
+    },
+    [offsetof(struct tc_flower_key, dst_mac)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        0,
+        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
+    },
+    [offsetof(struct tc_flower_key, eth_type)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        12,
+        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
+    },
+    [offsetof(struct tc_flower_key, tcp_src)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
+        0,
+        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
+    },
+    [offsetof(struct tc_flower_key, tcp_dst)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
+        2,
+        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
+    },
+    [offsetof(struct tc_flower_key, udp_src)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
+        0,
+        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
+    },
+    [offsetof(struct tc_flower_key, udp_dst)] = {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
+        2,
+        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
+    },
+};
+
 struct tcmsg *
 tc_make_request(int ifindex, int type, unsigned int flags,
                 struct ofpbuf *request)
@@ -346,6 +425,96 @@  nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
     }
 }
 
+static const struct nl_policy pedit_policy[] = {
+            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
+                                     .min_len = sizeof(struct tc_pedit),
+                                     .optional = false, },
+            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
+                                      .optional = false, },
+};
+
+static int
+nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
+{
+    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
+    const struct tc_pedit *pe;
+    const struct tc_pedit_key *keys;
+    const struct nlattr *nla, *keys_ex, *ex_type;
+    const void *keys_attr;
+    char *rewrite_key = (void *) &flower->rewrite.key;
+    char *rewrite_mask = (void *) &flower->rewrite.mask;
+    size_t keys_ex_size, left;
+    int type, i = 0;
+
+    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
+                         ARRAY_SIZE(pedit_policy))) {
+        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
+        return EPROTO;
+    }
+
+    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
+    keys = pe->keys;
+    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
+    keys_ex = nl_attr_get(keys_attr);
+    keys_ex_size = nl_attr_get_size(keys_attr);
+
+    NL_ATTR_FOR_EACH(nla, left, keys_ex, keys_ex_size) {
+        if (i >= pe->nkeys) {
+            break;
+        }
+
+        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
+            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
+            type = nl_attr_get_u16(ex_type);
+
+            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
+                struct flower_key_to_pedit *m = &flower_pedit_map[j];
+                int flower_off = j;
+                int sz = m->size;
+                int mf = m->offset;
+
+                if (!sz || m->htype != type) {
+                   continue;
+                }
+
+                /* check overlap between current pedit key, which is always
+                 * 4 bytes (range [off, off + 3]), and a map entry in
+                 * flower_pedit_map (range [mf, mf + sz - 1]) */
+                if ((keys->off >= mf && keys->off < mf + sz)
+                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
+                    int diff = flower_off + (keys->off - mf);
+                    uint32_t *dst = (void *) (rewrite_key + diff);
+                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
+                    uint32_t mask = ~(keys->mask);
+                    uint32_t zero_bits;
+
+                    if (keys->off < mf) {
+                        zero_bits = 8 * (mf - keys->off);
+                        mask &= UINT32_MAX << zero_bits;
+                    } else if (keys->off + 4 > mf + m->size) {
+                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
+                        mask &= UINT32_MAX >> zero_bits;
+                    }
+
+                    *dst_m |= mask;
+                    *dst |= keys->val & mask;
+                }
+            }
+        } else {
+            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
+                        nl_attr_type(nla));
+            return EOPNOTSUPP;
+        }
+
+        keys++;
+        i++;
+    }
+
+    flower->rewrite.rewrite = true;
+
+    return 0;
+}
+
 static const struct nl_policy tunnel_key_policy[] = {
     [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
                                .min_len = sizeof(struct tc_tunnel_key),
@@ -589,6 +758,10 @@  nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
         nl_parse_act_vlan(act_options, flower);
     } else if (!strcmp(act_kind, "tunnel_key")) {
         nl_parse_act_tunnel_key(act_options, flower);
+    } else if (!strcmp(act_kind, "pedit")) {
+        nl_parse_act_pedit(act_options, flower);
+    } else if (!strcmp(act_kind, "csum")) {
+        /* not doing anything for now */
     } else {
         VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
         return EINVAL;
@@ -790,6 +963,48 @@  tc_get_tc_cls_policy(enum tc_offload_policy policy)
 }
 
 static void
+nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
+{
+    size_t offset;
+
+    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
+    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
+    {
+        struct tc_csum parm = { .action = TC_ACT_PIPE,
+                                .update_flags = flags };
+
+        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
+    }
+    nl_msg_end_nested(request, offset);
+}
+
+static void
+nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
+                     struct tc_pedit_key_ex *ex)
+{
+    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
+    size_t offset, offset_keys_ex, offset_key;
+    int i;
+
+    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
+    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
+    {
+        parm->action = TC_ACT_PIPE;
+
+        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
+        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
+        for (i = 0; i < parm->nkeys; i++, ex++) {
+            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
+            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
+            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
+            nl_msg_end_nested(request, offset_key);
+        }
+        nl_msg_end_nested(request, offset_keys_ex);
+    }
+    nl_msg_end_nested(request, offset);
+}
+
+static void
 nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
 {
     size_t offset;
@@ -911,7 +1126,129 @@  nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
     }
 }
 
+/* Given flower, a key_to_pedit map entry and src_offset, calculates the rest,
+ * where:
+ *
+ * src_offset - offset in tc_flower_key of where to read the value.
+ * mask, data - pointers of where read the first word of flower->key/mask.
+ * current_offset - which offset to use for the first pedit action.
+ * cnt - max pedits actions to use.
+ * first_word_mask/last_word_mask - the mask to use for the first/last read
+ * (as we read entire words). */
 static void
+calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
+             int src_offset, int *cur_offset, int *cnt,
+             uint32_t *last_word_mask, uint32_t *first_word_mask,
+             uint32_t **mask, uint32_t **data)
+{
+    int start_offset, max_offset, total_size;
+    int diff, right_zero_bits, left_zero_bits;
+    char *rewrite_key = (void *) &flower->rewrite.key;
+    char *rewrite_mask = (void *) &flower->rewrite.mask;
+
+    max_offset = m->offset + m->size;
+    start_offset = ROUND_DOWN(m->offset, 4);
+    diff = m->offset - start_offset;
+    total_size = max_offset - start_offset;
+    right_zero_bits = 8 * (4 - (max_offset % 4));
+    left_zero_bits = 8 * (m->offset - start_offset);
+
+    *cur_offset = start_offset;
+    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
+    *last_word_mask = UINT32_MAX >> right_zero_bits;
+    *first_word_mask = UINT32_MAX << left_zero_bits;
+    *data = (void *) (rewrite_key + src_offset - diff);
+    *mask = (void *) (rewrite_mask + src_offset - diff);
+}
+
+static inline void
+csum_update_flag(struct tc_flower *flower,
+                 enum pedit_header_type htype) {
+    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
+        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
+    }
+    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
+        if (flower->key.ip_proto == IPPROTO_TCP) {
+            flower->mask.ip_proto = UINT8_MAX;
+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
+        } else if (flower->key.ip_proto == IPPROTO_UDP) {
+            flower->mask.ip_proto = UINT8_MAX;
+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
+        } else if (flower->key.ip_proto == IPPROTO_ICMP
+                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
+            flower->mask.ip_proto = UINT8_MAX;
+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
+        }
+    }
+}
+
+static int
+nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
+                                 struct tc_flower *flower)
+{
+    struct {
+        struct tc_pedit sel;
+        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
+        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
+    } sel = {
+        .sel = {
+            .nkeys = 0
+        }
+    };
+    int i, j;
+
+    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
+        struct flower_key_to_pedit *m = &flower_pedit_map[i];
+        struct tc_pedit_key *pedit_key = NULL;
+        struct tc_pedit_key_ex *pedit_key_ex = NULL;
+        uint32_t *mask, *data, first_word_mask, last_word_mask;
+        int cnt = 0, cur_offset = 0;
+
+        if (!m->size) {
+            continue;
+        }
+
+        calc_offsets(flower, m, i, &cur_offset, &cnt, &last_word_mask,
+                     &first_word_mask, &mask, &data);
+
+        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
+            uint32_t mask_word = *mask;
+
+            if (j == 0) {
+                mask_word &= first_word_mask;
+            }
+            if (j == cnt - 1) {
+                mask_word &= last_word_mask;
+            }
+            if (!mask_word) {
+                continue;
+            }
+            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
+                VLOG_ERR_RL(&error_rl, "reached too many pedit offsets: %d",
+                            MAX_PEDIT_OFFSETS);
+                return EOPNOTSUPP;
+            }
+
+            pedit_key = &sel.keys[sel.sel.nkeys];
+            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
+            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
+            pedit_key_ex->htype = m->htype;
+            pedit_key->off = cur_offset;
+            pedit_key->mask = ~mask_word;
+            pedit_key->val = *data & mask_word;
+            sel.sel.nkeys++;
+            csum_update_flag(flower, m->htype);
+        }
+    }
+    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
+
+    return 0;
+}
+
+static int
 nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
 {
     size_t offset;
@@ -920,7 +1257,20 @@  nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
     offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
     {
         uint16_t act_index = 1;
+        int error;
+
+        if (flower->rewrite.rewrite) {
+            act_offset = nl_msg_start_nested(request, act_index++);
+            error = nl_msg_put_flower_rewrite_pedits(request, flower);
+            if (error) {
+                return error;
+            }
+            nl_msg_end_nested(request, act_offset);
 
+            act_offset = nl_msg_start_nested(request, act_index++);
+            nl_msg_put_act_csum(request, flower->csum_update_flags);
+            nl_msg_end_nested(request, act_offset);
+        }
         if (flower->set.set) {
             act_offset = nl_msg_start_nested(request, act_index++);
             nl_msg_put_act_tunnel_key_set(request, flower->set.id,
@@ -961,6 +1311,8 @@  nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
         }
     }
     nl_msg_end_nested(request, offset);
+
+    return 0;
 }
 
 static void
@@ -1002,11 +1354,19 @@  nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
     nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
                             &flower->mask.member, sizeof flower->key.member)
 
-static void
+static int
 nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
 {
+
     uint16_t host_eth_type = ntohs(flower->key.eth_type);
     bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
+    int err;
+
+    /* need to parse acts first as some acts require changing the matching */
+    err  = nl_msg_put_flower_acts(request, flower);
+    if (err) {
+        return err;
+    }
 
     if (is_vlan) {
         host_eth_type = ntohs(flower->key.encap_eth_type);
@@ -1062,7 +1422,7 @@  nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
         nl_msg_put_flower_tunnel(request, flower);
     }
 
-    nl_msg_put_flower_acts(request, flower);
+    return 0;
 }
 
 int
@@ -1085,11 +1445,17 @@  tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
     nl_msg_put_string(&request, TCA_KIND, "flower");
     basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
     {
-        nl_msg_put_flower_options(&request, flower);
+        error = nl_msg_put_flower_options(&request, flower);
+
+        if (error) {
+            ofpbuf_uninit(&request);
+            return error;
+        }
     }
     nl_msg_end_nested(&request, basic_offset);
 
     error = tc_transact(&request, &reply);
+
     if (!error) {
         struct tcmsg *tc =
             ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
diff --git a/lib/tc.h b/lib/tc.h
index 5f363d0..2269a22 100644
--- a/lib/tc.h
+++ b/lib/tc.h
@@ -93,6 +93,7 @@  struct tc_flower_key {
     struct {
         ovs_be32 ipv4_src;
         ovs_be32 ipv4_dst;
+        uint8_t rewrite_ttl;
     } ipv4;
     struct {
         struct in6_addr ipv6_src;
@@ -117,6 +118,17 @@  struct tc_flower {
     uint64_t lastused;
 
     struct {
+        bool rewrite;
+        uint8_t pad1[3];
+        struct tc_flower_key key;
+        uint8_t pad2[3];
+        struct tc_flower_key mask;
+        uint8_t pad3[3];
+    } rewrite;
+
+    uint32_t csum_update_flags;
+
+    struct {
         bool set;
         ovs_be64 id;
         ovs_be16 tp_src;