diff mbox series

[ovs-dev,V2,3/4] tc: Add header rewrite using tc pedit action

Message ID 1505708164-10270-4-git-send-email-roid@mellanox.com
State Changes Requested
Headers show
Series Add offload support for action set | expand

Commit Message

Roi Dayan Sept. 18, 2017, 4:16 a.m. UTC
From: Paul Blakey <paulb@mellanox.com>

To be later used to implement ovs action set offloading.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
---
 lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/tc.h |  16 +++
 2 files changed, 385 insertions(+), 3 deletions(-)

Comments

Simon Horman Sept. 18, 2017, 3:01 p.m. UTC | #1
On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
> From: Paul Blakey <paulb@mellanox.com>
> 
> To be later used to implement ovs action set offloading.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
> ---
>  lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  lib/tc.h |  16 +++
>  2 files changed, 385 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/tc.c b/lib/tc.c
> index c9cada2..743b2ee 100644
> --- a/lib/tc.c
> +++ b/lib/tc.c
> @@ -21,8 +21,10 @@
>  #include <errno.h>
>  #include <linux/if_ether.h>
>  #include <linux/rtnetlink.h>
> +#include <linux/tc_act/tc_csum.h>
>  #include <linux/tc_act/tc_gact.h>
>  #include <linux/tc_act/tc_mirred.h>
> +#include <linux/tc_act/tc_pedit.h>
>  #include <linux/tc_act/tc_tunnel_key.h>
>  #include <linux/tc_act/tc_vlan.h>
>  #include <linux/gen_stats.h>
> @@ -33,11 +35,14 @@
>  #include "netlink-socket.h"
>  #include "netlink.h"
>  #include "openvswitch/ofpbuf.h"
> +#include "openvswitch/util.h"
>  #include "openvswitch/vlog.h"
>  #include "packets.h"
>  #include "timeval.h"
>  #include "unaligned.h"
>  
> +#define MAX_PEDIT_OFFSETS 8

Why 8?

> +
>  VLOG_DEFINE_THIS_MODULE(tc);
>  
>  static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
> @@ -50,6 +55,82 @@ enum tc_offload_policy {
>  
>  static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
>  
> +struct tc_pedit_key_ex {
> +    enum pedit_header_type htype;
> +    enum pedit_cmd cmd;
> +};
> +
> +struct flower_key_to_pedit {
> +    enum pedit_header_type htype;
> +    int flower_offset;
> +    int offset;
> +    int size;
> +};
> +
> +static struct flower_key_to_pedit flower_pedit_map[] = {
> +    {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> +        12,
> +        offsetof(struct tc_flower_key, ipv4.ipv4_src),
> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> +        16,
> +        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> +        8,
> +        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
> +        8,
> +        offsetof(struct tc_flower_key, ipv6.ipv6_src),
> +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
> +        24,
> +        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
> +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> +        6,
> +        offsetof(struct tc_flower_key, src_mac),
> +        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> +        0,
> +        offsetof(struct tc_flower_key, dst_mac),
> +        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> +        12,
> +        offsetof(struct tc_flower_key, eth_type),
> +        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
> +        0,
> +        offsetof(struct tc_flower_key, tcp_src),
> +        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
> +        2,
> +        offsetof(struct tc_flower_key, tcp_dst),
> +        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
> +        0,
> +        offsetof(struct tc_flower_key, udp_src),
> +        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
> +    }, {
> +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
> +        2,
> +        offsetof(struct tc_flower_key, udp_dst),
> +        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
> +    },
> +};
> +
>  struct tcmsg *
>  tc_make_request(int ifindex, int type, unsigned int flags,
>                  struct ofpbuf *request)
> @@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
>      }
>  }
>  
> +static const struct nl_policy pedit_policy[] = {
> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
> +                                     .min_len = sizeof(struct tc_pedit),
> +                                     .optional = false, },
> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
> +                                      .optional = false, },
> +};
> +
> +static int
> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
> +{
> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
> +    const struct tc_pedit *pe;
> +    const struct tc_pedit_key *keys;
> +    const struct nlattr *nla, *keys_ex, *ex_type;
> +    const void *keys_attr;
> +    char *rewrite_key = (void *) &flower->rewrite.key;
> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
> +    size_t keys_ex_size, left;
> +    int type, i = 0;
> +
> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
> +                         ARRAY_SIZE(pedit_policy))) {
> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
> +        return EPROTO;
> +    }
> +
> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
> +    keys = pe->keys;
> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
> +    keys_ex = nl_attr_get(keys_attr);
> +    keys_ex_size = nl_attr_get_size(keys_attr);
> +
> +    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
> +        if (i >= pe->nkeys) {
> +            break;
> +        }
> +
> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
> +            type = nl_attr_get_u16(ex_type);
> +
> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
> +                int flower_off = m->flower_offset;
> +                int sz = m->size;
> +                int mf = m->offset;
> +
> +                if (m->htype != type) {
> +                   continue;
> +                }
> +
> +                /* check overlap between current pedit key, which is always
> +                 * 4 bytes (range [off, off + 3]), and a map entry in
> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
> +                if ((keys->off >= mf && keys->off < mf + sz)
> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
> +                    int diff = flower_off + (keys->off - mf);
> +                    uint32_t *dst = (void *) (rewrite_key + diff);
> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
> +                    uint32_t mask = ~(keys->mask);
> +                    uint32_t zero_bits;
> +
> +                    if (keys->off < mf) {
> +                        zero_bits = 8 * (mf - keys->off);
> +                        mask &= UINT32_MAX << zero_bits;
> +                    } else if (keys->off + 4 > mf + m->size) {
> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
> +                        mask &= UINT32_MAX >> zero_bits;
> +                    }
> +
> +                    *dst_m |= mask;
> +                    *dst |= keys->val & mask;
> +                }
> +            }

If I understand the above correctly it is designed to make
pedit actions disjoint. If so, why is that necessary?

> +        } else {
> +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
> +                        nl_attr_type(nla));
> +            return EOPNOTSUPP;
> +        }

I think the code could exit early here as
nl_msg_put_flower_rewrite_pedits() does below.


> +
> +        keys++;
> +        i++;
> +    }
> +
> +    flower->rewrite.rewrite = true;
> +
> +    return 0;
> +}
> +
>  static const struct nl_policy tunnel_key_policy[] = {
>      [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
>                                 .min_len = sizeof(struct tc_tunnel_key),
> @@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
>          nl_parse_act_vlan(act_options, flower);
>      } else if (!strcmp(act_kind, "tunnel_key")) {
>          nl_parse_act_tunnel_key(act_options, flower);
> +    } else if (!strcmp(act_kind, "pedit")) {
> +        nl_parse_act_pedit(act_options, flower);
> +    } else if (!strcmp(act_kind, "csum")) {
> +        /* not doing anything for now, ovs has an implicit csum recalculation
> +         * with rewriting of packet headers (translating of pedit acts). */

I wonder if the absence of a csum action when needed (by TC)
should be treated as an error.

>      } else {
>          VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
>          return EINVAL;
> @@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
>  }
>  
>  static void
> +nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
> +{
> +    size_t offset;
> +
> +    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> +    {
> +        struct tc_csum parm = { .action = TC_ACT_PIPE,
> +                                .update_flags = flags };
> +
> +        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
> +    }
> +    nl_msg_end_nested(request, offset);
> +}
> +
> +static void
> +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
> +                     struct tc_pedit_key_ex *ex)
> +{
> +    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));

Are there unnecessary () on the line above?

> +    size_t offset, offset_keys_ex, offset_key;
> +    int i;
> +
> +    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> +    {
> +        parm->action = TC_ACT_PIPE;
> +
> +        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
> +        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
> +        for (i = 0; i < parm->nkeys; i++, ex++) {
> +            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
> +            nl_msg_end_nested(request, offset_key);
> +        }
> +        nl_msg_end_nested(request, offset_keys_ex);
> +    }
> +    nl_msg_end_nested(request, offset);
> +}
> +
> +static void
>  nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
>  {
>      size_t offset;
> @@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
>      }
>  }
>  
> +/* Given flower, a key_to_pedit map entry, calculates the rest,
> + * where:
> + *
> + * mask, data - pointers of where read the first word of flower->key/mask.
> + * current_offset - which offset to use for the first pedit action.
> + * cnt - max pedits actions to use.
> + * first_word_mask/last_word_mask - the mask to use for the first/last read
> + * (as we read entire words). */
>  static void
> +calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
> +             int *cur_offset, int *cnt, uint32_t *last_word_mask,
> +             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
> +{
> +    int start_offset, max_offset, total_size;
> +    int diff, right_zero_bits, left_zero_bits;
> +    char *rewrite_key = (void *) &flower->rewrite.key;
> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
> +
> +    max_offset = m->offset + m->size;
> +    start_offset = ROUND_DOWN(m->offset, 4);
> +    diff = m->offset - start_offset;
> +    total_size = max_offset - start_offset;
> +    right_zero_bits = 8 * (4 - (max_offset % 4));
> +    left_zero_bits = 8 * (m->offset - start_offset);
> +
> +    *cur_offset = start_offset;
> +    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
> +    *last_word_mask = UINT32_MAX >> right_zero_bits;
> +    *first_word_mask = UINT32_MAX << left_zero_bits;
> +    *data = (void *) (rewrite_key + m->flower_offset - diff);
> +    *mask = (void *) (rewrite_mask + m->flower_offset - diff);

The type of *data and *mask is uint32_t *.
Why not cast to that type?

> +}
> +
> +static inline void
> +csum_update_flag(struct tc_flower *flower,
> +                 enum pedit_header_type htype) {

I think the above two lines could be one.

> +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {

A case statement might be nicer here.

> +        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
> +    }
> +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
> +        if (flower->key.ip_proto == IPPROTO_TCP) {
> +            flower->mask.ip_proto = UINT8_MAX;

What if the mask was not UINT8_MAX to start with?
Doesn't this create a different flow?

> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
> +        } else if (flower->key.ip_proto == IPPROTO_UDP) {
> +            flower->mask.ip_proto = UINT8_MAX;
> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
> +        } else if (flower->key.ip_proto == IPPROTO_ICMP
> +                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
> +            flower->mask.ip_proto = UINT8_MAX;
> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
> +        }
> +    }
> +}
> +
> +static int
> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
> +                                 struct tc_flower *flower)
> +{
> +    struct {
> +        struct tc_pedit sel;
> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
> +    } sel = {
> +        .sel = {
> +            .nkeys = 0
> +        }
> +    };
> +    int i, j;
> +
> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
> +        struct tc_pedit_key *pedit_key = NULL;
> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
> +        int cnt = 0, cur_offset = 0;
> +
> +        if (!m->size) {
> +            continue;
> +        }
> +
> +        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
> +                     &first_word_mask, &mask, &data);
> +
> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
> +            uint32_t mask_word = *mask;
> +
> +            if (j == 0) {
> +                mask_word &= first_word_mask;
> +            }
> +            if (j == cnt - 1) {
> +                mask_word &= last_word_mask;
> +            }
> +            if (!mask_word) {
> +                continue;
> +            }
> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
> +                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
> +                             MAX_PEDIT_OFFSETS);
> +                return EOPNOTSUPP;
> +            }
> +
> +            pedit_key = &sel.keys[sel.sel.nkeys];
> +            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
> +            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
> +            pedit_key_ex->htype = m->htype;
> +            pedit_key->off = cur_offset;
> +            pedit_key->mask = ~mask_word;
> +            pedit_key->val = *data & mask_word;
> +            sel.sel.nkeys++;
> +            csum_update_flag(flower, m->htype);
> +        }
> +    }
> +    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
> +
> +    return 0;
> +}
> +
> +static int
>  nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>  {
>      size_t offset;
> @@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>      offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
>      {
>          uint16_t act_index = 1;
> +        int error;
>  
> +        if (flower->rewrite.rewrite) {
> +            act_offset = nl_msg_start_nested(request, act_index++);
> +            error = nl_msg_put_flower_rewrite_pedits(request, flower);
> +            if (error) {
> +                return error;
> +            }
> +            nl_msg_end_nested(request, act_offset);
> +
> +            act_offset = nl_msg_start_nested(request, act_index++);
> +            nl_msg_put_act_csum(request, flower->csum_update_flags);
> +            nl_msg_end_nested(request, act_offset);
> +        }
>          if (flower->set.set) {
>              act_offset = nl_msg_start_nested(request, act_index++);
>              nl_msg_put_act_tunnel_key_set(request, flower->set.id,
> @@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>          }
>      }
>      nl_msg_end_nested(request, offset);
> +
> +    return 0;
>  }
>  
>  static void
> @@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
>      nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
>                              &flower->mask.member, sizeof flower->key.member)
>  
> -static void
> +static int
>  nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>  {
> +
>      uint16_t host_eth_type = ntohs(flower->key.eth_type);
>      bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
> +    int err;
> +
> +    /* need to parse acts first as some acts require changing the matching */

This seems strange to me.

> +    err  = nl_msg_put_flower_acts(request, flower);
> +    if (err) {
> +        return err;
> +    }
>  
>      if (is_vlan) {
>          host_eth_type = ntohs(flower->key.encap_eth_type);
> @@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>          nl_msg_put_flower_tunnel(request, flower);
>      }
>  
> -    nl_msg_put_flower_acts(request, flower);
> +    return 0;
>  }
>  
>  int
> @@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>      nl_msg_put_string(&request, TCA_KIND, "flower");
>      basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>      {
> -        nl_msg_put_flower_options(&request, flower);
> +        error = nl_msg_put_flower_options(&request, flower);
> +
> +        if (error) {
> +            ofpbuf_uninit(&request);
> +            return error;
> +        }
>      }
>      nl_msg_end_nested(&request, basic_offset);
>  
> diff --git a/lib/tc.h b/lib/tc.h
> index 6c69b79..7876051 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -96,6 +96,7 @@ struct tc_flower_key {
>      struct {
>          ovs_be32 ipv4_src;
>          ovs_be32 ipv4_dst;
> +        uint8_t rewrite_ttl;
>      } ipv4;
>      struct {
>          struct in6_addr ipv6_src;
> @@ -120,6 +121,14 @@ struct tc_flower {
>      uint64_t lastused;
>  
>      struct {
> +        bool rewrite;
> +        struct tc_flower_key key;
> +        struct tc_flower_key mask;
> +    } rewrite;
> +
> +    uint32_t csum_update_flags;
> +
> +    struct {
>          bool set;
>          ovs_be64 id;
>          ovs_be16 tp_src;
> @@ -152,6 +161,13 @@ struct tc_flower {
>      struct tc_cookie act_cookie;
>  };
>  
> +/* assert that if we overflow with a masked write of uint32_t to the last byte
> + * of flower.rewrite we overflow inside struct flower.
> + * shouldn't happen unless someone moves rewrite to the end of flower */
> +BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
> +                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
> +                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
> +
>  int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>                        struct tc_flower *flower);
>  int tc_del_filter(int ifindex, int prio, int handle);
> -- 
> 2.7.5
>
Paul Blakey Sept. 25, 2017, 1:31 p.m. UTC | #2
On 18/09/2017 18:01, Simon Horman wrote:
> On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
>> From: Paul Blakey <paulb@mellanox.com>
>>
>> To be later used to implement ovs action set offloading.
>>
>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>> ---
>>   lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   lib/tc.h |  16 +++
>>   2 files changed, 385 insertions(+), 3 deletions(-)
>>
>> diff --git a/lib/tc.c b/lib/tc.c
>> index c9cada2..743b2ee 100644
>> --- a/lib/tc.c
>> +++ b/lib/tc.c
>> @@ -21,8 +21,10 @@
>>   #include <errno.h>
>>   #include <linux/if_ether.h>
>>   #include <linux/rtnetlink.h>
>> +#include <linux/tc_act/tc_csum.h>
>>   #include <linux/tc_act/tc_gact.h>
>>   #include <linux/tc_act/tc_mirred.h>
>> +#include <linux/tc_act/tc_pedit.h>
>>   #include <linux/tc_act/tc_tunnel_key.h>
>>   #include <linux/tc_act/tc_vlan.h>
>>   #include <linux/gen_stats.h>
>> @@ -33,11 +35,14 @@
>>   #include "netlink-socket.h"
>>   #include "netlink.h"
>>   #include "openvswitch/ofpbuf.h"
>> +#include "openvswitch/util.h"
>>   #include "openvswitch/vlog.h"
>>   #include "packets.h"
>>   #include "timeval.h"
>>   #include "unaligned.h"
>>   
>> +#define MAX_PEDIT_OFFSETS 8
> 
> Why 8?
We don't expect anything more right now (ipv6 src/dst rewrite requires 8 
pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if 
that's makes sens. do you suggest we increase it? to what?

> 
>> +
>>   VLOG_DEFINE_THIS_MODULE(tc);
>>   
>>   static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
>> @@ -50,6 +55,82 @@ enum tc_offload_policy {
>>   
>>   static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
>>   
>> +struct tc_pedit_key_ex {
>> +    enum pedit_header_type htype;
>> +    enum pedit_cmd cmd;
>> +};
>> +
>> +struct flower_key_to_pedit {
>> +    enum pedit_header_type htype;
>> +    int flower_offset;
>> +    int offset;
>> +    int size;
>> +};
>> +
>> +static struct flower_key_to_pedit flower_pedit_map[] = {
>> +    {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>> +        12,
>> +        offsetof(struct tc_flower_key, ipv4.ipv4_src),
>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>> +        16,
>> +        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>> +        8,
>> +        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
>> +        8,
>> +        offsetof(struct tc_flower_key, ipv6.ipv6_src),
>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
>> +        24,
>> +        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>> +        6,
>> +        offsetof(struct tc_flower_key, src_mac),
>> +        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>> +        0,
>> +        offsetof(struct tc_flower_key, dst_mac),
>> +        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>> +        12,
>> +        offsetof(struct tc_flower_key, eth_type),
>> +        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
>> +        0,
>> +        offsetof(struct tc_flower_key, tcp_src),
>> +        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
>> +        2,
>> +        offsetof(struct tc_flower_key, tcp_dst),
>> +        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
>> +        0,
>> +        offsetof(struct tc_flower_key, udp_src),
>> +        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
>> +    }, {
>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
>> +        2,
>> +        offsetof(struct tc_flower_key, udp_dst),
>> +        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
>> +    },
>> +};
>> +
>>   struct tcmsg *
>>   tc_make_request(int ifindex, int type, unsigned int flags,
>>                   struct ofpbuf *request)
>> @@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
>>       }
>>   }
>>   
>> +static const struct nl_policy pedit_policy[] = {
>> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
>> +                                     .min_len = sizeof(struct tc_pedit),
>> +                                     .optional = false, },
>> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
>> +                                      .optional = false, },
>> +};
>> +
>> +static int
>> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
>> +{
>> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
>> +    const struct tc_pedit *pe;
>> +    const struct tc_pedit_key *keys;
>> +    const struct nlattr *nla, *keys_ex, *ex_type;
>> +    const void *keys_attr;
>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>> +    size_t keys_ex_size, left;
>> +    int type, i = 0;
>> +
>> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
>> +                         ARRAY_SIZE(pedit_policy))) {
>> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
>> +        return EPROTO;
>> +    }
>> +
>> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
>> +    keys = pe->keys;
>> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
>> +    keys_ex = nl_attr_get(keys_attr);
>> +    keys_ex_size = nl_attr_get_size(keys_attr);
>> +
>> +    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
>> +        if (i >= pe->nkeys) {
>> +            break;
>> +        }
>> +
>> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
>> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
>> +            type = nl_attr_get_u16(ex_type);
>> +
>> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
>> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
>> +                int flower_off = m->flower_offset;
>> +                int sz = m->size;
>> +                int mf = m->offset;
>> +
>> +                if (m->htype != type) {
>> +                   continue;
>> +                }
>> +
>> +                /* check overlap between current pedit key, which is always
>> +                 * 4 bytes (range [off, off + 3]), and a map entry in
>> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
>> +                if ((keys->off >= mf && keys->off < mf + sz)
>> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
>> +                    int diff = flower_off + (keys->off - mf);
>> +                    uint32_t *dst = (void *) (rewrite_key + diff);
>> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
>> +                    uint32_t mask = ~(keys->mask);
>> +                    uint32_t zero_bits;
>> +
>> +                    if (keys->off < mf) {
>> +                        zero_bits = 8 * (mf - keys->off);
>> +                        mask &= UINT32_MAX << zero_bits;
>> +                    } else if (keys->off + 4 > mf + m->size) {
>> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
>> +                        mask &= UINT32_MAX >> zero_bits;
>> +                    }
>> +
>> +                    *dst_m |= mask;
>> +                    *dst |= keys->val & mask;
>> +                }
>> +            }
> 
> If I understand the above correctly it is designed to make
> pedit actions disjoint. If so, why is that necessary? >

It's not, as a single flower key rewrite can be split to multiple pedit 
actions it finds the overlap between a flower key and a pedit action, 
if they do overlap it translates it to the correct offset and masks it out.

>> +        } else {
>> +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
>> +                        nl_attr_type(nla));
>> +            return EOPNOTSUPP;
>> +        }
> 
> I think the code could exit early here as
> nl_msg_put_flower_rewrite_pedits() does below.
> 

Sorry, didn't understand. can you give an example?

> 
>> +
>> +        keys++;
>> +        i++;
>> +    }
>> +
>> +    flower->rewrite.rewrite = true;
>> +
>> +    return 0;
>> +}
>> +
>>   static const struct nl_policy tunnel_key_policy[] = {
>>       [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
>>                                  .min_len = sizeof(struct tc_tunnel_key),
>> @@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
>>           nl_parse_act_vlan(act_options, flower);
>>       } else if (!strcmp(act_kind, "tunnel_key")) {
>>           nl_parse_act_tunnel_key(act_options, flower);
>> +    } else if (!strcmp(act_kind, "pedit")) {
>> +        nl_parse_act_pedit(act_options, flower);
>> +    } else if (!strcmp(act_kind, "csum")) {
>> +        /* not doing anything for now, ovs has an implicit csum recalculation
>> +         * with rewriting of packet headers (translating of pedit acts). */
> 
> I wonder if the absence of a csum action when needed (by TC)
> should be treated as an error >

Do you mean validating csum flags from each protocol and such (like in put)?

>>       } else {
>>           VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
>>           return EINVAL;
>> @@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
>>   }
>>   
>>   static void
>> +nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
>> +{
>> +    size_t offset;
>> +
>> +    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
>> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
>> +    {
>> +        struct tc_csum parm = { .action = TC_ACT_PIPE,
>> +                                .update_flags = flags };
>> +
>> +        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
>> +    }
>> +    nl_msg_end_nested(request, offset);
>> +}
>> +
>> +static void
>> +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
>> +                     struct tc_pedit_key_ex *ex)
>> +{
>> +    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
> 
> Are there unnecessary () on the line above?
No, I'll remove them.

> 
>> +    size_t offset, offset_keys_ex, offset_key;
>> +    int i;
>> +
>> +    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
>> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
>> +    {
>> +        parm->action = TC_ACT_PIPE;
>> +
>> +        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
>> +        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
>> +        for (i = 0; i < parm->nkeys; i++, ex++) {
>> +            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
>> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
>> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
>> +            nl_msg_end_nested(request, offset_key);
>> +        }
>> +        nl_msg_end_nested(request, offset_keys_ex);
>> +    }
>> +    nl_msg_end_nested(request, offset);
>> +}
>> +
>> +static void
>>   nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
>>   {
>>       size_t offset;
>> @@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
>>       }
>>   }
>>   
>> +/* Given flower, a key_to_pedit map entry, calculates the rest,
>> + * where:
>> + *
>> + * mask, data - pointers of where read the first word of flower->key/mask.
>> + * current_offset - which offset to use for the first pedit action.
>> + * cnt - max pedits actions to use.
>> + * first_word_mask/last_word_mask - the mask to use for the first/last read
>> + * (as we read entire words). */
>>   static void
>> +calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
>> +             int *cur_offset, int *cnt, uint32_t *last_word_mask,
>> +             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
>> +{
>> +    int start_offset, max_offset, total_size;
>> +    int diff, right_zero_bits, left_zero_bits;
>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>> +
>> +    max_offset = m->offset + m->size;
>> +    start_offset = ROUND_DOWN(m->offset, 4);
>> +    diff = m->offset - start_offset;
>> +    total_size = max_offset - start_offset;
>> +    right_zero_bits = 8 * (4 - (max_offset % 4));
>> +    left_zero_bits = 8 * (m->offset - start_offset);
>> +
>> +    *cur_offset = start_offset;
>> +    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
>> +    *last_word_mask = UINT32_MAX >> right_zero_bits;
>> +    *first_word_mask = UINT32_MAX << left_zero_bits;
>> +    *data = (void *) (rewrite_key + m->flower_offset - diff);
>> +    *mask = (void *) (rewrite_mask + m->flower_offset - diff);
> 
> The type of *data and *mask is uint32_t *.
> Why not cast to that type?

It's to avoid warning of increasing pointer alignment (-Wcast-align).

> 
>> +}
>> +
>> +static inline void
>> +csum_update_flag(struct tc_flower *flower,
>> +                 enum pedit_header_type htype) {
> 
> I think the above two lines could be one.
> 
>> +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
> 
> A case statement might be nicer here.

Right, thanks.

> 
>> +        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
>> +    }
>> +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
>> +        if (flower->key.ip_proto == IPPROTO_TCP) {
>> +            flower->mask.ip_proto = UINT8_MAX;
> 
> What if the mask was not UINT8_MAX to start with?
> Doesn't this create a different flow?

This is so the checksum will be strict (not setting/recalc udp checksum 
on non udp packets). It creates a more specific flow, a subset of the 
original. This is fine as datapath is free to ignore a mask, which is 
the same as a full mask we've set.

> 
>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
>> +        } else if (flower->key.ip_proto == IPPROTO_UDP) {
>> +            flower->mask.ip_proto = UINT8_MAX;
>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
>> +        } else if (flower->key.ip_proto == IPPROTO_ICMP
>> +                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
>> +            flower->mask.ip_proto = UINT8_MAX;
>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
>> +        }
>> +    }
>> +}
>> +
>> +static int
>> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
>> +                                 struct tc_flower *flower)
>> +{
>> +    struct {
>> +        struct tc_pedit sel;
>> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
>> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
>> +    } sel = {
>> +        .sel = {
>> +            .nkeys = 0
>> +        }
>> +    };
>> +    int i, j;
>> +
>> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
>> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
>> +        struct tc_pedit_key *pedit_key = NULL;
>> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
>> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
>> +        int cnt = 0, cur_offset = 0;
>> +
>> +        if (!m->size) {
>> +            continue;
>> +        }
>> +
>> +        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
>> +                     &first_word_mask, &mask, &data);
>> +
>> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
>> +            uint32_t mask_word = *mask;
>> +
>> +            if (j == 0) {
>> +                mask_word &= first_word_mask;
>> +            }
>> +            if (j == cnt - 1) {
>> +                mask_word &= last_word_mask;
>> +            }
>> +            if (!mask_word) {
>> +                continue;
>> +            }
>> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
>> +                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
>> +                             MAX_PEDIT_OFFSETS);
>> +                return EOPNOTSUPP;
>> +            }
>> +
>> +            pedit_key = &sel.keys[sel.sel.nkeys];
>> +            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
>> +            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
>> +            pedit_key_ex->htype = m->htype;
>> +            pedit_key->off = cur_offset;
>> +            pedit_key->mask = ~mask_word;
>> +            pedit_key->val = *data & mask_word;
>> +            sel.sel.nkeys++;
>> +            csum_update_flag(flower, m->htype);
>> +        }
>> +    }
>> +    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
>> +
>> +    return 0;
>> +}
>> +
>> +static int
>>   nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>   {
>>       size_t offset;
>> @@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>       offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
>>       {
>>           uint16_t act_index = 1;
>> +        int error;
>>   
>> +        if (flower->rewrite.rewrite) {
>> +            act_offset = nl_msg_start_nested(request, act_index++);
>> +            error = nl_msg_put_flower_rewrite_pedits(request, flower);
>> +            if (error) {
>> +                return error;
>> +            }
>> +            nl_msg_end_nested(request, act_offset);
>> +
>> +            act_offset = nl_msg_start_nested(request, act_index++);
>> +            nl_msg_put_act_csum(request, flower->csum_update_flags);
>> +            nl_msg_end_nested(request, act_offset);
>> +        }
>>           if (flower->set.set) {
>>               act_offset = nl_msg_start_nested(request, act_index++);
>>               nl_msg_put_act_tunnel_key_set(request, flower->set.id,
>> @@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>           }
>>       }
>>       nl_msg_end_nested(request, offset);
>> +
>> +    return 0;
>>   }
>>   
>>   static void
>> @@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
>>       nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
>>                               &flower->mask.member, sizeof flower->key.member)
>>   
>> -static void
>> +static int
>>   nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>>   {
>> +
>>       uint16_t host_eth_type = ntohs(flower->key.eth_type);
>>       bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
>> +    int err;
>> +
>> +    /* need to parse acts first as some acts require changing the matching */
> 
> This seems strange to me.

from the strict (normalized?) header checksum.

> 
>> +    err  = nl_msg_put_flower_acts(request, flower);
>> +    if (err) {
>> +        return err;
>> +    }
>>   
>>       if (is_vlan) {
>>           host_eth_type = ntohs(flower->key.encap_eth_type);
>> @@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>>           nl_msg_put_flower_tunnel(request, flower);
>>       }
>>   
>> -    nl_msg_put_flower_acts(request, flower);
>> +    return 0;
>>   }
>>   
>>   int
>> @@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>>       nl_msg_put_string(&request, TCA_KIND, "flower");
>>       basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>>       {
>> -        nl_msg_put_flower_options(&request, flower);
>> +        error = nl_msg_put_flower_options(&request, flower);
>> +
>> +        if (error) {
>> +            ofpbuf_uninit(&request);
>> +            return error;
>> +        }
>>       }
>>       nl_msg_end_nested(&request, basic_offset);
>>   
>> diff --git a/lib/tc.h b/lib/tc.h
>> index 6c69b79..7876051 100644
>> --- a/lib/tc.h
>> +++ b/lib/tc.h
>> @@ -96,6 +96,7 @@ struct tc_flower_key {
>>       struct {
>>           ovs_be32 ipv4_src;
>>           ovs_be32 ipv4_dst;
>> +        uint8_t rewrite_ttl;
>>       } ipv4;
>>       struct {
>>           struct in6_addr ipv6_src;
>> @@ -120,6 +121,14 @@ struct tc_flower {
>>       uint64_t lastused;
>>   
>>       struct {
>> +        bool rewrite;
>> +        struct tc_flower_key key;
>> +        struct tc_flower_key mask;
>> +    } rewrite;
>> +
>> +    uint32_t csum_update_flags;
>> +
>> +    struct {
>>           bool set;
>>           ovs_be64 id;
>>           ovs_be16 tp_src;
>> @@ -152,6 +161,13 @@ struct tc_flower {
>>       struct tc_cookie act_cookie;
>>   };
>>   
>> +/* assert that if we overflow with a masked write of uint32_t to the last byte
>> + * of flower.rewrite we overflow inside struct flower.
>> + * shouldn't happen unless someone moves rewrite to the end of flower */
>> +BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
>> +                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
>> +                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
>> +
>>   int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>>                         struct tc_flower *flower);
>>   int tc_del_filter(int ifindex, int prio, int handle);
>> -- 
>> 2.7.5
>>
Simon Horman Sept. 27, 2017, 9:08 a.m. UTC | #3
On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
> 
> 
> On 18/09/2017 18:01, Simon Horman wrote:
> >On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
> >>From: Paul Blakey <paulb@mellanox.com>
> >>
> >>To be later used to implement ovs action set offloading.
> >>
> >>Signed-off-by: Paul Blakey <paulb@mellanox.com>
> >>Reviewed-by: Roi Dayan <roid@mellanox.com>
> >>---
> >>  lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >>  lib/tc.h |  16 +++
> >>  2 files changed, 385 insertions(+), 3 deletions(-)
> >>
> >>diff --git a/lib/tc.c b/lib/tc.c
> >>index c9cada2..743b2ee 100644
> >>--- a/lib/tc.c
> >>+++ b/lib/tc.c
> >>@@ -21,8 +21,10 @@
> >>  #include <errno.h>
> >>  #include <linux/if_ether.h>
> >>  #include <linux/rtnetlink.h>
> >>+#include <linux/tc_act/tc_csum.h>
> >>  #include <linux/tc_act/tc_gact.h>
> >>  #include <linux/tc_act/tc_mirred.h>
> >>+#include <linux/tc_act/tc_pedit.h>
> >>  #include <linux/tc_act/tc_tunnel_key.h>
> >>  #include <linux/tc_act/tc_vlan.h>
> >>  #include <linux/gen_stats.h>
> >>@@ -33,11 +35,14 @@
> >>  #include "netlink-socket.h"
> >>  #include "netlink.h"
> >>  #include "openvswitch/ofpbuf.h"
> >>+#include "openvswitch/util.h"
> >>  #include "openvswitch/vlog.h"
> >>  #include "packets.h"
> >>  #include "timeval.h"
> >>  #include "unaligned.h"
> >>+#define MAX_PEDIT_OFFSETS 8
> >
> >Why 8?
> We don't expect anything more right now (ipv6 src/dst rewrite requires 8
> pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
> that's makes sens. do you suggest we increase it? to what?

It seems strange to me to place a somewhat arbitrary small limit
when none exists in the pedit API being used. I would at prefer if
it was at least a bigger, say 16 or 32.

> >>  VLOG_DEFINE_THIS_MODULE(tc);
> >>  static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
> >>@@ -50,6 +55,82 @@ enum tc_offload_policy {
> >>  static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
> >>+struct tc_pedit_key_ex {
> >>+    enum pedit_header_type htype;
> >>+    enum pedit_cmd cmd;
> >>+};
> >>+
> >>+struct flower_key_to_pedit {
> >>+    enum pedit_header_type htype;
> >>+    int flower_offset;
> >>+    int offset;
> >>+    int size;
> >>+};
> >>+
> >>+static struct flower_key_to_pedit flower_pedit_map[] = {
> >>+    {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> >>+        12,
> >>+        offsetof(struct tc_flower_key, ipv4.ipv4_src),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> >>+        16,
> >>+        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> >>+        8,
> >>+        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
> >>+        8,
> >>+        offsetof(struct tc_flower_key, ipv6.ipv6_src),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
> >>+        24,
> >>+        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> >>+        6,
> >>+        offsetof(struct tc_flower_key, src_mac),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> >>+        0,
> >>+        offsetof(struct tc_flower_key, dst_mac),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> >>+        12,
> >>+        offsetof(struct tc_flower_key, eth_type),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
> >>+        0,
> >>+        offsetof(struct tc_flower_key, tcp_src),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
> >>+        2,
> >>+        offsetof(struct tc_flower_key, tcp_dst),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
> >>+        0,
> >>+        offsetof(struct tc_flower_key, udp_src),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
> >>+    }, {
> >>+        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
> >>+        2,
> >>+        offsetof(struct tc_flower_key, udp_dst),
> >>+        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
> >>+    },
> >>+};
> >>+
> >>  struct tcmsg *
> >>  tc_make_request(int ifindex, int type, unsigned int flags,
> >>                  struct ofpbuf *request)
> >>@@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
> >>      }
> >>  }
> >>+static const struct nl_policy pedit_policy[] = {
> >>+            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
> >>+                                     .min_len = sizeof(struct tc_pedit),
> >>+                                     .optional = false, },
> >>+            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
> >>+                                      .optional = false, },
> >>+};
> >>+
> >>+static int
> >>+nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
> >>+{
> >>+    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
> >>+    const struct tc_pedit *pe;
> >>+    const struct tc_pedit_key *keys;
> >>+    const struct nlattr *nla, *keys_ex, *ex_type;
> >>+    const void *keys_attr;
> >>+    char *rewrite_key = (void *) &flower->rewrite.key;
> >>+    char *rewrite_mask = (void *) &flower->rewrite.mask;
> >>+    size_t keys_ex_size, left;
> >>+    int type, i = 0;
> >>+
> >>+    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
> >>+                         ARRAY_SIZE(pedit_policy))) {
> >>+        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
> >>+        return EPROTO;
> >>+    }
> >>+
> >>+    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
> >>+    keys = pe->keys;
> >>+    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
> >>+    keys_ex = nl_attr_get(keys_attr);
> >>+    keys_ex_size = nl_attr_get_size(keys_attr);
> >>+
> >>+    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
> >>+        if (i >= pe->nkeys) {
> >>+            break;
> >>+        }
> >>+
> >>+        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
> >>+            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
> >>+            type = nl_attr_get_u16(ex_type);
> >>+
> >>+            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
> >>+                struct flower_key_to_pedit *m = &flower_pedit_map[j];
> >>+                int flower_off = m->flower_offset;
> >>+                int sz = m->size;
> >>+                int mf = m->offset;
> >>+
> >>+                if (m->htype != type) {
> >>+                   continue;
> >>+                }
> >>+
> >>+                /* check overlap between current pedit key, which is always
> >>+                 * 4 bytes (range [off, off + 3]), and a map entry in
> >>+                 * flower_pedit_map (range [mf, mf + sz - 1]) */
> >>+                if ((keys->off >= mf && keys->off < mf + sz)
> >>+                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
> >>+                    int diff = flower_off + (keys->off - mf);
> >>+                    uint32_t *dst = (void *) (rewrite_key + diff);
> >>+                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
> >>+                    uint32_t mask = ~(keys->mask);
> >>+                    uint32_t zero_bits;
> >>+
> >>+                    if (keys->off < mf) {
> >>+                        zero_bits = 8 * (mf - keys->off);
> >>+                        mask &= UINT32_MAX << zero_bits;
> >>+                    } else if (keys->off + 4 > mf + m->size) {
> >>+                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
> >>+                        mask &= UINT32_MAX >> zero_bits;
> >>+                    }
> >>+
> >>+                    *dst_m |= mask;
> >>+                    *dst |= keys->val & mask;
> >>+                }
> >>+            }
> >
> >If I understand the above correctly it is designed to make
> >pedit actions disjoint. If so, why is that necessary? >
> 
> It's not, as a single flower key rewrite can be split to multiple pedit
> actions it finds the overlap between a flower key and a pedit action, if
> they do overlap it translates it to the correct offset and masks it out.

Thanks, understood.

> 
> >>+        } else {
> >>+            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
> >>+                        nl_attr_type(nla));
> >>+            return EOPNOTSUPP;
> >>+        }
> >
> >I think the code could exit early here as
> >nl_msg_put_flower_rewrite_pedits() does below.
> >
> 
> Sorry, didn't understand. can you give an example?
> 

I meant something like this.

            if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
                VLOG_ERR_RL(...);
                return EOPNOTSUPP;
            }

	    /* Overlap detection code goes here */

> >
> >>+
> >>+        keys++;
> >>+        i++;
> >>+    }
> >>+
> >>+    flower->rewrite.rewrite = true;
> >>+
> >>+    return 0;
> >>+}
> >>+
> >>  static const struct nl_policy tunnel_key_policy[] = {
> >>      [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
> >>                                 .min_len = sizeof(struct tc_tunnel_key),
> >>@@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
> >>          nl_parse_act_vlan(act_options, flower);
> >>      } else if (!strcmp(act_kind, "tunnel_key")) {
> >>          nl_parse_act_tunnel_key(act_options, flower);
> >>+    } else if (!strcmp(act_kind, "pedit")) {
> >>+        nl_parse_act_pedit(act_options, flower);
> >>+    } else if (!strcmp(act_kind, "csum")) {
> >>+        /* not doing anything for now, ovs has an implicit csum recalculation
> >>+         * with rewriting of packet headers (translating of pedit acts). */
> >
> >I wonder if the absence of a csum action when needed (by TC)
> >should be treated as an error >
> 
> Do you mean validating csum flags from each protocol and such (like in put)?

Yes, I think so.

In OvS (without TC acceleration) csum recalculation is implicit
but with TC an explicit csum update action is required. I see
in your code you are handling this by adding csum actions to TC actions
when translating from OvS DPIF actions. And in the reverse (above) csum
actions are simply ignored.

What I am wondering is if when translating TC actions to OvS DPIF actions
you should track if the TC actions should include a csum rule because
of other actions and treat its absence as an error.

> 
> >>      } else {
> >>          VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
> >>          return EINVAL;
> >>@@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
> >>  }
> >>  static void
> >>+nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
> >>+{
> >>+    size_t offset;
> >>+
> >>+    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
> >>+    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> >>+    {
> >>+        struct tc_csum parm = { .action = TC_ACT_PIPE,
> >>+                                .update_flags = flags };
> >>+
> >>+        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
> >>+    }
> >>+    nl_msg_end_nested(request, offset);
> >>+}
> >>+
> >>+static void
> >>+nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
> >>+                     struct tc_pedit_key_ex *ex)
> >>+{
> >>+    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
> >
> >Are there unnecessary () on the line above?
> No, I'll remove them.
> 
> >
> >>+    size_t offset, offset_keys_ex, offset_key;
> >>+    int i;
> >>+
> >>+    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
> >>+    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> >>+    {
> >>+        parm->action = TC_ACT_PIPE;
> >>+
> >>+        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
> >>+        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
> >>+        for (i = 0; i < parm->nkeys; i++, ex++) {
> >>+            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
> >>+            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
> >>+            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
> >>+            nl_msg_end_nested(request, offset_key);
> >>+        }
> >>+        nl_msg_end_nested(request, offset_keys_ex);
> >>+    }
> >>+    nl_msg_end_nested(request, offset);
> >>+}
> >>+
> >>+static void
> >>  nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
> >>  {
> >>      size_t offset;
> >>@@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
> >>      }
> >>  }
> >>+/* Given flower, a key_to_pedit map entry, calculates the rest,
> >>+ * where:
> >>+ *
> >>+ * mask, data - pointers of where read the first word of flower->key/mask.
> >>+ * current_offset - which offset to use for the first pedit action.
> >>+ * cnt - max pedits actions to use.
> >>+ * first_word_mask/last_word_mask - the mask to use for the first/last read
> >>+ * (as we read entire words). */
> >>  static void
> >>+calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
> >>+             int *cur_offset, int *cnt, uint32_t *last_word_mask,
> >>+             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
> >>+{
> >>+    int start_offset, max_offset, total_size;
> >>+    int diff, right_zero_bits, left_zero_bits;
> >>+    char *rewrite_key = (void *) &flower->rewrite.key;
> >>+    char *rewrite_mask = (void *) &flower->rewrite.mask;
> >>+
> >>+    max_offset = m->offset + m->size;
> >>+    start_offset = ROUND_DOWN(m->offset, 4);
> >>+    diff = m->offset - start_offset;
> >>+    total_size = max_offset - start_offset;
> >>+    right_zero_bits = 8 * (4 - (max_offset % 4));
> >>+    left_zero_bits = 8 * (m->offset - start_offset);
> >>+
> >>+    *cur_offset = start_offset;
> >>+    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
> >>+    *last_word_mask = UINT32_MAX >> right_zero_bits;
> >>+    *first_word_mask = UINT32_MAX << left_zero_bits;
> >>+    *data = (void *) (rewrite_key + m->flower_offset - diff);
> >>+    *mask = (void *) (rewrite_mask + m->flower_offset - diff);
> >
> >The type of *data and *mask is uint32_t *.
> >Why not cast to that type?
> 
> It's to avoid warning of increasing pointer alignment (-Wcast-align).

It sounds like the warning should be addressed rather than overridden using
a cast.

> >
> >>+}
> >>+
> >>+static inline void
> >>+csum_update_flag(struct tc_flower *flower,
> >>+                 enum pedit_header_type htype) {
> >
> >I think the above two lines could be one.
> >
> >>+    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
> >
> >A case statement might be nicer here.
> 
> Right, thanks.
> 
> >
> >>+        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
> >>+    }
> >>+    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
> >>+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
> >>+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
> >>+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
> >>+        if (flower->key.ip_proto == IPPROTO_TCP) {
> >>+            flower->mask.ip_proto = UINT8_MAX;
> >
> >What if the mask was not UINT8_MAX to start with?
> >Doesn't this create a different flow?
> 
> This is so the checksum will be strict (not setting/recalc udp checksum on
> non udp packets). It creates a more specific flow, a subset of the original.
> This is fine as datapath is free to ignore a mask, which is the same as a
> full mask we've set.

I'm not sure that I understand why its fine for the datapath to ignore the
mask.

> >>+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
> >>+        } else if (flower->key.ip_proto == IPPROTO_UDP) {
> >>+            flower->mask.ip_proto = UINT8_MAX;
> >>+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
> >>+        } else if (flower->key.ip_proto == IPPROTO_ICMP
> >>+                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
> >>+            flower->mask.ip_proto = UINT8_MAX;
> >>+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
> >>+        }
> >>+    }
> >>+}
> >>+
> >>+static int
> >>+nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
> >>+                                 struct tc_flower *flower)
> >>+{
> >>+    struct {
> >>+        struct tc_pedit sel;
> >>+        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
> >>+        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
> >>+    } sel = {
> >>+        .sel = {
> >>+            .nkeys = 0
> >>+        }
> >>+    };
> >>+    int i, j;
> >>+
> >>+    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
> >>+        struct flower_key_to_pedit *m = &flower_pedit_map[i];
> >>+        struct tc_pedit_key *pedit_key = NULL;
> >>+        struct tc_pedit_key_ex *pedit_key_ex = NULL;
> >>+        uint32_t *mask, *data, first_word_mask, last_word_mask;
> >>+        int cnt = 0, cur_offset = 0;
> >>+
> >>+        if (!m->size) {
> >>+            continue;
> >>+        }
> >>+
> >>+        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
> >>+                     &first_word_mask, &mask, &data);
> >>+
> >>+        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
> >>+            uint32_t mask_word = *mask;
> >>+
> >>+            if (j == 0) {
> >>+                mask_word &= first_word_mask;
> >>+            }
> >>+            if (j == cnt - 1) {
> >>+                mask_word &= last_word_mask;
> >>+            }
> >>+            if (!mask_word) {
> >>+                continue;
> >>+            }
> >>+            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
> >>+                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
> >>+                             MAX_PEDIT_OFFSETS);
> >>+                return EOPNOTSUPP;
> >>+            }
> >>+
> >>+            pedit_key = &sel.keys[sel.sel.nkeys];
> >>+            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
> >>+            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
> >>+            pedit_key_ex->htype = m->htype;
> >>+            pedit_key->off = cur_offset;
> >>+            pedit_key->mask = ~mask_word;
> >>+            pedit_key->val = *data & mask_word;
> >>+            sel.sel.nkeys++;
> >>+            csum_update_flag(flower, m->htype);
> >>+        }
> >>+    }
> >>+    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
> >>+
> >>+    return 0;
> >>+}
> >>+
> >>+static int
> >>  nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> >>  {
> >>      size_t offset;
> >>@@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> >>      offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
> >>      {
> >>          uint16_t act_index = 1;
> >>+        int error;
> >>+        if (flower->rewrite.rewrite) {
> >>+            act_offset = nl_msg_start_nested(request, act_index++);
> >>+            error = nl_msg_put_flower_rewrite_pedits(request, flower);
> >>+            if (error) {
> >>+                return error;
> >>+            }
> >>+            nl_msg_end_nested(request, act_offset);
> >>+
> >>+            act_offset = nl_msg_start_nested(request, act_index++);
> >>+            nl_msg_put_act_csum(request, flower->csum_update_flags);
> >>+            nl_msg_end_nested(request, act_offset);
> >>+        }
> >>          if (flower->set.set) {
> >>              act_offset = nl_msg_start_nested(request, act_index++);
> >>              nl_msg_put_act_tunnel_key_set(request, flower->set.id,
> >>@@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> >>          }
> >>      }
> >>      nl_msg_end_nested(request, offset);
> >>+
> >>+    return 0;
> >>  }
> >>  static void
> >>@@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
> >>      nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
> >>                              &flower->mask.member, sizeof flower->key.member)
> >>-static void
> >>+static int
> >>  nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
> >>  {
> >>+
> >>      uint16_t host_eth_type = ntohs(flower->key.eth_type);
> >>      bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
> >>+    int err;
> >>+
> >>+    /* need to parse acts first as some acts require changing the matching */
> >
> >This seems strange to me.
> 
> from the strict (normalized?) header checksum.

I think this code relates to setting flower->key.ip_proto == IPPROTO_TCP
above. Is that correct. Are there any other cases?

> 
> >
> >>+    err  = nl_msg_put_flower_acts(request, flower);
> >>+    if (err) {
> >>+        return err;
> >>+    }
> >>      if (is_vlan) {
> >>          host_eth_type = ntohs(flower->key.encap_eth_type);
> >>@@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
> >>          nl_msg_put_flower_tunnel(request, flower);
> >>      }
> >>-    nl_msg_put_flower_acts(request, flower);
> >>+    return 0;
> >>  }
> >>  int
> >>@@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
> >>      nl_msg_put_string(&request, TCA_KIND, "flower");
> >>      basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
> >>      {
> >>-        nl_msg_put_flower_options(&request, flower);
> >>+        error = nl_msg_put_flower_options(&request, flower);
> >>+
> >>+        if (error) {
> >>+            ofpbuf_uninit(&request);
> >>+            return error;
> >>+        }
> >>      }
> >>      nl_msg_end_nested(&request, basic_offset);
> >>diff --git a/lib/tc.h b/lib/tc.h
> >>index 6c69b79..7876051 100644
> >>--- a/lib/tc.h
> >>+++ b/lib/tc.h
> >>@@ -96,6 +96,7 @@ struct tc_flower_key {
> >>      struct {
> >>          ovs_be32 ipv4_src;
> >>          ovs_be32 ipv4_dst;
> >>+        uint8_t rewrite_ttl;
> >>      } ipv4;
> >>      struct {
> >>          struct in6_addr ipv6_src;
> >>@@ -120,6 +121,14 @@ struct tc_flower {
> >>      uint64_t lastused;
> >>      struct {
> >>+        bool rewrite;
> >>+        struct tc_flower_key key;
> >>+        struct tc_flower_key mask;
> >>+    } rewrite;
> >>+
> >>+    uint32_t csum_update_flags;
> >>+
> >>+    struct {
> >>          bool set;
> >>          ovs_be64 id;
> >>          ovs_be16 tp_src;
> >>@@ -152,6 +161,13 @@ struct tc_flower {
> >>      struct tc_cookie act_cookie;
> >>  };
> >>+/* assert that if we overflow with a masked write of uint32_t to the last byte
> >>+ * of flower.rewrite we overflow inside struct flower.
> >>+ * shouldn't happen unless someone moves rewrite to the end of flower */
> >>+BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
> >>+                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
> >>+                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
> >>+
> >>  int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
> >>                        struct tc_flower *flower);
> >>  int tc_del_filter(int ifindex, int prio, int handle);
> >>-- 
> >>2.7.5
> >>
Roi Dayan Oct. 25, 2017, 11:24 a.m. UTC | #4
On 27/09/2017 12:08, Simon Horman wrote:
> On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
>>
>>
>> On 18/09/2017 18:01, Simon Horman wrote:
>>> On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>
>>>> To be later used to implement ovs action set offloading.
>>>>
>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>>>> ---
>>>>   lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>   lib/tc.h |  16 +++
>>>>   2 files changed, 385 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/lib/tc.c b/lib/tc.c
>>>> index c9cada2..743b2ee 100644
>>>> --- a/lib/tc.c
>>>> +++ b/lib/tc.c
>>>> @@ -21,8 +21,10 @@
>>>>   #include <errno.h>
>>>>   #include <linux/if_ether.h>
>>>>   #include <linux/rtnetlink.h>
>>>> +#include <linux/tc_act/tc_csum.h>
>>>>   #include <linux/tc_act/tc_gact.h>
>>>>   #include <linux/tc_act/tc_mirred.h>
>>>> +#include <linux/tc_act/tc_pedit.h>
>>>>   #include <linux/tc_act/tc_tunnel_key.h>
>>>>   #include <linux/tc_act/tc_vlan.h>
>>>>   #include <linux/gen_stats.h>
>>>> @@ -33,11 +35,14 @@
>>>>   #include "netlink-socket.h"
>>>>   #include "netlink.h"
>>>>   #include "openvswitch/ofpbuf.h"
>>>> +#include "openvswitch/util.h"
>>>>   #include "openvswitch/vlog.h"
>>>>   #include "packets.h"
>>>>   #include "timeval.h"
>>>>   #include "unaligned.h"
>>>> +#define MAX_PEDIT_OFFSETS 8
>>>
>>> Why 8?
>> We don't expect anything more right now (ipv6 src/dst rewrite requires 8
>> pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
>> that's makes sens. do you suggest we increase it? to what?
> 
> It seems strange to me to place a somewhat arbitrary small limit
> when none exists in the pedit API being used. I would at prefer if
> it was at least a bigger, say 16 or 32.


Hi Simon,

Sorry for the late reply due to holidays and vacations.
Me & Paul going to go over this and do the fixes needed and
also rebase over latest master and run tests again.

I'll answer what I'm more familiar with now and Paul will continue.
The 8 here is too low and you right. We used this definition
for allocation of the pedit keys on the stack in
nl_msg_put_flower_rewrite_pedits()

It was for convenience instead of calculating the maximum possible
keys that could exists and allocating it there and freeing it at
the end.

Increasing it to 32 is probably more than enough and wont waste much.


> 
>>>>   VLOG_DEFINE_THIS_MODULE(tc);
>>>>   static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
>>>> @@ -50,6 +55,82 @@ enum tc_offload_policy {
>>>>   static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
>>>> +struct tc_pedit_key_ex {
>>>> +    enum pedit_header_type htype;
>>>> +    enum pedit_cmd cmd;
>>>> +};
>>>> +
>>>> +struct flower_key_to_pedit {
>>>> +    enum pedit_header_type htype;
>>>> +    int flower_offset;
>>>> +    int offset;
>>>> +    int size;
>>>> +};
>>>> +
>>>> +static struct flower_key_to_pedit flower_pedit_map[] = {
>>>> +    {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>>>> +        12,
>>>> +        offsetof(struct tc_flower_key, ipv4.ipv4_src),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>>>> +        16,
>>>> +        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>>>> +        8,
>>>> +        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
>>>> +        8,
>>>> +        offsetof(struct tc_flower_key, ipv6.ipv6_src),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
>>>> +        24,
>>>> +        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>>>> +        6,
>>>> +        offsetof(struct tc_flower_key, src_mac),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>>>> +        0,
>>>> +        offsetof(struct tc_flower_key, dst_mac),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>>>> +        12,
>>>> +        offsetof(struct tc_flower_key, eth_type),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
>>>> +        0,
>>>> +        offsetof(struct tc_flower_key, tcp_src),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
>>>> +        2,
>>>> +        offsetof(struct tc_flower_key, tcp_dst),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
>>>> +        0,
>>>> +        offsetof(struct tc_flower_key, udp_src),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
>>>> +    }, {
>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
>>>> +        2,
>>>> +        offsetof(struct tc_flower_key, udp_dst),
>>>> +        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
>>>> +    },
>>>> +};
>>>> +
>>>>   struct tcmsg *
>>>>   tc_make_request(int ifindex, int type, unsigned int flags,
>>>>                   struct ofpbuf *request)
>>>> @@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
>>>>       }
>>>>   }
>>>> +static const struct nl_policy pedit_policy[] = {
>>>> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
>>>> +                                     .min_len = sizeof(struct tc_pedit),
>>>> +                                     .optional = false, },
>>>> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
>>>> +                                      .optional = false, },
>>>> +};
>>>> +
>>>> +static int
>>>> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
>>>> +{
>>>> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
>>>> +    const struct tc_pedit *pe;
>>>> +    const struct tc_pedit_key *keys;
>>>> +    const struct nlattr *nla, *keys_ex, *ex_type;
>>>> +    const void *keys_attr;
>>>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>>>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>>>> +    size_t keys_ex_size, left;
>>>> +    int type, i = 0;
>>>> +
>>>> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
>>>> +                         ARRAY_SIZE(pedit_policy))) {
>>>> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
>>>> +        return EPROTO;
>>>> +    }
>>>> +
>>>> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
>>>> +    keys = pe->keys;
>>>> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
>>>> +    keys_ex = nl_attr_get(keys_attr);
>>>> +    keys_ex_size = nl_attr_get_size(keys_attr);
>>>> +
>>>> +    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
>>>> +        if (i >= pe->nkeys) {
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
>>>> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
>>>> +            type = nl_attr_get_u16(ex_type);
>>>> +
>>>> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
>>>> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
>>>> +                int flower_off = m->flower_offset;
>>>> +                int sz = m->size;
>>>> +                int mf = m->offset;
>>>> +
>>>> +                if (m->htype != type) {
>>>> +                   continue;
>>>> +                }
>>>> +
>>>> +                /* check overlap between current pedit key, which is always
>>>> +                 * 4 bytes (range [off, off + 3]), and a map entry in
>>>> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
>>>> +                if ((keys->off >= mf && keys->off < mf + sz)
>>>> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
>>>> +                    int diff = flower_off + (keys->off - mf);
>>>> +                    uint32_t *dst = (void *) (rewrite_key + diff);
>>>> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
>>>> +                    uint32_t mask = ~(keys->mask);
>>>> +                    uint32_t zero_bits;
>>>> +
>>>> +                    if (keys->off < mf) {
>>>> +                        zero_bits = 8 * (mf - keys->off);
>>>> +                        mask &= UINT32_MAX << zero_bits;
>>>> +                    } else if (keys->off + 4 > mf + m->size) {
>>>> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
>>>> +                        mask &= UINT32_MAX >> zero_bits;
>>>> +                    }
>>>> +
>>>> +                    *dst_m |= mask;
>>>> +                    *dst |= keys->val & mask;
>>>> +                }
>>>> +            }
>>>
>>> If I understand the above correctly it is designed to make
>>> pedit actions disjoint. If so, why is that necessary? >
>>
>> It's not, as a single flower key rewrite can be split to multiple pedit
>> actions it finds the overlap between a flower key and a pedit action, if
>> they do overlap it translates it to the correct offset and masks it out.
> 
> Thanks, understood.
> 
>>
>>>> +        } else {
>>>> +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
>>>> +                        nl_attr_type(nla));
>>>> +            return EOPNOTSUPP;
>>>> +        }
>>>
>>> I think the code could exit early here as
>>> nl_msg_put_flower_rewrite_pedits() does below.
>>>
>>
>> Sorry, didn't understand. can you give an example?
>>
> 
> I meant something like this.
> 
>              if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
>                  VLOG_ERR_RL(...);
>                  return EOPNOTSUPP;
>              }

understood. we'll do that. thanks.

> 
> 	    /* Overlap detection code goes here */
> 
>>>
>>>> +
>>>> +        keys++;
>>>> +        i++;
>>>> +    }
>>>> +
>>>> +    flower->rewrite.rewrite = true;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>   static const struct nl_policy tunnel_key_policy[] = {
>>>>       [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
>>>>                                  .min_len = sizeof(struct tc_tunnel_key),
>>>> @@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
>>>>           nl_parse_act_vlan(act_options, flower);
>>>>       } else if (!strcmp(act_kind, "tunnel_key")) {
>>>>           nl_parse_act_tunnel_key(act_options, flower);
>>>> +    } else if (!strcmp(act_kind, "pedit")) {
>>>> +        nl_parse_act_pedit(act_options, flower);
>>>> +    } else if (!strcmp(act_kind, "csum")) {
>>>> +        /* not doing anything for now, ovs has an implicit csum recalculation
>>>> +         * with rewriting of packet headers (translating of pedit acts). */
>>>
>>> I wonder if the absence of a csum action when needed (by TC)
>>> should be treated as an error >
>>
>> Do you mean validating csum flags from each protocol and such (like in put)?
> 
> Yes, I think so.
> 
> In OvS (without TC acceleration) csum recalculation is implicit
> but with TC an explicit csum update action is required. I see
> in your code you are handling this by adding csum actions to TC actions
> when translating from OvS DPIF actions. And in the reverse (above) csum
> actions are simply ignored.
> 
> What I am wondering is if when translating TC actions to OvS DPIF actions
> you should track if the TC actions should include a csum rule because
> of other actions and treat its absence as an error.
> 
>>
>>>>       } else {
>>>>           VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
>>>>           return EINVAL;
>>>> @@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
>>>>   }
>>>>   static void
>>>> +nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
>>>> +{
>>>> +    size_t offset;
>>>> +
>>>> +    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
>>>> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
>>>> +    {
>>>> +        struct tc_csum parm = { .action = TC_ACT_PIPE,
>>>> +                                .update_flags = flags };
>>>> +
>>>> +        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
>>>> +    }
>>>> +    nl_msg_end_nested(request, offset);
>>>> +}
>>>> +
>>>> +static void
>>>> +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
>>>> +                     struct tc_pedit_key_ex *ex)
>>>> +{
>>>> +    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
>>>
>>> Are there unnecessary () on the line above?
>> No, I'll remove them.
>>
>>>
>>>> +    size_t offset, offset_keys_ex, offset_key;
>>>> +    int i;
>>>> +
>>>> +    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
>>>> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
>>>> +    {
>>>> +        parm->action = TC_ACT_PIPE;
>>>> +
>>>> +        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
>>>> +        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
>>>> +        for (i = 0; i < parm->nkeys; i++, ex++) {
>>>> +            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
>>>> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
>>>> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
>>>> +            nl_msg_end_nested(request, offset_key);
>>>> +        }
>>>> +        nl_msg_end_nested(request, offset_keys_ex);
>>>> +    }
>>>> +    nl_msg_end_nested(request, offset);
>>>> +}
>>>> +
>>>> +static void
>>>>   nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
>>>>   {
>>>>       size_t offset;
>>>> @@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
>>>>       }
>>>>   }
>>>> +/* Given flower, a key_to_pedit map entry, calculates the rest,
>>>> + * where:
>>>> + *
>>>> + * mask, data - pointers of where read the first word of flower->key/mask.
>>>> + * current_offset - which offset to use for the first pedit action.
>>>> + * cnt - max pedits actions to use.
>>>> + * first_word_mask/last_word_mask - the mask to use for the first/last read
>>>> + * (as we read entire words). */
>>>>   static void
>>>> +calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
>>>> +             int *cur_offset, int *cnt, uint32_t *last_word_mask,
>>>> +             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
>>>> +{
>>>> +    int start_offset, max_offset, total_size;
>>>> +    int diff, right_zero_bits, left_zero_bits;
>>>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>>>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>>>> +
>>>> +    max_offset = m->offset + m->size;
>>>> +    start_offset = ROUND_DOWN(m->offset, 4);
>>>> +    diff = m->offset - start_offset;
>>>> +    total_size = max_offset - start_offset;
>>>> +    right_zero_bits = 8 * (4 - (max_offset % 4));
>>>> +    left_zero_bits = 8 * (m->offset - start_offset);
>>>> +
>>>> +    *cur_offset = start_offset;
>>>> +    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
>>>> +    *last_word_mask = UINT32_MAX >> right_zero_bits;
>>>> +    *first_word_mask = UINT32_MAX << left_zero_bits;
>>>> +    *data = (void *) (rewrite_key + m->flower_offset - diff);
>>>> +    *mask = (void *) (rewrite_mask + m->flower_offset - diff);
>>>
>>> The type of *data and *mask is uint32_t *.
>>> Why not cast to that type?
>>
>> It's to avoid warning of increasing pointer alignment (-Wcast-align).
> 
> It sounds like the warning should be addressed rather than overridden using
> a cast.
> 
>>>
>>>> +}
>>>> +
>>>> +static inline void
>>>> +csum_update_flag(struct tc_flower *flower,
>>>> +                 enum pedit_header_type htype) {
>>>
>>> I think the above two lines could be one.
>>>
>>>> +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
>>>
>>> A case statement might be nicer here.
>>
>> Right, thanks.
>>
>>>
>>>> +        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
>>>> +    }
>>>> +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
>>>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
>>>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
>>>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
>>>> +        if (flower->key.ip_proto == IPPROTO_TCP) {
>>>> +            flower->mask.ip_proto = UINT8_MAX;
>>>
>>> What if the mask was not UINT8_MAX to start with?
>>> Doesn't this create a different flow?
>>
>> This is so the checksum will be strict (not setting/recalc udp checksum on
>> non udp packets). It creates a more specific flow, a subset of the original.
>> This is fine as datapath is free to ignore a mask, which is the same as a
>> full mask we've set.
> 
> I'm not sure that I understand why its fine for the datapath to ignore the
> mask.

I'll try to explain.
When we want the HW to recalc the csum we need to also do matching on
it. We do that for TCP, UDP, ICMP.

> 
>>>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
>>>> +        } else if (flower->key.ip_proto == IPPROTO_UDP) {
>>>> +            flower->mask.ip_proto = UINT8_MAX;
>>>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
>>>> +        } else if (flower->key.ip_proto == IPPROTO_ICMP
>>>> +                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
>>>> +            flower->mask.ip_proto = UINT8_MAX;
>>>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>> +static int
>>>> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
>>>> +                                 struct tc_flower *flower)
>>>> +{
>>>> +    struct {
>>>> +        struct tc_pedit sel;
>>>> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
>>>> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
>>>> +    } sel = {
>>>> +        .sel = {
>>>> +            .nkeys = 0
>>>> +        }
>>>> +    };
>>>> +    int i, j;
>>>> +
>>>> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
>>>> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
>>>> +        struct tc_pedit_key *pedit_key = NULL;
>>>> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
>>>> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
>>>> +        int cnt = 0, cur_offset = 0;
>>>> +
>>>> +        if (!m->size) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
>>>> +                     &first_word_mask, &mask, &data);
>>>> +
>>>> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
>>>> +            uint32_t mask_word = *mask;
>>>> +
>>>> +            if (j == 0) {
>>>> +                mask_word &= first_word_mask;
>>>> +            }
>>>> +            if (j == cnt - 1) {
>>>> +                mask_word &= last_word_mask;
>>>> +            }
>>>> +            if (!mask_word) {
>>>> +                continue;
>>>> +            }
>>>> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
>>>> +                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
>>>> +                             MAX_PEDIT_OFFSETS);
>>>> +                return EOPNOTSUPP;
>>>> +            }
>>>> +
>>>> +            pedit_key = &sel.keys[sel.sel.nkeys];
>>>> +            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
>>>> +            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
>>>> +            pedit_key_ex->htype = m->htype;
>>>> +            pedit_key->off = cur_offset;
>>>> +            pedit_key->mask = ~mask_word;
>>>> +            pedit_key->val = *data & mask_word;
>>>> +            sel.sel.nkeys++;
>>>> +            csum_update_flag(flower, m->htype);
>>>> +        }
>>>> +    }
>>>> +    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int
>>>>   nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>>>   {
>>>>       size_t offset;
>>>> @@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>>>       offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
>>>>       {
>>>>           uint16_t act_index = 1;
>>>> +        int error;
>>>> +        if (flower->rewrite.rewrite) {
>>>> +            act_offset = nl_msg_start_nested(request, act_index++);
>>>> +            error = nl_msg_put_flower_rewrite_pedits(request, flower);
>>>> +            if (error) {
>>>> +                return error;
>>>> +            }
>>>> +            nl_msg_end_nested(request, act_offset);
>>>> +
>>>> +            act_offset = nl_msg_start_nested(request, act_index++);
>>>> +            nl_msg_put_act_csum(request, flower->csum_update_flags);
>>>> +            nl_msg_end_nested(request, act_offset);
>>>> +        }
>>>>           if (flower->set.set) {
>>>>               act_offset = nl_msg_start_nested(request, act_index++);
>>>>               nl_msg_put_act_tunnel_key_set(request, flower->set.id,
>>>> @@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>>>           }
>>>>       }
>>>>       nl_msg_end_nested(request, offset);
>>>> +
>>>> +    return 0;
>>>>   }
>>>>   static void
>>>> @@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
>>>>       nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
>>>>                               &flower->mask.member, sizeof flower->key.member)
>>>> -static void
>>>> +static int
>>>>   nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>>>>   {
>>>> +
>>>>       uint16_t host_eth_type = ntohs(flower->key.eth_type);
>>>>       bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
>>>> +    int err;
>>>> +
>>>> +    /* need to parse acts first as some acts require changing the matching */
>>>
>>> This seems strange to me.
>>
>> from the strict (normalized?) header checksum.
> 
> I think this code relates to setting flower->key.ip_proto == IPPROTO_TCP
> above. Is that correct. Are there any other cases?

yes and also for UDP and ICMP. as explained above.
So going over the acts in header rewrite case we might want to signal 
the HW recalc csum and we need matching on those fields that require new
csum so the matching mask being updated. this is why we parse the acts
first.


> 
>>
>>>
>>>> +    err  = nl_msg_put_flower_acts(request, flower);
>>>> +    if (err) {
>>>> +        return err;
>>>> +    }
>>>>       if (is_vlan) {
>>>>           host_eth_type = ntohs(flower->key.encap_eth_type);
>>>> @@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>>>>           nl_msg_put_flower_tunnel(request, flower);
>>>>       }
>>>> -    nl_msg_put_flower_acts(request, flower);
>>>> +    return 0;
>>>>   }
>>>>   int
>>>> @@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>>>>       nl_msg_put_string(&request, TCA_KIND, "flower");
>>>>       basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>>>>       {
>>>> -        nl_msg_put_flower_options(&request, flower);
>>>> +        error = nl_msg_put_flower_options(&request, flower);
>>>> +
>>>> +        if (error) {
>>>> +            ofpbuf_uninit(&request);
>>>> +            return error;
>>>> +        }
>>>>       }
>>>>       nl_msg_end_nested(&request, basic_offset);
>>>> diff --git a/lib/tc.h b/lib/tc.h
>>>> index 6c69b79..7876051 100644
>>>> --- a/lib/tc.h
>>>> +++ b/lib/tc.h
>>>> @@ -96,6 +96,7 @@ struct tc_flower_key {
>>>>       struct {
>>>>           ovs_be32 ipv4_src;
>>>>           ovs_be32 ipv4_dst;
>>>> +        uint8_t rewrite_ttl;
>>>>       } ipv4;
>>>>       struct {
>>>>           struct in6_addr ipv6_src;
>>>> @@ -120,6 +121,14 @@ struct tc_flower {
>>>>       uint64_t lastused;
>>>>       struct {
>>>> +        bool rewrite;
>>>> +        struct tc_flower_key key;
>>>> +        struct tc_flower_key mask;
>>>> +    } rewrite;
>>>> +
>>>> +    uint32_t csum_update_flags;
>>>> +
>>>> +    struct {
>>>>           bool set;
>>>>           ovs_be64 id;
>>>>           ovs_be16 tp_src;
>>>> @@ -152,6 +161,13 @@ struct tc_flower {
>>>>       struct tc_cookie act_cookie;
>>>>   };
>>>> +/* assert that if we overflow with a masked write of uint32_t to the last byte
>>>> + * of flower.rewrite we overflow inside struct flower.
>>>> + * shouldn't happen unless someone moves rewrite to the end of flower */
>>>> +BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
>>>> +                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
>>>> +                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
>>>> +
>>>>   int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>>>>                         struct tc_flower *flower);
>>>>   int tc_del_filter(int ifindex, int prio, int handle);
>>>> -- 
>>>> 2.7.5
>>>>
Simon Horman Oct. 30, 2017, 1:42 p.m. UTC | #5
On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
> 
> 
> On 27/09/2017 12:08, Simon Horman wrote:
> > On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
> > > 
> > > 
> > > On 18/09/2017 18:01, Simon Horman wrote:
> > > > On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
> > > > > From: Paul Blakey <paulb@mellanox.com>
> > > > > 
> > > > > To be later used to implement ovs action set offloading.
> > > > > 
> > > > > Signed-off-by: Paul Blakey <paulb@mellanox.com>
> > > > > Reviewed-by: Roi Dayan <roid@mellanox.com>
> > > > > ---
> > > > >   lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > >   lib/tc.h |  16 +++
> > > > >   2 files changed, 385 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/lib/tc.c b/lib/tc.c
> > > > > index c9cada2..743b2ee 100644
> > > > > --- a/lib/tc.c
> > > > > +++ b/lib/tc.c
> > > > > @@ -21,8 +21,10 @@
> > > > >   #include <errno.h>
> > > > >   #include <linux/if_ether.h>
> > > > >   #include <linux/rtnetlink.h>
> > > > > +#include <linux/tc_act/tc_csum.h>
> > > > >   #include <linux/tc_act/tc_gact.h>
> > > > >   #include <linux/tc_act/tc_mirred.h>
> > > > > +#include <linux/tc_act/tc_pedit.h>
> > > > >   #include <linux/tc_act/tc_tunnel_key.h>
> > > > >   #include <linux/tc_act/tc_vlan.h>
> > > > >   #include <linux/gen_stats.h>
> > > > > @@ -33,11 +35,14 @@
> > > > >   #include "netlink-socket.h"
> > > > >   #include "netlink.h"
> > > > >   #include "openvswitch/ofpbuf.h"
> > > > > +#include "openvswitch/util.h"
> > > > >   #include "openvswitch/vlog.h"
> > > > >   #include "packets.h"
> > > > >   #include "timeval.h"
> > > > >   #include "unaligned.h"
> > > > > +#define MAX_PEDIT_OFFSETS 8
> > > > 
> > > > Why 8?
> > > We don't expect anything more right now (ipv6 src/dst rewrite requires 8
> > > pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
> > > that's makes sens. do you suggest we increase it? to what?
> > 
> > It seems strange to me to place a somewhat arbitrary small limit
> > when none exists in the pedit API being used. I would at prefer if
> > it was at least a bigger, say 16 or 32.
> 
> 
> Hi Simon,
> 
> Sorry for the late reply due to holidays and vacations.
> Me & Paul going to go over this and do the fixes needed and
> also rebase over latest master and run tests again.

Likewise, sorry for not responding earlier (same reason).

> I'll answer what I'm more familiar with now and Paul will continue.
> The 8 here is too low and you right. We used this definition
> for allocation of the pedit keys on the stack in
> nl_msg_put_flower_rewrite_pedits()
> 
> It was for convenience instead of calculating the maximum possible
> keys that could exists and allocating it there and freeing it at
> the end.
> 
> Increasing it to 32 is probably more than enough and wont waste much.

Thanks, that sounds good.

> > > > >   VLOG_DEFINE_THIS_MODULE(tc);
> > > > >   static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
> > > > > @@ -50,6 +55,82 @@ enum tc_offload_policy {
> > > > >   static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
> > > > > +struct tc_pedit_key_ex {
> > > > > +    enum pedit_header_type htype;
> > > > > +    enum pedit_cmd cmd;
> > > > > +};
> > > > > +
> > > > > +struct flower_key_to_pedit {
> > > > > +    enum pedit_header_type htype;
> > > > > +    int flower_offset;
> > > > > +    int offset;
> > > > > +    int size;
> > > > > +};
> > > > > +
> > > > > +static struct flower_key_to_pedit flower_pedit_map[] = {
> > > > > +    {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> > > > > +        12,
> > > > > +        offsetof(struct tc_flower_key, ipv4.ipv4_src),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> > > > > +        16,
> > > > > +        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> > > > > +        8,
> > > > > +        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
> > > > > +        8,
> > > > > +        offsetof(struct tc_flower_key, ipv6.ipv6_src),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
> > > > > +        24,
> > > > > +        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> > > > > +        6,
> > > > > +        offsetof(struct tc_flower_key, src_mac),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> > > > > +        0,
> > > > > +        offsetof(struct tc_flower_key, dst_mac),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> > > > > +        12,
> > > > > +        offsetof(struct tc_flower_key, eth_type),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
> > > > > +        0,
> > > > > +        offsetof(struct tc_flower_key, tcp_src),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
> > > > > +        2,
> > > > > +        offsetof(struct tc_flower_key, tcp_dst),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
> > > > > +        0,
> > > > > +        offsetof(struct tc_flower_key, udp_src),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
> > > > > +    }, {
> > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
> > > > > +        2,
> > > > > +        offsetof(struct tc_flower_key, udp_dst),
> > > > > +        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
> > > > > +    },
> > > > > +};
> > > > > +
> > > > >   struct tcmsg *
> > > > >   tc_make_request(int ifindex, int type, unsigned int flags,
> > > > >                   struct ofpbuf *request)
> > > > > @@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
> > > > >       }
> > > > >   }
> > > > > +static const struct nl_policy pedit_policy[] = {
> > > > > +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
> > > > > +                                     .min_len = sizeof(struct tc_pedit),
> > > > > +                                     .optional = false, },
> > > > > +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
> > > > > +                                      .optional = false, },
> > > > > +};
> > > > > +
> > > > > +static int
> > > > > +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
> > > > > +{
> > > > > +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
> > > > > +    const struct tc_pedit *pe;
> > > > > +    const struct tc_pedit_key *keys;
> > > > > +    const struct nlattr *nla, *keys_ex, *ex_type;
> > > > > +    const void *keys_attr;
> > > > > +    char *rewrite_key = (void *) &flower->rewrite.key;
> > > > > +    char *rewrite_mask = (void *) &flower->rewrite.mask;
> > > > > +    size_t keys_ex_size, left;
> > > > > +    int type, i = 0;
> > > > > +
> > > > > +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
> > > > > +                         ARRAY_SIZE(pedit_policy))) {
> > > > > +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
> > > > > +        return EPROTO;
> > > > > +    }
> > > > > +
> > > > > +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
> > > > > +    keys = pe->keys;
> > > > > +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
> > > > > +    keys_ex = nl_attr_get(keys_attr);
> > > > > +    keys_ex_size = nl_attr_get_size(keys_attr);
> > > > > +
> > > > > +    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
> > > > > +        if (i >= pe->nkeys) {
> > > > > +            break;
> > > > > +        }
> > > > > +
> > > > > +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
> > > > > +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
> > > > > +            type = nl_attr_get_u16(ex_type);
> > > > > +
> > > > > +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
> > > > > +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
> > > > > +                int flower_off = m->flower_offset;
> > > > > +                int sz = m->size;
> > > > > +                int mf = m->offset;
> > > > > +
> > > > > +                if (m->htype != type) {
> > > > > +                   continue;
> > > > > +                }
> > > > > +
> > > > > +                /* check overlap between current pedit key, which is always
> > > > > +                 * 4 bytes (range [off, off + 3]), and a map entry in
> > > > > +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
> > > > > +                if ((keys->off >= mf && keys->off < mf + sz)
> > > > > +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
> > > > > +                    int diff = flower_off + (keys->off - mf);
> > > > > +                    uint32_t *dst = (void *) (rewrite_key + diff);
> > > > > +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
> > > > > +                    uint32_t mask = ~(keys->mask);
> > > > > +                    uint32_t zero_bits;
> > > > > +
> > > > > +                    if (keys->off < mf) {
> > > > > +                        zero_bits = 8 * (mf - keys->off);
> > > > > +                        mask &= UINT32_MAX << zero_bits;
> > > > > +                    } else if (keys->off + 4 > mf + m->size) {
> > > > > +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
> > > > > +                        mask &= UINT32_MAX >> zero_bits;
> > > > > +                    }
> > > > > +
> > > > > +                    *dst_m |= mask;
> > > > > +                    *dst |= keys->val & mask;
> > > > > +                }
> > > > > +            }
> > > > 
> > > > If I understand the above correctly it is designed to make
> > > > pedit actions disjoint. If so, why is that necessary? >
> > > 
> > > It's not, as a single flower key rewrite can be split to multiple pedit
> > > actions it finds the overlap between a flower key and a pedit action, if
> > > they do overlap it translates it to the correct offset and masks it out.
> > 
> > Thanks, understood.
> > 
> > > 
> > > > > +        } else {
> > > > > +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
> > > > > +                        nl_attr_type(nla));
> > > > > +            return EOPNOTSUPP;
> > > > > +        }
> > > > 
> > > > I think the code could exit early here as
> > > > nl_msg_put_flower_rewrite_pedits() does below.
> > > > 
> > > 
> > > Sorry, didn't understand. can you give an example?
> > > 
> > 
> > I meant something like this.
> > 
> >              if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
> >                  VLOG_ERR_RL(...);
> >                  return EOPNOTSUPP;
> >              }
> 
> understood. we'll do that. thanks.
> 
> > 
> > 	    /* Overlap detection code goes here */
> > 
> > > > 
> > > > > +
> > > > > +        keys++;
> > > > > +        i++;
> > > > > +    }
> > > > > +
> > > > > +    flower->rewrite.rewrite = true;
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > >   static const struct nl_policy tunnel_key_policy[] = {
> > > > >       [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
> > > > >                                  .min_len = sizeof(struct tc_tunnel_key),
> > > > > @@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
> > > > >           nl_parse_act_vlan(act_options, flower);
> > > > >       } else if (!strcmp(act_kind, "tunnel_key")) {
> > > > >           nl_parse_act_tunnel_key(act_options, flower);
> > > > > +    } else if (!strcmp(act_kind, "pedit")) {
> > > > > +        nl_parse_act_pedit(act_options, flower);
> > > > > +    } else if (!strcmp(act_kind, "csum")) {
> > > > > +        /* not doing anything for now, ovs has an implicit csum recalculation
> > > > > +         * with rewriting of packet headers (translating of pedit acts). */
> > > > 
> > > > I wonder if the absence of a csum action when needed (by TC)
> > > > should be treated as an error >
> > > 
> > > Do you mean validating csum flags from each protocol and such (like in put)?
> > 
> > Yes, I think so.
> > 
> > In OvS (without TC acceleration) csum recalculation is implicit
> > but with TC an explicit csum update action is required. I see
> > in your code you are handling this by adding csum actions to TC actions
> > when translating from OvS DPIF actions. And in the reverse (above) csum
> > actions are simply ignored.
> > 
> > What I am wondering is if when translating TC actions to OvS DPIF actions
> > you should track if the TC actions should include a csum rule because
> > of other actions and treat its absence as an error.
> > 
> > > 
> > > > >       } else {
> > > > >           VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
> > > > >           return EINVAL;
> > > > > @@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
> > > > >   }
> > > > >   static void
> > > > > +nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
> > > > > +{
> > > > > +    size_t offset;
> > > > > +
> > > > > +    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
> > > > > +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> > > > > +    {
> > > > > +        struct tc_csum parm = { .action = TC_ACT_PIPE,
> > > > > +                                .update_flags = flags };
> > > > > +
> > > > > +        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
> > > > > +    }
> > > > > +    nl_msg_end_nested(request, offset);
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
> > > > > +                     struct tc_pedit_key_ex *ex)
> > > > > +{
> > > > > +    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
> > > > 
> > > > Are there unnecessary () on the line above?
> > > No, I'll remove them.
> > > 
> > > > 
> > > > > +    size_t offset, offset_keys_ex, offset_key;
> > > > > +    int i;
> > > > > +
> > > > > +    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
> > > > > +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> > > > > +    {
> > > > > +        parm->action = TC_ACT_PIPE;
> > > > > +
> > > > > +        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
> > > > > +        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
> > > > > +        for (i = 0; i < parm->nkeys; i++, ex++) {
> > > > > +            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
> > > > > +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
> > > > > +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
> > > > > +            nl_msg_end_nested(request, offset_key);
> > > > > +        }
> > > > > +        nl_msg_end_nested(request, offset_keys_ex);
> > > > > +    }
> > > > > +    nl_msg_end_nested(request, offset);
> > > > > +}
> > > > > +
> > > > > +static void
> > > > >   nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
> > > > >   {
> > > > >       size_t offset;
> > > > > @@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
> > > > >       }
> > > > >   }
> > > > > +/* Given flower, a key_to_pedit map entry, calculates the rest,
> > > > > + * where:
> > > > > + *
> > > > > + * mask, data - pointers of where read the first word of flower->key/mask.
> > > > > + * current_offset - which offset to use for the first pedit action.
> > > > > + * cnt - max pedits actions to use.
> > > > > + * first_word_mask/last_word_mask - the mask to use for the first/last read
> > > > > + * (as we read entire words). */
> > > > >   static void
> > > > > +calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
> > > > > +             int *cur_offset, int *cnt, uint32_t *last_word_mask,
> > > > > +             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
> > > > > +{
> > > > > +    int start_offset, max_offset, total_size;
> > > > > +    int diff, right_zero_bits, left_zero_bits;
> > > > > +    char *rewrite_key = (void *) &flower->rewrite.key;
> > > > > +    char *rewrite_mask = (void *) &flower->rewrite.mask;
> > > > > +
> > > > > +    max_offset = m->offset + m->size;
> > > > > +    start_offset = ROUND_DOWN(m->offset, 4);
> > > > > +    diff = m->offset - start_offset;
> > > > > +    total_size = max_offset - start_offset;
> > > > > +    right_zero_bits = 8 * (4 - (max_offset % 4));
> > > > > +    left_zero_bits = 8 * (m->offset - start_offset);
> > > > > +
> > > > > +    *cur_offset = start_offset;
> > > > > +    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
> > > > > +    *last_word_mask = UINT32_MAX >> right_zero_bits;
> > > > > +    *first_word_mask = UINT32_MAX << left_zero_bits;
> > > > > +    *data = (void *) (rewrite_key + m->flower_offset - diff);
> > > > > +    *mask = (void *) (rewrite_mask + m->flower_offset - diff);
> > > > 
> > > > The type of *data and *mask is uint32_t *.
> > > > Why not cast to that type?
> > > 
> > > It's to avoid warning of increasing pointer alignment (-Wcast-align).
> > 
> > It sounds like the warning should be addressed rather than overridden using
> > a cast.
> > 
> > > > 
> > > > > +}
> > > > > +
> > > > > +static inline void
> > > > > +csum_update_flag(struct tc_flower *flower,
> > > > > +                 enum pedit_header_type htype) {
> > > > 
> > > > I think the above two lines could be one.
> > > > 
> > > > > +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
> > > > 
> > > > A case statement might be nicer here.
> > > 
> > > Right, thanks.
> > > 
> > > > 
> > > > > +        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
> > > > > +    }
> > > > > +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
> > > > > +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
> > > > > +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
> > > > > +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
> > > > > +        if (flower->key.ip_proto == IPPROTO_TCP) {
> > > > > +            flower->mask.ip_proto = UINT8_MAX;
> > > > 
> > > > What if the mask was not UINT8_MAX to start with?
> > > > Doesn't this create a different flow?
> > > 
> > > This is so the checksum will be strict (not setting/recalc udp checksum on
> > > non udp packets). It creates a more specific flow, a subset of the original.
> > > This is fine as datapath is free to ignore a mask, which is the same as a
> > > full mask we've set.
> > 
> > I'm not sure that I understand why its fine for the datapath to ignore the
> > mask.
> 
> I'll try to explain.
> When we want the HW to recalc the csum we need to also do matching on
> it. We do that for TCP, UDP, ICMP.

Ok, that makes sense. But I am concerned that the mask of the offloaded
flow no longer matches the mask of the flow OvS created. What are the
implications of that?

> > > > > +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
> > > > > +        } else if (flower->key.ip_proto == IPPROTO_UDP) {
> > > > > +            flower->mask.ip_proto = UINT8_MAX;
> > > > > +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
> > > > > +        } else if (flower->key.ip_proto == IPPROTO_ICMP
> > > > > +                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
> > > > > +            flower->mask.ip_proto = UINT8_MAX;
> > > > > +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
> > > > > +        }
> > > > > +    }
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
> > > > > +                                 struct tc_flower *flower)
> > > > > +{
> > > > > +    struct {
> > > > > +        struct tc_pedit sel;
> > > > > +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
> > > > > +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
> > > > > +    } sel = {
> > > > > +        .sel = {
> > > > > +            .nkeys = 0
> > > > > +        }
> > > > > +    };
> > > > > +    int i, j;
> > > > > +
> > > > > +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
> > > > > +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
> > > > > +        struct tc_pedit_key *pedit_key = NULL;
> > > > > +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
> > > > > +        uint32_t *mask, *data, first_word_mask, last_word_mask;
> > > > > +        int cnt = 0, cur_offset = 0;
> > > > > +
> > > > > +        if (!m->size) {
> > > > > +            continue;
> > > > > +        }
> > > > > +
> > > > > +        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
> > > > > +                     &first_word_mask, &mask, &data);
> > > > > +
> > > > > +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
> > > > > +            uint32_t mask_word = *mask;
> > > > > +
> > > > > +            if (j == 0) {
> > > > > +                mask_word &= first_word_mask;
> > > > > +            }
> > > > > +            if (j == cnt - 1) {
> > > > > +                mask_word &= last_word_mask;
> > > > > +            }
> > > > > +            if (!mask_word) {
> > > > > +                continue;
> > > > > +            }
> > > > > +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
> > > > > +                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
> > > > > +                             MAX_PEDIT_OFFSETS);
> > > > > +                return EOPNOTSUPP;
> > > > > +            }
> > > > > +
> > > > > +            pedit_key = &sel.keys[sel.sel.nkeys];
> > > > > +            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
> > > > > +            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
> > > > > +            pedit_key_ex->htype = m->htype;
> > > > > +            pedit_key->off = cur_offset;
> > > > > +            pedit_key->mask = ~mask_word;
> > > > > +            pedit_key->val = *data & mask_word;
> > > > > +            sel.sel.nkeys++;
> > > > > +            csum_update_flag(flower, m->htype);
> > > > > +        }
> > > > > +    }
> > > > > +    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > >   nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> > > > >   {
> > > > >       size_t offset;
> > > > > @@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> > > > >       offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
> > > > >       {
> > > > >           uint16_t act_index = 1;
> > > > > +        int error;
> > > > > +        if (flower->rewrite.rewrite) {
> > > > > +            act_offset = nl_msg_start_nested(request, act_index++);
> > > > > +            error = nl_msg_put_flower_rewrite_pedits(request, flower);
> > > > > +            if (error) {
> > > > > +                return error;
> > > > > +            }
> > > > > +            nl_msg_end_nested(request, act_offset);
> > > > > +
> > > > > +            act_offset = nl_msg_start_nested(request, act_index++);
> > > > > +            nl_msg_put_act_csum(request, flower->csum_update_flags);
> > > > > +            nl_msg_end_nested(request, act_offset);
> > > > > +        }
> > > > >           if (flower->set.set) {
> > > > >               act_offset = nl_msg_start_nested(request, act_index++);
> > > > >               nl_msg_put_act_tunnel_key_set(request, flower->set.id,
> > > > > @@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> > > > >           }
> > > > >       }
> > > > >       nl_msg_end_nested(request, offset);
> > > > > +
> > > > > +    return 0;
> > > > >   }
> > > > >   static void
> > > > > @@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
> > > > >       nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
> > > > >                               &flower->mask.member, sizeof flower->key.member)
> > > > > -static void
> > > > > +static int
> > > > >   nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
> > > > >   {
> > > > > +
> > > > >       uint16_t host_eth_type = ntohs(flower->key.eth_type);
> > > > >       bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
> > > > > +    int err;
> > > > > +
> > > > > +    /* need to parse acts first as some acts require changing the matching */
> > > > 
> > > > This seems strange to me.
> > > 
> > > from the strict (normalized?) header checksum.
> > 
> > I think this code relates to setting flower->key.ip_proto == IPPROTO_TCP
> > above. Is that correct. Are there any other cases?
> 
> yes and also for UDP and ICMP. as explained above.
> So going over the acts in header rewrite case we might want to signal the HW
> recalc csum and we need matching on those fields that require new
> csum so the matching mask being updated. this is why we parse the acts
> first.

Thanks, understood.

> > > > > +    err  = nl_msg_put_flower_acts(request, flower);
> > > > > +    if (err) {
> > > > > +        return err;
> > > > > +    }
> > > > >       if (is_vlan) {
> > > > >           host_eth_type = ntohs(flower->key.encap_eth_type);
> > > > > @@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
> > > > >           nl_msg_put_flower_tunnel(request, flower);
> > > > >       }
> > > > > -    nl_msg_put_flower_acts(request, flower);
> > > > > +    return 0;
> > > > >   }
> > > > >   int
> > > > > @@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
> > > > >       nl_msg_put_string(&request, TCA_KIND, "flower");
> > > > >       basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
> > > > >       {
> > > > > -        nl_msg_put_flower_options(&request, flower);
> > > > > +        error = nl_msg_put_flower_options(&request, flower);
> > > > > +
> > > > > +        if (error) {
> > > > > +            ofpbuf_uninit(&request);
> > > > > +            return error;
> > > > > +        }
> > > > >       }
> > > > >       nl_msg_end_nested(&request, basic_offset);
> > > > > diff --git a/lib/tc.h b/lib/tc.h
> > > > > index 6c69b79..7876051 100644
> > > > > --- a/lib/tc.h
> > > > > +++ b/lib/tc.h
> > > > > @@ -96,6 +96,7 @@ struct tc_flower_key {
> > > > >       struct {
> > > > >           ovs_be32 ipv4_src;
> > > > >           ovs_be32 ipv4_dst;
> > > > > +        uint8_t rewrite_ttl;
> > > > >       } ipv4;
> > > > >       struct {
> > > > >           struct in6_addr ipv6_src;
> > > > > @@ -120,6 +121,14 @@ struct tc_flower {
> > > > >       uint64_t lastused;
> > > > >       struct {
> > > > > +        bool rewrite;
> > > > > +        struct tc_flower_key key;
> > > > > +        struct tc_flower_key mask;
> > > > > +    } rewrite;
> > > > > +
> > > > > +    uint32_t csum_update_flags;
> > > > > +
> > > > > +    struct {
> > > > >           bool set;
> > > > >           ovs_be64 id;
> > > > >           ovs_be16 tp_src;
> > > > > @@ -152,6 +161,13 @@ struct tc_flower {
> > > > >       struct tc_cookie act_cookie;
> > > > >   };
> > > > > +/* assert that if we overflow with a masked write of uint32_t to the last byte
> > > > > + * of flower.rewrite we overflow inside struct flower.
> > > > > + * shouldn't happen unless someone moves rewrite to the end of flower */
> > > > > +BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
> > > > > +                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
> > > > > +                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
> > > > > +
> > > > >   int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
> > > > >                         struct tc_flower *flower);
> > > > >   int tc_del_filter(int ifindex, int prio, int handle);
> > > > > -- 
> > > > > 2.7.5
> > > > >
Paul Blakey Oct. 31, 2017, 7:20 a.m. UTC | #6
On 30/10/2017 15:42, Simon Horman wrote:
> On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
>>
>>
>> On 27/09/2017 12:08, Simon Horman wrote:
>>> On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
>>>>
>>>>
>>>> On 18/09/2017 18:01, Simon Horman wrote:
>>>>> On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>
>>>>>> To be later used to implement ovs action set offloading.
>>>>>>
>>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>>>>>> ---
>>>>>>    lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>>    lib/tc.h |  16 +++
>>>>>>    2 files changed, 385 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/lib/tc.c b/lib/tc.c
>>>>>> index c9cada2..743b2ee 100644
>>>>>> --- a/lib/tc.c
>>>>>> +++ b/lib/tc.c
>>>>>> @@ -21,8 +21,10 @@
>>>>>>    #include <errno.h>
>>>>>>    #include <linux/if_ether.h>
>>>>>>    #include <linux/rtnetlink.h>
>>>>>> +#include <linux/tc_act/tc_csum.h>
>>>>>>    #include <linux/tc_act/tc_gact.h>
>>>>>>    #include <linux/tc_act/tc_mirred.h>
>>>>>> +#include <linux/tc_act/tc_pedit.h>
>>>>>>    #include <linux/tc_act/tc_tunnel_key.h>
>>>>>>    #include <linux/tc_act/tc_vlan.h>
>>>>>>    #include <linux/gen_stats.h>
>>>>>> @@ -33,11 +35,14 @@
>>>>>>    #include "netlink-socket.h"
>>>>>>    #include "netlink.h"
>>>>>>    #include "openvswitch/ofpbuf.h"
>>>>>> +#include "openvswitch/util.h"
>>>>>>    #include "openvswitch/vlog.h"
>>>>>>    #include "packets.h"
>>>>>>    #include "timeval.h"
>>>>>>    #include "unaligned.h"
>>>>>> +#define MAX_PEDIT_OFFSETS 8
>>>>>
>>>>> Why 8?
>>>> We don't expect anything more right now (ipv6 src/dst rewrite requires 8
>>>> pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
>>>> that's makes sens. do you suggest we increase it? to what?
>>>
>>> It seems strange to me to place a somewhat arbitrary small limit
>>> when none exists in the pedit API being used. I would at prefer if
>>> it was at least a bigger, say 16 or 32.
>>
>>
>> Hi Simon,
>>
>> Sorry for the late reply due to holidays and vacations.
>> Me & Paul going to go over this and do the fixes needed and
>> also rebase over latest master and run tests again.
> 
> Likewise, sorry for not responding earlier (same reason).
> 
>> I'll answer what I'm more familiar with now and Paul will continue.
>> The 8 here is too low and you right. We used this definition
>> for allocation of the pedit keys on the stack in
>> nl_msg_put_flower_rewrite_pedits()
>>
>> It was for convenience instead of calculating the maximum possible
>> keys that could exists and allocating it there and freeing it at
>> the end.
>>
>> Increasing it to 32 is probably more than enough and wont waste much.
> 
> Thanks, that sounds good.
> 
>>>>>>    VLOG_DEFINE_THIS_MODULE(tc);
>>>>>>    static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
>>>>>> @@ -50,6 +55,82 @@ enum tc_offload_policy {
>>>>>>    static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
>>>>>> +struct tc_pedit_key_ex {
>>>>>> +    enum pedit_header_type htype;
>>>>>> +    enum pedit_cmd cmd;
>>>>>> +};
>>>>>> +
>>>>>> +struct flower_key_to_pedit {
>>>>>> +    enum pedit_header_type htype;
>>>>>> +    int flower_offset;
>>>>>> +    int offset;
>>>>>> +    int size;
>>>>>> +};
>>>>>> +
>>>>>> +static struct flower_key_to_pedit flower_pedit_map[] = {
>>>>>> +    {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>>>>>> +        12,
>>>>>> +        offsetof(struct tc_flower_key, ipv4.ipv4_src),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>>>>>> +        16,
>>>>>> +        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
>>>>>> +        8,
>>>>>> +        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
>>>>>> +        8,
>>>>>> +        offsetof(struct tc_flower_key, ipv6.ipv6_src),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
>>>>>> +        24,
>>>>>> +        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>>>>>> +        6,
>>>>>> +        offsetof(struct tc_flower_key, src_mac),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>>>>>> +        0,
>>>>>> +        offsetof(struct tc_flower_key, dst_mac),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
>>>>>> +        12,
>>>>>> +        offsetof(struct tc_flower_key, eth_type),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
>>>>>> +        0,
>>>>>> +        offsetof(struct tc_flower_key, tcp_src),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
>>>>>> +        2,
>>>>>> +        offsetof(struct tc_flower_key, tcp_dst),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
>>>>>> +        0,
>>>>>> +        offsetof(struct tc_flower_key, udp_src),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
>>>>>> +    }, {
>>>>>> +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
>>>>>> +        2,
>>>>>> +        offsetof(struct tc_flower_key, udp_dst),
>>>>>> +        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
>>>>>> +    },
>>>>>> +};
>>>>>> +
>>>>>>    struct tcmsg *
>>>>>>    tc_make_request(int ifindex, int type, unsigned int flags,
>>>>>>                    struct ofpbuf *request)
>>>>>> @@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
>>>>>>        }
>>>>>>    }
>>>>>> +static const struct nl_policy pedit_policy[] = {
>>>>>> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
>>>>>> +                                     .min_len = sizeof(struct tc_pedit),
>>>>>> +                                     .optional = false, },
>>>>>> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
>>>>>> +                                      .optional = false, },
>>>>>> +};
>>>>>> +
>>>>>> +static int
>>>>>> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
>>>>>> +{
>>>>>> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
>>>>>> +    const struct tc_pedit *pe;
>>>>>> +    const struct tc_pedit_key *keys;
>>>>>> +    const struct nlattr *nla, *keys_ex, *ex_type;
>>>>>> +    const void *keys_attr;
>>>>>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>>>>>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>>>>>> +    size_t keys_ex_size, left;
>>>>>> +    int type, i = 0;
>>>>>> +
>>>>>> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
>>>>>> +                         ARRAY_SIZE(pedit_policy))) {
>>>>>> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
>>>>>> +        return EPROTO;
>>>>>> +    }
>>>>>> +
>>>>>> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
>>>>>> +    keys = pe->keys;
>>>>>> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
>>>>>> +    keys_ex = nl_attr_get(keys_attr);
>>>>>> +    keys_ex_size = nl_attr_get_size(keys_attr);
>>>>>> +
>>>>>> +    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
>>>>>> +        if (i >= pe->nkeys) {
>>>>>> +            break;
>>>>>> +        }
>>>>>> +
>>>>>> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
>>>>>> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
>>>>>> +            type = nl_attr_get_u16(ex_type);
>>>>>> +
>>>>>> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
>>>>>> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
>>>>>> +                int flower_off = m->flower_offset;
>>>>>> +                int sz = m->size;
>>>>>> +                int mf = m->offset;
>>>>>> +
>>>>>> +                if (m->htype != type) {
>>>>>> +                   continue;
>>>>>> +                }
>>>>>> +
>>>>>> +                /* check overlap between current pedit key, which is always
>>>>>> +                 * 4 bytes (range [off, off + 3]), and a map entry in
>>>>>> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
>>>>>> +                if ((keys->off >= mf && keys->off < mf + sz)
>>>>>> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
>>>>>> +                    int diff = flower_off + (keys->off - mf);
>>>>>> +                    uint32_t *dst = (void *) (rewrite_key + diff);
>>>>>> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
>>>>>> +                    uint32_t mask = ~(keys->mask);
>>>>>> +                    uint32_t zero_bits;
>>>>>> +
>>>>>> +                    if (keys->off < mf) {
>>>>>> +                        zero_bits = 8 * (mf - keys->off);
>>>>>> +                        mask &= UINT32_MAX << zero_bits;
>>>>>> +                    } else if (keys->off + 4 > mf + m->size) {
>>>>>> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
>>>>>> +                        mask &= UINT32_MAX >> zero_bits;
>>>>>> +                    }
>>>>>> +
>>>>>> +                    *dst_m |= mask;
>>>>>> +                    *dst |= keys->val & mask;
>>>>>> +                }
>>>>>> +            }
>>>>>
>>>>> If I understand the above correctly it is designed to make
>>>>> pedit actions disjoint. If so, why is that necessary? >
>>>>
>>>> It's not, as a single flower key rewrite can be split to multiple pedit
>>>> actions it finds the overlap between a flower key and a pedit action, if
>>>> they do overlap it translates it to the correct offset and masks it out.
>>>
>>> Thanks, understood.
>>>
>>>>
>>>>>> +        } else {
>>>>>> +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
>>>>>> +                        nl_attr_type(nla));
>>>>>> +            return EOPNOTSUPP;
>>>>>> +        }
>>>>>
>>>>> I think the code could exit early here as
>>>>> nl_msg_put_flower_rewrite_pedits() does below.
>>>>>
>>>>
>>>> Sorry, didn't understand. can you give an example?
>>>>
>>>
>>> I meant something like this.
>>>
>>>               if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
>>>                   VLOG_ERR_RL(...);
>>>                   return EOPNOTSUPP;
>>>               }
>>
>> understood. we'll do that. thanks.
>>
>>>
>>> 	    /* Overlap detection code goes here */
>>>
>>>>>
>>>>>> +
>>>>>> +        keys++;
>>>>>> +        i++;
>>>>>> +    }
>>>>>> +
>>>>>> +    flower->rewrite.rewrite = true;
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>>    static const struct nl_policy tunnel_key_policy[] = {
>>>>>>        [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
>>>>>>                                   .min_len = sizeof(struct tc_tunnel_key),
>>>>>> @@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
>>>>>>            nl_parse_act_vlan(act_options, flower);
>>>>>>        } else if (!strcmp(act_kind, "tunnel_key")) {
>>>>>>            nl_parse_act_tunnel_key(act_options, flower);
>>>>>> +    } else if (!strcmp(act_kind, "pedit")) {
>>>>>> +        nl_parse_act_pedit(act_options, flower);
>>>>>> +    } else if (!strcmp(act_kind, "csum")) {
>>>>>> +        /* not doing anything for now, ovs has an implicit csum recalculation
>>>>>> +         * with rewriting of packet headers (translating of pedit acts). */
>>>>>
>>>>> I wonder if the absence of a csum action when needed (by TC)
>>>>> should be treated as an error >
>>>>
>>>> Do you mean validating csum flags from each protocol and such (like in put)?
>>>
>>> Yes, I think so.
>>>
>>> In OvS (without TC acceleration) csum recalculation is implicit
>>> but with TC an explicit csum update action is required. I see
>>> in your code you are handling this by adding csum actions to TC actions
>>> when translating from OvS DPIF actions. And in the reverse (above) csum
>>> actions are simply ignored.
>>>
>>> What I am wondering is if when translating TC actions to OvS DPIF actions
>>> you should track if the TC actions should include a csum rule because
>>> of other actions and treat its absence as an error.
>>>
>>>>
>>>>>>        } else {
>>>>>>            VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
>>>>>>            return EINVAL;
>>>>>> @@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
>>>>>>    }
>>>>>>    static void
>>>>>> +nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
>>>>>> +{
>>>>>> +    size_t offset;
>>>>>> +
>>>>>> +    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
>>>>>> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
>>>>>> +    {
>>>>>> +        struct tc_csum parm = { .action = TC_ACT_PIPE,
>>>>>> +                                .update_flags = flags };
>>>>>> +
>>>>>> +        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
>>>>>> +    }
>>>>>> +    nl_msg_end_nested(request, offset);
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
>>>>>> +                     struct tc_pedit_key_ex *ex)
>>>>>> +{
>>>>>> +    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
>>>>>
>>>>> Are there unnecessary () on the line above?
>>>> No, I'll remove them.
>>>>
>>>>>
>>>>>> +    size_t offset, offset_keys_ex, offset_key;
>>>>>> +    int i;
>>>>>> +
>>>>>> +    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
>>>>>> +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
>>>>>> +    {
>>>>>> +        parm->action = TC_ACT_PIPE;
>>>>>> +
>>>>>> +        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
>>>>>> +        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
>>>>>> +        for (i = 0; i < parm->nkeys; i++, ex++) {
>>>>>> +            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
>>>>>> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
>>>>>> +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
>>>>>> +            nl_msg_end_nested(request, offset_key);
>>>>>> +        }
>>>>>> +        nl_msg_end_nested(request, offset_keys_ex);
>>>>>> +    }
>>>>>> +    nl_msg_end_nested(request, offset);
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>>    nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
>>>>>>    {
>>>>>>        size_t offset;
>>>>>> @@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
>>>>>>        }
>>>>>>    }
>>>>>> +/* Given flower, a key_to_pedit map entry, calculates the rest,
>>>>>> + * where:
>>>>>> + *
>>>>>> + * mask, data - pointers of where read the first word of flower->key/mask.
>>>>>> + * current_offset - which offset to use for the first pedit action.
>>>>>> + * cnt - max pedits actions to use.
>>>>>> + * first_word_mask/last_word_mask - the mask to use for the first/last read
>>>>>> + * (as we read entire words). */
>>>>>>    static void
>>>>>> +calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
>>>>>> +             int *cur_offset, int *cnt, uint32_t *last_word_mask,
>>>>>> +             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
>>>>>> +{
>>>>>> +    int start_offset, max_offset, total_size;
>>>>>> +    int diff, right_zero_bits, left_zero_bits;
>>>>>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>>>>>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>>>>>> +
>>>>>> +    max_offset = m->offset + m->size;
>>>>>> +    start_offset = ROUND_DOWN(m->offset, 4);
>>>>>> +    diff = m->offset - start_offset;
>>>>>> +    total_size = max_offset - start_offset;
>>>>>> +    right_zero_bits = 8 * (4 - (max_offset % 4));
>>>>>> +    left_zero_bits = 8 * (m->offset - start_offset);
>>>>>> +
>>>>>> +    *cur_offset = start_offset;
>>>>>> +    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
>>>>>> +    *last_word_mask = UINT32_MAX >> right_zero_bits;
>>>>>> +    *first_word_mask = UINT32_MAX << left_zero_bits;
>>>>>> +    *data = (void *) (rewrite_key + m->flower_offset - diff);
>>>>>> +    *mask = (void *) (rewrite_mask + m->flower_offset - diff);
>>>>>
>>>>> The type of *data and *mask is uint32_t *.
>>>>> Why not cast to that type?
>>>>
>>>> It's to avoid warning of increasing pointer alignment (-Wcast-align).
>>>
>>> It sounds like the warning should be addressed rather than overridden using
>>> a cast.
>>>
>>>>>
>>>>>> +}
>>>>>> +
>>>>>> +static inline void
>>>>>> +csum_update_flag(struct tc_flower *flower,
>>>>>> +                 enum pedit_header_type htype) {
>>>>>
>>>>> I think the above two lines could be one.
>>>>>
>>>>>> +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
>>>>>
>>>>> A case statement might be nicer here.
>>>>
>>>> Right, thanks.
>>>>
>>>>>
>>>>>> +        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
>>>>>> +    }
>>>>>> +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
>>>>>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
>>>>>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
>>>>>> +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
>>>>>> +        if (flower->key.ip_proto == IPPROTO_TCP) {
>>>>>> +            flower->mask.ip_proto = UINT8_MAX;
>>>>>
>>>>> What if the mask was not UINT8_MAX to start with?
>>>>> Doesn't this create a different flow?
>>>>
>>>> This is so the checksum will be strict (not setting/recalc udp checksum on
>>>> non udp packets). It creates a more specific flow, a subset of the original.
>>>> This is fine as datapath is free to ignore a mask, which is the same as a
>>>> full mask we've set.
>>>
>>> I'm not sure that I understand why its fine for the datapath to ignore the
>>> mask.
>>
>> I'll try to explain.
>> When we want the HW to recalc the csum we need to also do matching on
>> it. We do that for TCP, UDP, ICMP.
> 
> Ok, that makes sense. But I am concerned that the mask of the offloaded
> flow no longer matches the mask of the flow OvS created. What are the
> implications of that?
> 

Hi, I remember the mask being a suggestion to the datapath (can't find 
it right now) as the datapath might not support the full mask. After the 
flow is generated from a miss upcall it will be put to the datapath and 
isn't saved anywhere, only the flow ufid (hash based on key, mask and 
some seed, saved in ukey map at ofproto upcall...), if I recall 
correctly. We return the same ufid for the more specific flow that we 
put instead, so it will be aged correctly.
On dump it will show the more specific flow, as its a direct parse of 
dump from the datapath where there is the more specific one and not a 
saved flow.
On a miss that could have been caught by the original flow, ovs will 
generate a new UFID, since the key is different. So we will have
another flow put and again a more specific flow and so on.
This flows will be disjointed and dumped and aged correctly.


>>>>>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
>>>>>> +        } else if (flower->key.ip_proto == IPPROTO_UDP) {
>>>>>> +            flower->mask.ip_proto = UINT8_MAX;
>>>>>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
>>>>>> +        } else if (flower->key.ip_proto == IPPROTO_ICMP
>>>>>> +                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
>>>>>> +            flower->mask.ip_proto = UINT8_MAX;
>>>>>> +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
>>>>>> +        }
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +static int
>>>>>> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
>>>>>> +                                 struct tc_flower *flower)
>>>>>> +{
>>>>>> +    struct {
>>>>>> +        struct tc_pedit sel;
>>>>>> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
>>>>>> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
>>>>>> +    } sel = {
>>>>>> +        .sel = {
>>>>>> +            .nkeys = 0
>>>>>> +        }
>>>>>> +    };
>>>>>> +    int i, j;
>>>>>> +
>>>>>> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
>>>>>> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
>>>>>> +        struct tc_pedit_key *pedit_key = NULL;
>>>>>> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
>>>>>> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
>>>>>> +        int cnt = 0, cur_offset = 0;
>>>>>> +
>>>>>> +        if (!m->size) {
>>>>>> +            continue;
>>>>>> +        }
>>>>>> +
>>>>>> +        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
>>>>>> +                     &first_word_mask, &mask, &data);
>>>>>> +
>>>>>> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
>>>>>> +            uint32_t mask_word = *mask;
>>>>>> +
>>>>>> +            if (j == 0) {
>>>>>> +                mask_word &= first_word_mask;
>>>>>> +            }
>>>>>> +            if (j == cnt - 1) {
>>>>>> +                mask_word &= last_word_mask;
>>>>>> +            }
>>>>>> +            if (!mask_word) {
>>>>>> +                continue;
>>>>>> +            }
>>>>>> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
>>>>>> +                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
>>>>>> +                             MAX_PEDIT_OFFSETS);
>>>>>> +                return EOPNOTSUPP;
>>>>>> +            }
>>>>>> +
>>>>>> +            pedit_key = &sel.keys[sel.sel.nkeys];
>>>>>> +            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
>>>>>> +            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
>>>>>> +            pedit_key_ex->htype = m->htype;
>>>>>> +            pedit_key->off = cur_offset;
>>>>>> +            pedit_key->mask = ~mask_word;
>>>>>> +            pedit_key->val = *data & mask_word;
>>>>>> +            sel.sel.nkeys++;
>>>>>> +            csum_update_flag(flower, m->htype);
>>>>>> +        }
>>>>>> +    }
>>>>>> +    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int
>>>>>>    nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>>>>>    {
>>>>>>        size_t offset;
>>>>>> @@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>>>>>        offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
>>>>>>        {
>>>>>>            uint16_t act_index = 1;
>>>>>> +        int error;
>>>>>> +        if (flower->rewrite.rewrite) {
>>>>>> +            act_offset = nl_msg_start_nested(request, act_index++);
>>>>>> +            error = nl_msg_put_flower_rewrite_pedits(request, flower);
>>>>>> +            if (error) {
>>>>>> +                return error;
>>>>>> +            }
>>>>>> +            nl_msg_end_nested(request, act_offset);
>>>>>> +
>>>>>> +            act_offset = nl_msg_start_nested(request, act_index++);
>>>>>> +            nl_msg_put_act_csum(request, flower->csum_update_flags);
>>>>>> +            nl_msg_end_nested(request, act_offset);
>>>>>> +        }
>>>>>>            if (flower->set.set) {
>>>>>>                act_offset = nl_msg_start_nested(request, act_index++);
>>>>>>                nl_msg_put_act_tunnel_key_set(request, flower->set.id,
>>>>>> @@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
>>>>>>            }
>>>>>>        }
>>>>>>        nl_msg_end_nested(request, offset);
>>>>>> +
>>>>>> +    return 0;
>>>>>>    }
>>>>>>    static void
>>>>>> @@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
>>>>>>        nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
>>>>>>                                &flower->mask.member, sizeof flower->key.member)
>>>>>> -static void
>>>>>> +static int
>>>>>>    nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>>>>>>    {
>>>>>> +
>>>>>>        uint16_t host_eth_type = ntohs(flower->key.eth_type);
>>>>>>        bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
>>>>>> +    int err;
>>>>>> +
>>>>>> +    /* need to parse acts first as some acts require changing the matching */
>>>>>
>>>>> This seems strange to me.
>>>>
>>>> from the strict (normalized?) header checksum.
>>>
>>> I think this code relates to setting flower->key.ip_proto == IPPROTO_TCP
>>> above. Is that correct. Are there any other cases?
>>
>> yes and also for UDP and ICMP. as explained above.
>> So going over the acts in header rewrite case we might want to signal the HW
>> recalc csum and we need matching on those fields that require new
>> csum so the matching mask being updated. this is why we parse the acts
>> first.
> 
> Thanks, understood.
> 
>>>>>> +    err  = nl_msg_put_flower_acts(request, flower);
>>>>>> +    if (err) {
>>>>>> +        return err;
>>>>>> +    }
>>>>>>        if (is_vlan) {
>>>>>>            host_eth_type = ntohs(flower->key.encap_eth_type);
>>>>>> @@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>>>>>>            nl_msg_put_flower_tunnel(request, flower);
>>>>>>        }
>>>>>> -    nl_msg_put_flower_acts(request, flower);
>>>>>> +    return 0;
>>>>>>    }
>>>>>>    int
>>>>>> @@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>>>>>>        nl_msg_put_string(&request, TCA_KIND, "flower");
>>>>>>        basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>>>>>>        {
>>>>>> -        nl_msg_put_flower_options(&request, flower);
>>>>>> +        error = nl_msg_put_flower_options(&request, flower);
>>>>>> +
>>>>>> +        if (error) {
>>>>>> +            ofpbuf_uninit(&request);
>>>>>> +            return error;
>>>>>> +        }
>>>>>>        }
>>>>>>        nl_msg_end_nested(&request, basic_offset);
>>>>>> diff --git a/lib/tc.h b/lib/tc.h
>>>>>> index 6c69b79..7876051 100644
>>>>>> --- a/lib/tc.h
>>>>>> +++ b/lib/tc.h
>>>>>> @@ -96,6 +96,7 @@ struct tc_flower_key {
>>>>>>        struct {
>>>>>>            ovs_be32 ipv4_src;
>>>>>>            ovs_be32 ipv4_dst;
>>>>>> +        uint8_t rewrite_ttl;
>>>>>>        } ipv4;
>>>>>>        struct {
>>>>>>            struct in6_addr ipv6_src;
>>>>>> @@ -120,6 +121,14 @@ struct tc_flower {
>>>>>>        uint64_t lastused;
>>>>>>        struct {
>>>>>> +        bool rewrite;
>>>>>> +        struct tc_flower_key key;
>>>>>> +        struct tc_flower_key mask;
>>>>>> +    } rewrite;
>>>>>> +
>>>>>> +    uint32_t csum_update_flags;
>>>>>> +
>>>>>> +    struct {
>>>>>>            bool set;
>>>>>>            ovs_be64 id;
>>>>>>            ovs_be16 tp_src;
>>>>>> @@ -152,6 +161,13 @@ struct tc_flower {
>>>>>>        struct tc_cookie act_cookie;
>>>>>>    };
>>>>>> +/* assert that if we overflow with a masked write of uint32_t to the last byte
>>>>>> + * of flower.rewrite we overflow inside struct flower.
>>>>>> + * shouldn't happen unless someone moves rewrite to the end of flower */
>>>>>> +BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
>>>>>> +                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
>>>>>> +                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
>>>>>> +
>>>>>>    int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
>>>>>>                          struct tc_flower *flower);
>>>>>>    int tc_del_filter(int ifindex, int prio, int handle);
>>>>>> -- 
>>>>>> 2.7.5
>>>>>>
Simon Horman Oct. 31, 2017, 2:29 p.m. UTC | #7
On Tue, Oct 31, 2017 at 09:20:55AM +0200, Paul Blakey wrote:
> 
> 
> On 30/10/2017 15:42, Simon Horman wrote:
> > On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
> > > 
> > > 
> > > On 27/09/2017 12:08, Simon Horman wrote:
> > > > On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
> > > > > 
> > > > > 
> > > > > On 18/09/2017 18:01, Simon Horman wrote:
> > > > > > On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
> > > > > > > From: Paul Blakey <paulb@mellanox.com>
> > > > > > > 
> > > > > > > To be later used to implement ovs action set offloading.
> > > > > > > 
> > > > > > > Signed-off-by: Paul Blakey <paulb@mellanox.com>
> > > > > > > Reviewed-by: Roi Dayan <roid@mellanox.com>
> > > > > > > ---
> > > > > > >    lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > >    lib/tc.h |  16 +++
> > > > > > >    2 files changed, 385 insertions(+), 3 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/lib/tc.c b/lib/tc.c
> > > > > > > index c9cada2..743b2ee 100644
> > > > > > > --- a/lib/tc.c
> > > > > > > +++ b/lib/tc.c
> > > > > > > @@ -21,8 +21,10 @@
> > > > > > >    #include <errno.h>
> > > > > > >    #include <linux/if_ether.h>
> > > > > > >    #include <linux/rtnetlink.h>
> > > > > > > +#include <linux/tc_act/tc_csum.h>
> > > > > > >    #include <linux/tc_act/tc_gact.h>
> > > > > > >    #include <linux/tc_act/tc_mirred.h>
> > > > > > > +#include <linux/tc_act/tc_pedit.h>
> > > > > > >    #include <linux/tc_act/tc_tunnel_key.h>
> > > > > > >    #include <linux/tc_act/tc_vlan.h>
> > > > > > >    #include <linux/gen_stats.h>
> > > > > > > @@ -33,11 +35,14 @@
> > > > > > >    #include "netlink-socket.h"
> > > > > > >    #include "netlink.h"
> > > > > > >    #include "openvswitch/ofpbuf.h"
> > > > > > > +#include "openvswitch/util.h"
> > > > > > >    #include "openvswitch/vlog.h"
> > > > > > >    #include "packets.h"
> > > > > > >    #include "timeval.h"
> > > > > > >    #include "unaligned.h"
> > > > > > > +#define MAX_PEDIT_OFFSETS 8
> > > > > > 
> > > > > > Why 8?
> > > > > We don't expect anything more right now (ipv6 src/dst rewrite requires 8
> > > > > pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
> > > > > that's makes sens. do you suggest we increase it? to what?
> > > > 
> > > > It seems strange to me to place a somewhat arbitrary small limit
> > > > when none exists in the pedit API being used. I would at prefer if
> > > > it was at least a bigger, say 16 or 32.
> > > 
> > > 
> > > Hi Simon,
> > > 
> > > Sorry for the late reply due to holidays and vacations.
> > > Me & Paul going to go over this and do the fixes needed and
> > > also rebase over latest master and run tests again.
> > 
> > Likewise, sorry for not responding earlier (same reason).
> > 
> > > I'll answer what I'm more familiar with now and Paul will continue.
> > > The 8 here is too low and you right. We used this definition
> > > for allocation of the pedit keys on the stack in
> > > nl_msg_put_flower_rewrite_pedits()
> > > 
> > > It was for convenience instead of calculating the maximum possible
> > > keys that could exists and allocating it there and freeing it at
> > > the end.
> > > 
> > > Increasing it to 32 is probably more than enough and wont waste much.
> > 
> > Thanks, that sounds good.
> > 
> > > > > > >    VLOG_DEFINE_THIS_MODULE(tc);
> > > > > > >    static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
> > > > > > > @@ -50,6 +55,82 @@ enum tc_offload_policy {
> > > > > > >    static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
> > > > > > > +struct tc_pedit_key_ex {
> > > > > > > +    enum pedit_header_type htype;
> > > > > > > +    enum pedit_cmd cmd;
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct flower_key_to_pedit {
> > > > > > > +    enum pedit_header_type htype;
> > > > > > > +    int flower_offset;
> > > > > > > +    int offset;
> > > > > > > +    int size;
> > > > > > > +};
> > > > > > > +
> > > > > > > +static struct flower_key_to_pedit flower_pedit_map[] = {
> > > > > > > +    {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> > > > > > > +        12,
> > > > > > > +        offsetof(struct tc_flower_key, ipv4.ipv4_src),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> > > > > > > +        16,
> > > > > > > +        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
> > > > > > > +        8,
> > > > > > > +        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
> > > > > > > +        8,
> > > > > > > +        offsetof(struct tc_flower_key, ipv6.ipv6_src),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
> > > > > > > +        24,
> > > > > > > +        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> > > > > > > +        6,
> > > > > > > +        offsetof(struct tc_flower_key, src_mac),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> > > > > > > +        0,
> > > > > > > +        offsetof(struct tc_flower_key, dst_mac),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
> > > > > > > +        12,
> > > > > > > +        offsetof(struct tc_flower_key, eth_type),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
> > > > > > > +        0,
> > > > > > > +        offsetof(struct tc_flower_key, tcp_src),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
> > > > > > > +        2,
> > > > > > > +        offsetof(struct tc_flower_key, tcp_dst),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
> > > > > > > +        0,
> > > > > > > +        offsetof(struct tc_flower_key, udp_src),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
> > > > > > > +    }, {
> > > > > > > +        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
> > > > > > > +        2,
> > > > > > > +        offsetof(struct tc_flower_key, udp_dst),
> > > > > > > +        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
> > > > > > > +    },
> > > > > > > +};
> > > > > > > +
> > > > > > >    struct tcmsg *
> > > > > > >    tc_make_request(int ifindex, int type, unsigned int flags,
> > > > > > >                    struct ofpbuf *request)
> > > > > > > @@ -365,6 +446,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
> > > > > > >        }
> > > > > > >    }
> > > > > > > +static const struct nl_policy pedit_policy[] = {
> > > > > > > +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
> > > > > > > +                                     .min_len = sizeof(struct tc_pedit),
> > > > > > > +                                     .optional = false, },
> > > > > > > +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
> > > > > > > +                                      .optional = false, },
> > > > > > > +};
> > > > > > > +
> > > > > > > +static int
> > > > > > > +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
> > > > > > > +{
> > > > > > > +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
> > > > > > > +    const struct tc_pedit *pe;
> > > > > > > +    const struct tc_pedit_key *keys;
> > > > > > > +    const struct nlattr *nla, *keys_ex, *ex_type;
> > > > > > > +    const void *keys_attr;
> > > > > > > +    char *rewrite_key = (void *) &flower->rewrite.key;
> > > > > > > +    char *rewrite_mask = (void *) &flower->rewrite.mask;
> > > > > > > +    size_t keys_ex_size, left;
> > > > > > > +    int type, i = 0;
> > > > > > > +
> > > > > > > +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
> > > > > > > +                         ARRAY_SIZE(pedit_policy))) {
> > > > > > > +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
> > > > > > > +        return EPROTO;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
> > > > > > > +    keys = pe->keys;
> > > > > > > +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
> > > > > > > +    keys_ex = nl_attr_get(keys_attr);
> > > > > > > +    keys_ex_size = nl_attr_get_size(keys_attr);
> > > > > > > +
> > > > > > > +    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
> > > > > > > +        if (i >= pe->nkeys) {
> > > > > > > +            break;
> > > > > > > +        }
> > > > > > > +
> > > > > > > +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
> > > > > > > +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
> > > > > > > +            type = nl_attr_get_u16(ex_type);
> > > > > > > +
> > > > > > > +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
> > > > > > > +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
> > > > > > > +                int flower_off = m->flower_offset;
> > > > > > > +                int sz = m->size;
> > > > > > > +                int mf = m->offset;
> > > > > > > +
> > > > > > > +                if (m->htype != type) {
> > > > > > > +                   continue;
> > > > > > > +                }
> > > > > > > +
> > > > > > > +                /* check overlap between current pedit key, which is always
> > > > > > > +                 * 4 bytes (range [off, off + 3]), and a map entry in
> > > > > > > +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
> > > > > > > +                if ((keys->off >= mf && keys->off < mf + sz)
> > > > > > > +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
> > > > > > > +                    int diff = flower_off + (keys->off - mf);
> > > > > > > +                    uint32_t *dst = (void *) (rewrite_key + diff);
> > > > > > > +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
> > > > > > > +                    uint32_t mask = ~(keys->mask);
> > > > > > > +                    uint32_t zero_bits;
> > > > > > > +
> > > > > > > +                    if (keys->off < mf) {
> > > > > > > +                        zero_bits = 8 * (mf - keys->off);
> > > > > > > +                        mask &= UINT32_MAX << zero_bits;
> > > > > > > +                    } else if (keys->off + 4 > mf + m->size) {
> > > > > > > +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
> > > > > > > +                        mask &= UINT32_MAX >> zero_bits;
> > > > > > > +                    }
> > > > > > > +
> > > > > > > +                    *dst_m |= mask;
> > > > > > > +                    *dst |= keys->val & mask;
> > > > > > > +                }
> > > > > > > +            }
> > > > > > 
> > > > > > If I understand the above correctly it is designed to make
> > > > > > pedit actions disjoint. If so, why is that necessary? >
> > > > > 
> > > > > It's not, as a single flower key rewrite can be split to multiple pedit
> > > > > actions it finds the overlap between a flower key and a pedit action, if
> > > > > they do overlap it translates it to the correct offset and masks it out.
> > > > 
> > > > Thanks, understood.
> > > > 
> > > > > 
> > > > > > > +        } else {
> > > > > > > +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
> > > > > > > +                        nl_attr_type(nla));
> > > > > > > +            return EOPNOTSUPP;
> > > > > > > +        }
> > > > > > 
> > > > > > I think the code could exit early here as
> > > > > > nl_msg_put_flower_rewrite_pedits() does below.
> > > > > > 
> > > > > 
> > > > > Sorry, didn't understand. can you give an example?
> > > > > 
> > > > 
> > > > I meant something like this.
> > > > 
> > > >               if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
> > > >                   VLOG_ERR_RL(...);
> > > >                   return EOPNOTSUPP;
> > > >               }
> > > 
> > > understood. we'll do that. thanks.
> > > 
> > > > 
> > > > 	    /* Overlap detection code goes here */
> > > > 
> > > > > > 
> > > > > > > +
> > > > > > > +        keys++;
> > > > > > > +        i++;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    flower->rewrite.rewrite = true;
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > >    static const struct nl_policy tunnel_key_policy[] = {
> > > > > > >        [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
> > > > > > >                                   .min_len = sizeof(struct tc_tunnel_key),
> > > > > > > @@ -608,6 +779,11 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
> > > > > > >            nl_parse_act_vlan(act_options, flower);
> > > > > > >        } else if (!strcmp(act_kind, "tunnel_key")) {
> > > > > > >            nl_parse_act_tunnel_key(act_options, flower);
> > > > > > > +    } else if (!strcmp(act_kind, "pedit")) {
> > > > > > > +        nl_parse_act_pedit(act_options, flower);
> > > > > > > +    } else if (!strcmp(act_kind, "csum")) {
> > > > > > > +        /* not doing anything for now, ovs has an implicit csum recalculation
> > > > > > > +         * with rewriting of packet headers (translating of pedit acts). */
> > > > > > 
> > > > > > I wonder if the absence of a csum action when needed (by TC)
> > > > > > should be treated as an error >
> > > > > 
> > > > > Do you mean validating csum flags from each protocol and such (like in put)?
> > > > 
> > > > Yes, I think so.
> > > > 
> > > > In OvS (without TC acceleration) csum recalculation is implicit
> > > > but with TC an explicit csum update action is required. I see
> > > > in your code you are handling this by adding csum actions to TC actions
> > > > when translating from OvS DPIF actions. And in the reverse (above) csum
> > > > actions are simply ignored.
> > > > 
> > > > What I am wondering is if when translating TC actions to OvS DPIF actions
> > > > you should track if the TC actions should include a csum rule because
> > > > of other actions and treat its absence as an error.
> > > > 
> > > > > 
> > > > > > >        } else {
> > > > > > >            VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
> > > > > > >            return EINVAL;
> > > > > > > @@ -809,6 +985,48 @@ tc_get_tc_cls_policy(enum tc_offload_policy policy)
> > > > > > >    }
> > > > > > >    static void
> > > > > > > +nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
> > > > > > > +{
> > > > > > > +    size_t offset;
> > > > > > > +
> > > > > > > +    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
> > > > > > > +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> > > > > > > +    {
> > > > > > > +        struct tc_csum parm = { .action = TC_ACT_PIPE,
> > > > > > > +                                .update_flags = flags };
> > > > > > > +
> > > > > > > +        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
> > > > > > > +    }
> > > > > > > +    nl_msg_end_nested(request, offset);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
> > > > > > > +                     struct tc_pedit_key_ex *ex)
> > > > > > > +{
> > > > > > > +    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
> > > > > > 
> > > > > > Are there unnecessary () on the line above?
> > > > > No, I'll remove them.
> > > > > 
> > > > > > 
> > > > > > > +    size_t offset, offset_keys_ex, offset_key;
> > > > > > > +    int i;
> > > > > > > +
> > > > > > > +    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
> > > > > > > +    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
> > > > > > > +    {
> > > > > > > +        parm->action = TC_ACT_PIPE;
> > > > > > > +
> > > > > > > +        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
> > > > > > > +        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
> > > > > > > +        for (i = 0; i < parm->nkeys; i++, ex++) {
> > > > > > > +            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
> > > > > > > +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
> > > > > > > +            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
> > > > > > > +            nl_msg_end_nested(request, offset_key);
> > > > > > > +        }
> > > > > > > +        nl_msg_end_nested(request, offset_keys_ex);
> > > > > > > +    }
> > > > > > > +    nl_msg_end_nested(request, offset);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > >    nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
> > > > > > >    {
> > > > > > >        size_t offset;
> > > > > > > @@ -930,7 +1148,127 @@ nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
> > > > > > >        }
> > > > > > >    }
> > > > > > > +/* Given flower, a key_to_pedit map entry, calculates the rest,
> > > > > > > + * where:
> > > > > > > + *
> > > > > > > + * mask, data - pointers of where read the first word of flower->key/mask.
> > > > > > > + * current_offset - which offset to use for the first pedit action.
> > > > > > > + * cnt - max pedits actions to use.
> > > > > > > + * first_word_mask/last_word_mask - the mask to use for the first/last read
> > > > > > > + * (as we read entire words). */
> > > > > > >    static void
> > > > > > > +calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
> > > > > > > +             int *cur_offset, int *cnt, uint32_t *last_word_mask,
> > > > > > > +             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
> > > > > > > +{
> > > > > > > +    int start_offset, max_offset, total_size;
> > > > > > > +    int diff, right_zero_bits, left_zero_bits;
> > > > > > > +    char *rewrite_key = (void *) &flower->rewrite.key;
> > > > > > > +    char *rewrite_mask = (void *) &flower->rewrite.mask;
> > > > > > > +
> > > > > > > +    max_offset = m->offset + m->size;
> > > > > > > +    start_offset = ROUND_DOWN(m->offset, 4);
> > > > > > > +    diff = m->offset - start_offset;
> > > > > > > +    total_size = max_offset - start_offset;
> > > > > > > +    right_zero_bits = 8 * (4 - (max_offset % 4));
> > > > > > > +    left_zero_bits = 8 * (m->offset - start_offset);
> > > > > > > +
> > > > > > > +    *cur_offset = start_offset;
> > > > > > > +    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
> > > > > > > +    *last_word_mask = UINT32_MAX >> right_zero_bits;
> > > > > > > +    *first_word_mask = UINT32_MAX << left_zero_bits;
> > > > > > > +    *data = (void *) (rewrite_key + m->flower_offset - diff);
> > > > > > > +    *mask = (void *) (rewrite_mask + m->flower_offset - diff);
> > > > > > 
> > > > > > The type of *data and *mask is uint32_t *.
> > > > > > Why not cast to that type?
> > > > > 
> > > > > It's to avoid warning of increasing pointer alignment (-Wcast-align).
> > > > 
> > > > It sounds like the warning should be addressed rather than overridden using
> > > > a cast.
> > > > 
> > > > > > 
> > > > > > > +}
> > > > > > > +
> > > > > > > +static inline void
> > > > > > > +csum_update_flag(struct tc_flower *flower,
> > > > > > > +                 enum pedit_header_type htype) {
> > > > > > 
> > > > > > I think the above two lines could be one.
> > > > > > 
> > > > > > > +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
> > > > > > 
> > > > > > A case statement might be nicer here.
> > > > > 
> > > > > Right, thanks.
> > > > > 
> > > > > > 
> > > > > > > +        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
> > > > > > > +    }
> > > > > > > +    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
> > > > > > > +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
> > > > > > > +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
> > > > > > > +        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
> > > > > > > +        if (flower->key.ip_proto == IPPROTO_TCP) {
> > > > > > > +            flower->mask.ip_proto = UINT8_MAX;
> > > > > > 
> > > > > > What if the mask was not UINT8_MAX to start with?
> > > > > > Doesn't this create a different flow?
> > > > > 
> > > > > This is so the checksum will be strict (not setting/recalc udp checksum on
> > > > > non udp packets). It creates a more specific flow, a subset of the original.
> > > > > This is fine as datapath is free to ignore a mask, which is the same as a
> > > > > full mask we've set.
> > > > 
> > > > I'm not sure that I understand why its fine for the datapath to ignore the
> > > > mask.
> > > 
> > > I'll try to explain.
> > > When we want the HW to recalc the csum we need to also do matching on
> > > it. We do that for TCP, UDP, ICMP.
> > 
> > Ok, that makes sense. But I am concerned that the mask of the offloaded
> > flow no longer matches the mask of the flow OvS created. What are the
> > implications of that?
> > 
> 
> Hi, I remember the mask being a suggestion to the datapath (can't find it
> right now) as the datapath might not support the full mask. After the flow
> is generated from a miss upcall it will be put to the datapath and isn't
> saved anywhere, only the flow ufid (hash based on key, mask and some seed,
> saved in ukey map at ofproto upcall...), if I recall correctly. We return
> the same ufid for the more specific flow that we put instead, so it will be
> aged correctly.
> On dump it will show the more specific flow, as its a direct parse of dump
> from the datapath where there is the more specific one and not a saved flow.
> On a miss that could have been caught by the original flow, ovs will
> generate a new UFID, since the key is different. So we will have
> another flow put and again a more specific flow and so on.
> This flows will be disjointed and dumped and aged correctly.

Thanks for the explanation. I don't think I have any outstanding questions
and I'd be happy to see a new version of this patchset.

> 
> 
> > > > > > > +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
> > > > > > > +        } else if (flower->key.ip_proto == IPPROTO_UDP) {
> > > > > > > +            flower->mask.ip_proto = UINT8_MAX;
> > > > > > > +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
> > > > > > > +        } else if (flower->key.ip_proto == IPPROTO_ICMP
> > > > > > > +                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
> > > > > > > +            flower->mask.ip_proto = UINT8_MAX;
> > > > > > > +            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
> > > > > > > +                                 struct tc_flower *flower)
> > > > > > > +{
> > > > > > > +    struct {
> > > > > > > +        struct tc_pedit sel;
> > > > > > > +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
> > > > > > > +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
> > > > > > > +    } sel = {
> > > > > > > +        .sel = {
> > > > > > > +            .nkeys = 0
> > > > > > > +        }
> > > > > > > +    };
> > > > > > > +    int i, j;
> > > > > > > +
> > > > > > > +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
> > > > > > > +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
> > > > > > > +        struct tc_pedit_key *pedit_key = NULL;
> > > > > > > +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
> > > > > > > +        uint32_t *mask, *data, first_word_mask, last_word_mask;
> > > > > > > +        int cnt = 0, cur_offset = 0;
> > > > > > > +
> > > > > > > +        if (!m->size) {
> > > > > > > +            continue;
> > > > > > > +        }
> > > > > > > +
> > > > > > > +        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
> > > > > > > +                     &first_word_mask, &mask, &data);
> > > > > > > +
> > > > > > > +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
> > > > > > > +            uint32_t mask_word = *mask;
> > > > > > > +
> > > > > > > +            if (j == 0) {
> > > > > > > +                mask_word &= first_word_mask;
> > > > > > > +            }
> > > > > > > +            if (j == cnt - 1) {
> > > > > > > +                mask_word &= last_word_mask;
> > > > > > > +            }
> > > > > > > +            if (!mask_word) {
> > > > > > > +                continue;
> > > > > > > +            }
> > > > > > > +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
> > > > > > > +                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
> > > > > > > +                             MAX_PEDIT_OFFSETS);
> > > > > > > +                return EOPNOTSUPP;
> > > > > > > +            }
> > > > > > > +
> > > > > > > +            pedit_key = &sel.keys[sel.sel.nkeys];
> > > > > > > +            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
> > > > > > > +            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
> > > > > > > +            pedit_key_ex->htype = m->htype;
> > > > > > > +            pedit_key->off = cur_offset;
> > > > > > > +            pedit_key->mask = ~mask_word;
> > > > > > > +            pedit_key->val = *data & mask_word;
> > > > > > > +            sel.sel.nkeys++;
> > > > > > > +            csum_update_flag(flower, m->htype);
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > >    nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> > > > > > >    {
> > > > > > >        size_t offset;
> > > > > > > @@ -939,7 +1277,20 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> > > > > > >        offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
> > > > > > >        {
> > > > > > >            uint16_t act_index = 1;
> > > > > > > +        int error;
> > > > > > > +        if (flower->rewrite.rewrite) {
> > > > > > > +            act_offset = nl_msg_start_nested(request, act_index++);
> > > > > > > +            error = nl_msg_put_flower_rewrite_pedits(request, flower);
> > > > > > > +            if (error) {
> > > > > > > +                return error;
> > > > > > > +            }
> > > > > > > +            nl_msg_end_nested(request, act_offset);
> > > > > > > +
> > > > > > > +            act_offset = nl_msg_start_nested(request, act_index++);
> > > > > > > +            nl_msg_put_act_csum(request, flower->csum_update_flags);
> > > > > > > +            nl_msg_end_nested(request, act_offset);
> > > > > > > +        }
> > > > > > >            if (flower->set.set) {
> > > > > > >                act_offset = nl_msg_start_nested(request, act_index++);
> > > > > > >                nl_msg_put_act_tunnel_key_set(request, flower->set.id,
> > > > > > > @@ -980,6 +1331,8 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
> > > > > > >            }
> > > > > > >        }
> > > > > > >        nl_msg_end_nested(request, offset);
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > >    }
> > > > > > >    static void
> > > > > > > @@ -1021,11 +1374,19 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
> > > > > > >        nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
> > > > > > >                                &flower->mask.member, sizeof flower->key.member)
> > > > > > > -static void
> > > > > > > +static int
> > > > > > >    nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
> > > > > > >    {
> > > > > > > +
> > > > > > >        uint16_t host_eth_type = ntohs(flower->key.eth_type);
> > > > > > >        bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
> > > > > > > +    int err;
> > > > > > > +
> > > > > > > +    /* need to parse acts first as some acts require changing the matching */
> > > > > > 
> > > > > > This seems strange to me.
> > > > > 
> > > > > from the strict (normalized?) header checksum.
> > > > 
> > > > I think this code relates to setting flower->key.ip_proto == IPPROTO_TCP
> > > > above. Is that correct. Are there any other cases?
> > > 
> > > yes and also for UDP and ICMP. as explained above.
> > > So going over the acts in header rewrite case we might want to signal the HW
> > > recalc csum and we need matching on those fields that require new
> > > csum so the matching mask being updated. this is why we parse the acts
> > > first.
> > 
> > Thanks, understood.
> > 
> > > > > > > +    err  = nl_msg_put_flower_acts(request, flower);
> > > > > > > +    if (err) {
> > > > > > > +        return err;
> > > > > > > +    }
> > > > > > >        if (is_vlan) {
> > > > > > >            host_eth_type = ntohs(flower->key.encap_eth_type);
> > > > > > > @@ -1083,7 +1444,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
> > > > > > >            nl_msg_put_flower_tunnel(request, flower);
> > > > > > >        }
> > > > > > > -    nl_msg_put_flower_acts(request, flower);
> > > > > > > +    return 0;
> > > > > > >    }
> > > > > > >    int
> > > > > > > @@ -1106,7 +1467,12 @@ tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
> > > > > > >        nl_msg_put_string(&request, TCA_KIND, "flower");
> > > > > > >        basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
> > > > > > >        {
> > > > > > > -        nl_msg_put_flower_options(&request, flower);
> > > > > > > +        error = nl_msg_put_flower_options(&request, flower);
> > > > > > > +
> > > > > > > +        if (error) {
> > > > > > > +            ofpbuf_uninit(&request);
> > > > > > > +            return error;
> > > > > > > +        }
> > > > > > >        }
> > > > > > >        nl_msg_end_nested(&request, basic_offset);
> > > > > > > diff --git a/lib/tc.h b/lib/tc.h
> > > > > > > index 6c69b79..7876051 100644
> > > > > > > --- a/lib/tc.h
> > > > > > > +++ b/lib/tc.h
> > > > > > > @@ -96,6 +96,7 @@ struct tc_flower_key {
> > > > > > >        struct {
> > > > > > >            ovs_be32 ipv4_src;
> > > > > > >            ovs_be32 ipv4_dst;
> > > > > > > +        uint8_t rewrite_ttl;
> > > > > > >        } ipv4;
> > > > > > >        struct {
> > > > > > >            struct in6_addr ipv6_src;
> > > > > > > @@ -120,6 +121,14 @@ struct tc_flower {
> > > > > > >        uint64_t lastused;
> > > > > > >        struct {
> > > > > > > +        bool rewrite;
> > > > > > > +        struct tc_flower_key key;
> > > > > > > +        struct tc_flower_key mask;
> > > > > > > +    } rewrite;
> > > > > > > +
> > > > > > > +    uint32_t csum_update_flags;
> > > > > > > +
> > > > > > > +    struct {
> > > > > > >            bool set;
> > > > > > >            ovs_be64 id;
> > > > > > >            ovs_be16 tp_src;
> > > > > > > @@ -152,6 +161,13 @@ struct tc_flower {
> > > > > > >        struct tc_cookie act_cookie;
> > > > > > >    };
> > > > > > > +/* assert that if we overflow with a masked write of uint32_t to the last byte
> > > > > > > + * of flower.rewrite we overflow inside struct flower.
> > > > > > > + * shouldn't happen unless someone moves rewrite to the end of flower */
> > > > > > > +BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
> > > > > > > +                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
> > > > > > > +                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
> > > > > > > +
> > > > > > >    int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
> > > > > > >                          struct tc_flower *flower);
> > > > > > >    int tc_del_filter(int ifindex, int prio, int handle);
> > > > > > > -- 
> > > > > > > 2.7.5
> > > > > > >
Simon Horman Nov. 16, 2017, 4:29 p.m. UTC | #8
On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
> 
> 
> On 27/09/2017 12:08, Simon Horman wrote:
> > On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
> > > 
> > > 
> > > On 18/09/2017 18:01, Simon Horman wrote:
> > > > On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
> > > > > From: Paul Blakey <paulb@mellanox.com>
> > > > > 
> > > > > To be later used to implement ovs action set offloading.
> > > > > 
> > > > > Signed-off-by: Paul Blakey <paulb@mellanox.com>
> > > > > Reviewed-by: Roi Dayan <roid@mellanox.com>
> > > > > ---
> > > > >   lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > >   lib/tc.h |  16 +++
> > > > >   2 files changed, 385 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/lib/tc.c b/lib/tc.c
> > > > > index c9cada2..743b2ee 100644
> > > > > --- a/lib/tc.c
> > > > > +++ b/lib/tc.c
> > > > > @@ -21,8 +21,10 @@
> > > > >   #include <errno.h>
> > > > >   #include <linux/if_ether.h>
> > > > >   #include <linux/rtnetlink.h>
> > > > > +#include <linux/tc_act/tc_csum.h>
> > > > >   #include <linux/tc_act/tc_gact.h>
> > > > >   #include <linux/tc_act/tc_mirred.h>
> > > > > +#include <linux/tc_act/tc_pedit.h>
> > > > >   #include <linux/tc_act/tc_tunnel_key.h>
> > > > >   #include <linux/tc_act/tc_vlan.h>
> > > > >   #include <linux/gen_stats.h>
> > > > > @@ -33,11 +35,14 @@
> > > > >   #include "netlink-socket.h"
> > > > >   #include "netlink.h"
> > > > >   #include "openvswitch/ofpbuf.h"
> > > > > +#include "openvswitch/util.h"
> > > > >   #include "openvswitch/vlog.h"
> > > > >   #include "packets.h"
> > > > >   #include "timeval.h"
> > > > >   #include "unaligned.h"
> > > > > +#define MAX_PEDIT_OFFSETS 8
> > > > 
> > > > Why 8?
> > > We don't expect anything more right now (ipv6 src/dst rewrite requires 8
> > > pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
> > > that's makes sens. do you suggest we increase it? to what?
> > 
> > It seems strange to me to place a somewhat arbitrary small limit
> > when none exists in the pedit API being used. I would at prefer if
> > it was at least a bigger, say 16 or 32.
> 

> Hi Simon,
> 
> Sorry for the late reply due to holidays and vacations.
> Me & Paul going to go over this and do the fixes needed and
> also rebase over latest master and run tests again.
> 
> I'll answer what I'm more familiar with now and Paul will continue.
> The 8 here is too low and you right. We used this definition
> for allocation of the pedit keys on the stack in
> nl_msg_put_flower_rewrite_pedits()
> 
> It was for convenience instead of calculating the maximum possible
> keys that could exists and allocating it there and freeing it at
> the end.
> 
> Increasing it to 32 is probably more than enough and wont waste much.

I updated the value to 32 when applying the patch.

...

> > > > If I understand the above correctly it is designed to make
> > > > pedit actions disjoint. If so, why is that necessary? >
> > > 
> > > It's not, as a single flower key rewrite can be split to multiple pedit
> > > actions it finds the overlap between a flower key and a pedit action, if
> > > they do overlap it translates it to the correct offset and masks it out.
> > 
> > Thanks, understood.
> > 
> > > 
> > > > > +        } else {
> > > > > +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
> > > > > +                        nl_attr_type(nla));
> > > > > +            return EOPNOTSUPP;
> > > > > +        }
> > > > 
> > > > I think the code could exit early here as
> > > > nl_msg_put_flower_rewrite_pedits() does below.
> > > > 
> > > 
> > > Sorry, didn't understand. can you give an example?
> > > 
> > 
> > I meant something like this.
> > 
> >              if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
> >                  VLOG_ERR_RL(...);
> >                  return EOPNOTSUPP;
> >              }
> 
> understood. we'll do that. thanks.

I also fixed this when applying the patch.

...
Roi Dayan Nov. 19, 2017, 6:45 a.m. UTC | #9
On 16/11/2017 18:29, Simon Horman wrote:
> On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
>>
>>
>> On 27/09/2017 12:08, Simon Horman wrote:
>>> On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
>>>>
>>>>
>>>> On 18/09/2017 18:01, Simon Horman wrote:
>>>>> On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>
>>>>>> To be later used to implement ovs action set offloading.
>>>>>>
>>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>>>>>> ---
>>>>>>    lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>>    lib/tc.h |  16 +++
>>>>>>    2 files changed, 385 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/lib/tc.c b/lib/tc.c
>>>>>> index c9cada2..743b2ee 100644
>>>>>> --- a/lib/tc.c
>>>>>> +++ b/lib/tc.c
>>>>>> @@ -21,8 +21,10 @@
>>>>>>    #include <errno.h>
>>>>>>    #include <linux/if_ether.h>
>>>>>>    #include <linux/rtnetlink.h>
>>>>>> +#include <linux/tc_act/tc_csum.h>
>>>>>>    #include <linux/tc_act/tc_gact.h>
>>>>>>    #include <linux/tc_act/tc_mirred.h>
>>>>>> +#include <linux/tc_act/tc_pedit.h>
>>>>>>    #include <linux/tc_act/tc_tunnel_key.h>
>>>>>>    #include <linux/tc_act/tc_vlan.h>
>>>>>>    #include <linux/gen_stats.h>
>>>>>> @@ -33,11 +35,14 @@
>>>>>>    #include "netlink-socket.h"
>>>>>>    #include "netlink.h"
>>>>>>    #include "openvswitch/ofpbuf.h"
>>>>>> +#include "openvswitch/util.h"
>>>>>>    #include "openvswitch/vlog.h"
>>>>>>    #include "packets.h"
>>>>>>    #include "timeval.h"
>>>>>>    #include "unaligned.h"
>>>>>> +#define MAX_PEDIT_OFFSETS 8
>>>>>
>>>>> Why 8?
>>>> We don't expect anything more right now (ipv6 src/dst rewrite requires 8
>>>> pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
>>>> that's makes sens. do you suggest we increase it? to what?
>>>
>>> It seems strange to me to place a somewhat arbitrary small limit
>>> when none exists in the pedit API being used. I would at prefer if
>>> it was at least a bigger, say 16 or 32.
>>
> 
>> Hi Simon,
>>
>> Sorry for the late reply due to holidays and vacations.
>> Me & Paul going to go over this and do the fixes needed and
>> also rebase over latest master and run tests again.
>>
>> I'll answer what I'm more familiar with now and Paul will continue.
>> The 8 here is too low and you right. We used this definition
>> for allocation of the pedit keys on the stack in
>> nl_msg_put_flower_rewrite_pedits()
>>
>> It was for convenience instead of calculating the maximum possible
>> keys that could exists and allocating it there and freeing it at
>> the end.
>>
>> Increasing it to 32 is probably more than enough and wont waste much.
> 
> I updated the value to 32 when applying the patch.
> 
> ...
> 
>>>>> If I understand the above correctly it is designed to make
>>>>> pedit actions disjoint. If so, why is that necessary? >
>>>>
>>>> It's not, as a single flower key rewrite can be split to multiple pedit
>>>> actions it finds the overlap between a flower key and a pedit action, if
>>>> they do overlap it translates it to the correct offset and masks it out.
>>>
>>> Thanks, understood.
>>>
>>>>
>>>>>> +        } else {
>>>>>> +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
>>>>>> +                        nl_attr_type(nla));
>>>>>> +            return EOPNOTSUPP;
>>>>>> +        }
>>>>>
>>>>> I think the code could exit early here as
>>>>> nl_msg_put_flower_rewrite_pedits() does below.
>>>>>
>>>>
>>>> Sorry, didn't understand. can you give an example?
>>>>
>>>
>>> I meant something like this.
>>>
>>>               if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
>>>                   VLOG_ERR_RL(...);
>>>                   return EOPNOTSUPP;
>>>               }
>>
>> understood. we'll do that. thanks.
> 
> I also fixed this when applying the patch.
> 
> ...
> 


Thanks Simon. sorry we were not responsive enough with this feature.
We'll improve that.
Simon Horman Nov. 20, 2017, 12:30 p.m. UTC | #10
On Sun, Nov 19, 2017 at 08:45:19AM +0200, Roi Dayan wrote:
> 
> 
> On 16/11/2017 18:29, Simon Horman wrote:
> > On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
> > > 
> > > 
> > > On 27/09/2017 12:08, Simon Horman wrote:
> > > > On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
> > > > > 
> > > > > 
> > > > > On 18/09/2017 18:01, Simon Horman wrote:
> > > > > > On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
> > > > > > > From: Paul Blakey <paulb@mellanox.com>
> > > > > > > 
> > > > > > > To be later used to implement ovs action set offloading.
> > > > > > > 
> > > > > > > Signed-off-by: Paul Blakey <paulb@mellanox.com>
> > > > > > > Reviewed-by: Roi Dayan <roid@mellanox.com>
> > > > > > > ---
> > > > > > >    lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > >    lib/tc.h |  16 +++
> > > > > > >    2 files changed, 385 insertions(+), 3 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/lib/tc.c b/lib/tc.c
> > > > > > > index c9cada2..743b2ee 100644
> > > > > > > --- a/lib/tc.c
> > > > > > > +++ b/lib/tc.c
> > > > > > > @@ -21,8 +21,10 @@
> > > > > > >    #include <errno.h>
> > > > > > >    #include <linux/if_ether.h>
> > > > > > >    #include <linux/rtnetlink.h>
> > > > > > > +#include <linux/tc_act/tc_csum.h>
> > > > > > >    #include <linux/tc_act/tc_gact.h>
> > > > > > >    #include <linux/tc_act/tc_mirred.h>
> > > > > > > +#include <linux/tc_act/tc_pedit.h>
> > > > > > >    #include <linux/tc_act/tc_tunnel_key.h>
> > > > > > >    #include <linux/tc_act/tc_vlan.h>
> > > > > > >    #include <linux/gen_stats.h>
> > > > > > > @@ -33,11 +35,14 @@
> > > > > > >    #include "netlink-socket.h"
> > > > > > >    #include "netlink.h"
> > > > > > >    #include "openvswitch/ofpbuf.h"
> > > > > > > +#include "openvswitch/util.h"
> > > > > > >    #include "openvswitch/vlog.h"
> > > > > > >    #include "packets.h"
> > > > > > >    #include "timeval.h"
> > > > > > >    #include "unaligned.h"
> > > > > > > +#define MAX_PEDIT_OFFSETS 8
> > > > > > 
> > > > > > Why 8?
> > > > > We don't expect anything more right now (ipv6 src/dst rewrite requires 8
> > > > > pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
> > > > > that's makes sens. do you suggest we increase it? to what?
> > > > 
> > > > It seems strange to me to place a somewhat arbitrary small limit
> > > > when none exists in the pedit API being used. I would at prefer if
> > > > it was at least a bigger, say 16 or 32.
> > > 
> > 
> > > Hi Simon,
> > > 
> > > Sorry for the late reply due to holidays and vacations.
> > > Me & Paul going to go over this and do the fixes needed and
> > > also rebase over latest master and run tests again.
> > > 
> > > I'll answer what I'm more familiar with now and Paul will continue.
> > > The 8 here is too low and you right. We used this definition
> > > for allocation of the pedit keys on the stack in
> > > nl_msg_put_flower_rewrite_pedits()
> > > 
> > > It was for convenience instead of calculating the maximum possible
> > > keys that could exists and allocating it there and freeing it at
> > > the end.
> > > 
> > > Increasing it to 32 is probably more than enough and wont waste much.
> > 
> > I updated the value to 32 when applying the patch.
> > 
> > ...
> > 
> > > > > > If I understand the above correctly it is designed to make
> > > > > > pedit actions disjoint. If so, why is that necessary? >
> > > > > 
> > > > > It's not, as a single flower key rewrite can be split to multiple pedit
> > > > > actions it finds the overlap between a flower key and a pedit action, if
> > > > > they do overlap it translates it to the correct offset and masks it out.
> > > > 
> > > > Thanks, understood.
> > > > 
> > > > > 
> > > > > > > +        } else {
> > > > > > > +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
> > > > > > > +                        nl_attr_type(nla));
> > > > > > > +            return EOPNOTSUPP;
> > > > > > > +        }
> > > > > > 
> > > > > > I think the code could exit early here as
> > > > > > nl_msg_put_flower_rewrite_pedits() does below.
> > > > > > 
> > > > > 
> > > > > Sorry, didn't understand. can you give an example?
> > > > > 
> > > > 
> > > > I meant something like this.
> > > > 
> > > >               if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
> > > >                   VLOG_ERR_RL(...);
> > > >                   return EOPNOTSUPP;
> > > >               }
> > > 
> > > understood. we'll do that. thanks.
> > 
> > I also fixed this when applying the patch.
> > 
> > ...
> > 
> 
> 
> Thanks Simon. sorry we were not responsive enough with this feature.
> We'll improve that.
> 

No problem.

Could you work on a fix for the travis failure that Eric Garver flagged?
Roi Dayan Nov. 21, 2017, 5:50 a.m. UTC | #11
On 20/11/2017 14:30, Simon Horman wrote:
> On Sun, Nov 19, 2017 at 08:45:19AM +0200, Roi Dayan wrote:
>>
>>
>> On 16/11/2017 18:29, Simon Horman wrote:
>>> On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
>>>>
>>>>
>>>> On 27/09/2017 12:08, Simon Horman wrote:
>>>>> On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
>>>>>>
>>>>>>
>>>>>> On 18/09/2017 18:01, Simon Horman wrote:
>>>>>>> On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
>>>>>>>> From: Paul Blakey <paulb@mellanox.com>
>>>>>>>>
>>>>>>>> To be later used to implement ovs action set offloading.
>>>>>>>>
>>>>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>>>>> Reviewed-by: Roi Dayan <roid@mellanox.com>
>>>>>>>> ---
>>>>>>>>     lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>>>>     lib/tc.h |  16 +++
>>>>>>>>     2 files changed, 385 insertions(+), 3 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/lib/tc.c b/lib/tc.c
>>>>>>>> index c9cada2..743b2ee 100644
>>>>>>>> --- a/lib/tc.c
>>>>>>>> +++ b/lib/tc.c
>>>>>>>> @@ -21,8 +21,10 @@
>>>>>>>>     #include <errno.h>
>>>>>>>>     #include <linux/if_ether.h>
>>>>>>>>     #include <linux/rtnetlink.h>
>>>>>>>> +#include <linux/tc_act/tc_csum.h>
>>>>>>>>     #include <linux/tc_act/tc_gact.h>
>>>>>>>>     #include <linux/tc_act/tc_mirred.h>
>>>>>>>> +#include <linux/tc_act/tc_pedit.h>
>>>>>>>>     #include <linux/tc_act/tc_tunnel_key.h>
>>>>>>>>     #include <linux/tc_act/tc_vlan.h>
>>>>>>>>     #include <linux/gen_stats.h>
>>>>>>>> @@ -33,11 +35,14 @@
>>>>>>>>     #include "netlink-socket.h"
>>>>>>>>     #include "netlink.h"
>>>>>>>>     #include "openvswitch/ofpbuf.h"
>>>>>>>> +#include "openvswitch/util.h"
>>>>>>>>     #include "openvswitch/vlog.h"
>>>>>>>>     #include "packets.h"
>>>>>>>>     #include "timeval.h"
>>>>>>>>     #include "unaligned.h"
>>>>>>>> +#define MAX_PEDIT_OFFSETS 8
>>>>>>>
>>>>>>> Why 8?
>>>>>> We don't expect anything more right now (ipv6 src/dst rewrite requires 8
>>>>>> pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
>>>>>> that's makes sens. do you suggest we increase it? to what?
>>>>>
>>>>> It seems strange to me to place a somewhat arbitrary small limit
>>>>> when none exists in the pedit API being used. I would at prefer if
>>>>> it was at least a bigger, say 16 or 32.
>>>>
>>>
>>>> Hi Simon,
>>>>
>>>> Sorry for the late reply due to holidays and vacations.
>>>> Me & Paul going to go over this and do the fixes needed and
>>>> also rebase over latest master and run tests again.
>>>>
>>>> I'll answer what I'm more familiar with now and Paul will continue.
>>>> The 8 here is too low and you right. We used this definition
>>>> for allocation of the pedit keys on the stack in
>>>> nl_msg_put_flower_rewrite_pedits()
>>>>
>>>> It was for convenience instead of calculating the maximum possible
>>>> keys that could exists and allocating it there and freeing it at
>>>> the end.
>>>>
>>>> Increasing it to 32 is probably more than enough and wont waste much.
>>>
>>> I updated the value to 32 when applying the patch.
>>>
>>> ...
>>>
>>>>>>> If I understand the above correctly it is designed to make
>>>>>>> pedit actions disjoint. If so, why is that necessary? >
>>>>>>
>>>>>> It's not, as a single flower key rewrite can be split to multiple pedit
>>>>>> actions it finds the overlap between a flower key and a pedit action, if
>>>>>> they do overlap it translates it to the correct offset and masks it out.
>>>>>
>>>>> Thanks, understood.
>>>>>
>>>>>>
>>>>>>>> +        } else {
>>>>>>>> +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
>>>>>>>> +                        nl_attr_type(nla));
>>>>>>>> +            return EOPNOTSUPP;
>>>>>>>> +        }
>>>>>>>
>>>>>>> I think the code could exit early here as
>>>>>>> nl_msg_put_flower_rewrite_pedits() does below.
>>>>>>>
>>>>>>
>>>>>> Sorry, didn't understand. can you give an example?
>>>>>>
>>>>>
>>>>> I meant something like this.
>>>>>
>>>>>                if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
>>>>>                    VLOG_ERR_RL(...);
>>>>>                    return EOPNOTSUPP;
>>>>>                }
>>>>
>>>> understood. we'll do that. thanks.
>>>
>>> I also fixed this when applying the patch.
>>>
>>> ...
>>>
>>
>>
>> Thanks Simon. sorry we were not responsive enough with this feature.
>> We'll improve that.
>>
> 
> No problem.
> 
> Could you work on a fix for the travis failure that Eric Garver flagged?
> 

yes we have a fix. we did more fixes and running some tests. we'll push
the fixes today.
Simon Horman Nov. 21, 2017, 9:23 a.m. UTC | #12
On Tue, Nov 21, 2017 at 07:50:24AM +0200, Roi Dayan wrote:
> 
> 
> On 20/11/2017 14:30, Simon Horman wrote:
> > On Sun, Nov 19, 2017 at 08:45:19AM +0200, Roi Dayan wrote:
> > > 
> > > 
> > > On 16/11/2017 18:29, Simon Horman wrote:
> > > > On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
> > > > > 
> > > > > 
> > > > > On 27/09/2017 12:08, Simon Horman wrote:
> > > > > > On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > On 18/09/2017 18:01, Simon Horman wrote:
> > > > > > > > On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
> > > > > > > > > From: Paul Blakey <paulb@mellanox.com>
> > > > > > > > > 
> > > > > > > > > To be later used to implement ovs action set offloading.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Paul Blakey <paulb@mellanox.com>
> > > > > > > > > Reviewed-by: Roi Dayan <roid@mellanox.com>
> > > > > > > > > ---
> > > > > > > > >     lib/tc.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > > > >     lib/tc.h |  16 +++
> > > > > > > > >     2 files changed, 385 insertions(+), 3 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/lib/tc.c b/lib/tc.c
> > > > > > > > > index c9cada2..743b2ee 100644
> > > > > > > > > --- a/lib/tc.c
> > > > > > > > > +++ b/lib/tc.c
> > > > > > > > > @@ -21,8 +21,10 @@
> > > > > > > > >     #include <errno.h>
> > > > > > > > >     #include <linux/if_ether.h>
> > > > > > > > >     #include <linux/rtnetlink.h>
> > > > > > > > > +#include <linux/tc_act/tc_csum.h>
> > > > > > > > >     #include <linux/tc_act/tc_gact.h>
> > > > > > > > >     #include <linux/tc_act/tc_mirred.h>
> > > > > > > > > +#include <linux/tc_act/tc_pedit.h>
> > > > > > > > >     #include <linux/tc_act/tc_tunnel_key.h>
> > > > > > > > >     #include <linux/tc_act/tc_vlan.h>
> > > > > > > > >     #include <linux/gen_stats.h>
> > > > > > > > > @@ -33,11 +35,14 @@
> > > > > > > > >     #include "netlink-socket.h"
> > > > > > > > >     #include "netlink.h"
> > > > > > > > >     #include "openvswitch/ofpbuf.h"
> > > > > > > > > +#include "openvswitch/util.h"
> > > > > > > > >     #include "openvswitch/vlog.h"
> > > > > > > > >     #include "packets.h"
> > > > > > > > >     #include "timeval.h"
> > > > > > > > >     #include "unaligned.h"
> > > > > > > > > +#define MAX_PEDIT_OFFSETS 8
> > > > > > > > 
> > > > > > > > Why 8?
> > > > > > > We don't expect anything more right now (ipv6 src/dst rewrite requires 8
> > > > > > > pedits iirc). I can't think of a larger use case, maybe ipv6 + macs if
> > > > > > > that's makes sens. do you suggest we increase it? to what?
> > > > > > 
> > > > > > It seems strange to me to place a somewhat arbitrary small limit
> > > > > > when none exists in the pedit API being used. I would at prefer if
> > > > > > it was at least a bigger, say 16 or 32.
> > > > > 
> > > > 
> > > > > Hi Simon,
> > > > > 
> > > > > Sorry for the late reply due to holidays and vacations.
> > > > > Me & Paul going to go over this and do the fixes needed and
> > > > > also rebase over latest master and run tests again.
> > > > > 
> > > > > I'll answer what I'm more familiar with now and Paul will continue.
> > > > > The 8 here is too low and you right. We used this definition
> > > > > for allocation of the pedit keys on the stack in
> > > > > nl_msg_put_flower_rewrite_pedits()
> > > > > 
> > > > > It was for convenience instead of calculating the maximum possible
> > > > > keys that could exists and allocating it there and freeing it at
> > > > > the end.
> > > > > 
> > > > > Increasing it to 32 is probably more than enough and wont waste much.
> > > > 
> > > > I updated the value to 32 when applying the patch.
> > > > 
> > > > ...
> > > > 
> > > > > > > > If I understand the above correctly it is designed to make
> > > > > > > > pedit actions disjoint. If so, why is that necessary? >
> > > > > > > 
> > > > > > > It's not, as a single flower key rewrite can be split to multiple pedit
> > > > > > > actions it finds the overlap between a flower key and a pedit action, if
> > > > > > > they do overlap it translates it to the correct offset and masks it out.
> > > > > > 
> > > > > > Thanks, understood.
> > > > > > 
> > > > > > > 
> > > > > > > > > +        } else {
> > > > > > > > > +            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
> > > > > > > > > +                        nl_attr_type(nla));
> > > > > > > > > +            return EOPNOTSUPP;
> > > > > > > > > +        }
> > > > > > > > 
> > > > > > > > I think the code could exit early here as
> > > > > > > > nl_msg_put_flower_rewrite_pedits() does below.
> > > > > > > > 
> > > > > > > 
> > > > > > > Sorry, didn't understand. can you give an example?
> > > > > > > 
> > > > > > 
> > > > > > I meant something like this.
> > > > > > 
> > > > > >                if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
> > > > > >                    VLOG_ERR_RL(...);
> > > > > >                    return EOPNOTSUPP;
> > > > > >                }
> > > > > 
> > > > > understood. we'll do that. thanks.
> > > > 
> > > > I also fixed this when applying the patch.
> > > > 
> > > > ...
> > > > 
> > > 
> > > 
> > > Thanks Simon. sorry we were not responsive enough with this feature.
> > > We'll improve that.
> > > 
> > 
> > No problem.
> > 
> > Could you work on a fix for the travis failure that Eric Garver flagged?
> > 
> 
> yes we have a fix. we did more fixes and running some tests. we'll push
> the fixes today.

Excellent, please CC me.
diff mbox series

Patch

diff --git a/lib/tc.c b/lib/tc.c
index c9cada2..743b2ee 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -21,8 +21,10 @@ 
 #include <errno.h>
 #include <linux/if_ether.h>
 #include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_csum.h>
 #include <linux/tc_act/tc_gact.h>
 #include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_pedit.h>
 #include <linux/tc_act/tc_tunnel_key.h>
 #include <linux/tc_act/tc_vlan.h>
 #include <linux/gen_stats.h>
@@ -33,11 +35,14 @@ 
 #include "netlink-socket.h"
 #include "netlink.h"
 #include "openvswitch/ofpbuf.h"
+#include "openvswitch/util.h"
 #include "openvswitch/vlog.h"
 #include "packets.h"
 #include "timeval.h"
 #include "unaligned.h"
 
+#define MAX_PEDIT_OFFSETS 8
+
 VLOG_DEFINE_THIS_MODULE(tc);
 
 static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
@@ -50,6 +55,82 @@  enum tc_offload_policy {
 
 static enum tc_offload_policy tc_policy = TC_POLICY_NONE;
 
+struct tc_pedit_key_ex {
+    enum pedit_header_type htype;
+    enum pedit_cmd cmd;
+};
+
+struct flower_key_to_pedit {
+    enum pedit_header_type htype;
+    int flower_offset;
+    int offset;
+    int size;
+};
+
+static struct flower_key_to_pedit flower_pedit_map[] = {
+    {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        12,
+        offsetof(struct tc_flower_key, ipv4.ipv4_src),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_src)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        16,
+        offsetof(struct tc_flower_key, ipv4.ipv4_dst),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.ipv4_dst)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4,
+        8,
+        offsetof(struct tc_flower_key, ipv4.rewrite_ttl),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv4.rewrite_ttl)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
+        8,
+        offsetof(struct tc_flower_key, ipv6.ipv6_src),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_src)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6,
+        24,
+        offsetof(struct tc_flower_key, ipv6.ipv6_dst),
+        MEMBER_SIZEOF(struct tc_flower_key, ipv6.ipv6_dst)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        6,
+        offsetof(struct tc_flower_key, src_mac),
+        MEMBER_SIZEOF(struct tc_flower_key, src_mac)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        0,
+        offsetof(struct tc_flower_key, dst_mac),
+        MEMBER_SIZEOF(struct tc_flower_key, dst_mac)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH,
+        12,
+        offsetof(struct tc_flower_key, eth_type),
+        MEMBER_SIZEOF(struct tc_flower_key, eth_type)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
+        0,
+        offsetof(struct tc_flower_key, tcp_src),
+        MEMBER_SIZEOF(struct tc_flower_key, tcp_src)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP,
+        2,
+        offsetof(struct tc_flower_key, tcp_dst),
+        MEMBER_SIZEOF(struct tc_flower_key, tcp_dst)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
+        0,
+        offsetof(struct tc_flower_key, udp_src),
+        MEMBER_SIZEOF(struct tc_flower_key, udp_src)
+    }, {
+        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP,
+        2,
+        offsetof(struct tc_flower_key, udp_dst),
+        MEMBER_SIZEOF(struct tc_flower_key, udp_dst)
+    },
+};
+
 struct tcmsg *
 tc_make_request(int ifindex, int type, unsigned int flags,
                 struct ofpbuf *request)
@@ -365,6 +446,96 @@  nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) {
     }
 }
 
+static const struct nl_policy pedit_policy[] = {
+            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
+                                     .min_len = sizeof(struct tc_pedit),
+                                     .optional = false, },
+            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
+                                      .optional = false, },
+};
+
+static int
+nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
+{
+    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
+    const struct tc_pedit *pe;
+    const struct tc_pedit_key *keys;
+    const struct nlattr *nla, *keys_ex, *ex_type;
+    const void *keys_attr;
+    char *rewrite_key = (void *) &flower->rewrite.key;
+    char *rewrite_mask = (void *) &flower->rewrite.mask;
+    size_t keys_ex_size, left;
+    int type, i = 0;
+
+    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
+                         ARRAY_SIZE(pedit_policy))) {
+        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
+        return EPROTO;
+    }
+
+    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
+    keys = pe->keys;
+    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
+    keys_ex = nl_attr_get(keys_attr);
+    keys_ex_size = nl_attr_get_size(keys_attr);
+
+    NL_ATTR_FOR_EACH (nla, left, keys_ex, keys_ex_size) {
+        if (i >= pe->nkeys) {
+            break;
+        }
+
+        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
+            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
+            type = nl_attr_get_u16(ex_type);
+
+            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
+                struct flower_key_to_pedit *m = &flower_pedit_map[j];
+                int flower_off = m->flower_offset;
+                int sz = m->size;
+                int mf = m->offset;
+
+                if (m->htype != type) {
+                   continue;
+                }
+
+                /* check overlap between current pedit key, which is always
+                 * 4 bytes (range [off, off + 3]), and a map entry in
+                 * flower_pedit_map (range [mf, mf + sz - 1]) */
+                if ((keys->off >= mf && keys->off < mf + sz)
+                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz)) {
+                    int diff = flower_off + (keys->off - mf);
+                    uint32_t *dst = (void *) (rewrite_key + diff);
+                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
+                    uint32_t mask = ~(keys->mask);
+                    uint32_t zero_bits;
+
+                    if (keys->off < mf) {
+                        zero_bits = 8 * (mf - keys->off);
+                        mask &= UINT32_MAX << zero_bits;
+                    } else if (keys->off + 4 > mf + m->size) {
+                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
+                        mask &= UINT32_MAX >> zero_bits;
+                    }
+
+                    *dst_m |= mask;
+                    *dst |= keys->val & mask;
+                }
+            }
+        } else {
+            VLOG_ERR_RL(&error_rl, "unable to parse legacy pedit type: %d",
+                        nl_attr_type(nla));
+            return EOPNOTSUPP;
+        }
+
+        keys++;
+        i++;
+    }
+
+    flower->rewrite.rewrite = true;
+
+    return 0;
+}
+
 static const struct nl_policy tunnel_key_policy[] = {
     [TCA_TUNNEL_KEY_PARMS] = { .type = NL_A_UNSPEC,
                                .min_len = sizeof(struct tc_tunnel_key),
@@ -608,6 +779,11 @@  nl_parse_single_action(struct nlattr *action, struct tc_flower *flower)
         nl_parse_act_vlan(act_options, flower);
     } else if (!strcmp(act_kind, "tunnel_key")) {
         nl_parse_act_tunnel_key(act_options, flower);
+    } else if (!strcmp(act_kind, "pedit")) {
+        nl_parse_act_pedit(act_options, flower);
+    } else if (!strcmp(act_kind, "csum")) {
+        /* not doing anything for now, ovs has an implicit csum recalculation
+         * with rewriting of packet headers (translating of pedit acts). */
     } else {
         VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind);
         return EINVAL;
@@ -809,6 +985,48 @@  tc_get_tc_cls_policy(enum tc_offload_policy policy)
 }
 
 static void
+nl_msg_put_act_csum(struct ofpbuf *request, uint32_t flags)
+{
+    size_t offset;
+
+    nl_msg_put_string(request, TCA_ACT_KIND, "csum");
+    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
+    {
+        struct tc_csum parm = { .action = TC_ACT_PIPE,
+                                .update_flags = flags };
+
+        nl_msg_put_unspec(request, TCA_CSUM_PARMS, &parm, sizeof parm);
+    }
+    nl_msg_end_nested(request, offset);
+}
+
+static void
+nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
+                     struct tc_pedit_key_ex *ex)
+{
+    size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
+    size_t offset, offset_keys_ex, offset_key;
+    int i;
+
+    nl_msg_put_string(request, TCA_ACT_KIND, "pedit");
+    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
+    {
+        parm->action = TC_ACT_PIPE;
+
+        nl_msg_put_unspec(request, TCA_PEDIT_PARMS_EX, parm, ksize);
+        offset_keys_ex = nl_msg_start_nested(request, TCA_PEDIT_KEYS_EX);
+        for (i = 0; i < parm->nkeys; i++, ex++) {
+            offset_key = nl_msg_start_nested(request, TCA_PEDIT_KEY_EX);
+            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_HTYPE, ex->htype);
+            nl_msg_put_u16(request, TCA_PEDIT_KEY_EX_CMD, ex->cmd);
+            nl_msg_end_nested(request, offset_key);
+        }
+        nl_msg_end_nested(request, offset_keys_ex);
+    }
+    nl_msg_end_nested(request, offset);
+}
+
+static void
 nl_msg_put_act_push_vlan(struct ofpbuf *request, uint16_t vid, uint8_t prio)
 {
     size_t offset;
@@ -930,7 +1148,127 @@  nl_msg_put_act_cookie(struct ofpbuf *request, struct tc_cookie *ck) {
     }
 }
 
+/* Given flower, a key_to_pedit map entry, calculates the rest,
+ * where:
+ *
+ * mask, data - pointers of where read the first word of flower->key/mask.
+ * current_offset - which offset to use for the first pedit action.
+ * cnt - max pedits actions to use.
+ * first_word_mask/last_word_mask - the mask to use for the first/last read
+ * (as we read entire words). */
 static void
+calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m,
+             int *cur_offset, int *cnt, uint32_t *last_word_mask,
+             uint32_t *first_word_mask, uint32_t **mask, uint32_t **data)
+{
+    int start_offset, max_offset, total_size;
+    int diff, right_zero_bits, left_zero_bits;
+    char *rewrite_key = (void *) &flower->rewrite.key;
+    char *rewrite_mask = (void *) &flower->rewrite.mask;
+
+    max_offset = m->offset + m->size;
+    start_offset = ROUND_DOWN(m->offset, 4);
+    diff = m->offset - start_offset;
+    total_size = max_offset - start_offset;
+    right_zero_bits = 8 * (4 - (max_offset % 4));
+    left_zero_bits = 8 * (m->offset - start_offset);
+
+    *cur_offset = start_offset;
+    *cnt = (total_size / 4) + (total_size % 4 ? 1 : 0);
+    *last_word_mask = UINT32_MAX >> right_zero_bits;
+    *first_word_mask = UINT32_MAX << left_zero_bits;
+    *data = (void *) (rewrite_key + m->flower_offset - diff);
+    *mask = (void *) (rewrite_mask + m->flower_offset - diff);
+}
+
+static inline void
+csum_update_flag(struct tc_flower *flower,
+                 enum pedit_header_type htype) {
+    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
+        flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_IPV4HDR;
+    }
+    if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_TCP
+        || htype == TCA_PEDIT_KEY_EX_HDR_TYPE_UDP) {
+        if (flower->key.ip_proto == IPPROTO_TCP) {
+            flower->mask.ip_proto = UINT8_MAX;
+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
+        } else if (flower->key.ip_proto == IPPROTO_UDP) {
+            flower->mask.ip_proto = UINT8_MAX;
+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
+        } else if (flower->key.ip_proto == IPPROTO_ICMP
+                   || flower->key.ip_proto == IPPROTO_ICMPV6) {
+            flower->mask.ip_proto = UINT8_MAX;
+            flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP;
+        }
+    }
+}
+
+static int
+nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
+                                 struct tc_flower *flower)
+{
+    struct {
+        struct tc_pedit sel;
+        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
+        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
+    } sel = {
+        .sel = {
+            .nkeys = 0
+        }
+    };
+    int i, j;
+
+    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
+        struct flower_key_to_pedit *m = &flower_pedit_map[i];
+        struct tc_pedit_key *pedit_key = NULL;
+        struct tc_pedit_key_ex *pedit_key_ex = NULL;
+        uint32_t *mask, *data, first_word_mask, last_word_mask;
+        int cnt = 0, cur_offset = 0;
+
+        if (!m->size) {
+            continue;
+        }
+
+        calc_offsets(flower, m, &cur_offset, &cnt, &last_word_mask,
+                     &first_word_mask, &mask, &data);
+
+        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
+            uint32_t mask_word = *mask;
+
+            if (j == 0) {
+                mask_word &= first_word_mask;
+            }
+            if (j == cnt - 1) {
+                mask_word &= last_word_mask;
+            }
+            if (!mask_word) {
+                continue;
+            }
+            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
+                VLOG_WARN_RL(&error_rl, "reached too many pedit offsets: %d",
+                             MAX_PEDIT_OFFSETS);
+                return EOPNOTSUPP;
+            }
+
+            pedit_key = &sel.keys[sel.sel.nkeys];
+            pedit_key_ex = &sel.keys_ex[sel.sel.nkeys];
+            pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET;
+            pedit_key_ex->htype = m->htype;
+            pedit_key->off = cur_offset;
+            pedit_key->mask = ~mask_word;
+            pedit_key->val = *data & mask_word;
+            sel.sel.nkeys++;
+            csum_update_flag(flower, m->htype);
+        }
+    }
+    nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex);
+
+    return 0;
+}
+
+static int
 nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
 {
     size_t offset;
@@ -939,7 +1277,20 @@  nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
     offset = nl_msg_start_nested(request, TCA_FLOWER_ACT);
     {
         uint16_t act_index = 1;
+        int error;
 
+        if (flower->rewrite.rewrite) {
+            act_offset = nl_msg_start_nested(request, act_index++);
+            error = nl_msg_put_flower_rewrite_pedits(request, flower);
+            if (error) {
+                return error;
+            }
+            nl_msg_end_nested(request, act_offset);
+
+            act_offset = nl_msg_start_nested(request, act_index++);
+            nl_msg_put_act_csum(request, flower->csum_update_flags);
+            nl_msg_end_nested(request, act_offset);
+        }
         if (flower->set.set) {
             act_offset = nl_msg_start_nested(request, act_index++);
             nl_msg_put_act_tunnel_key_set(request, flower->set.id,
@@ -980,6 +1331,8 @@  nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower)
         }
     }
     nl_msg_end_nested(request, offset);
+
+    return 0;
 }
 
 static void
@@ -1021,11 +1374,19 @@  nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower)
     nl_msg_put_masked_value(request, type, type##_MASK, &flower->key.member, \
                             &flower->mask.member, sizeof flower->key.member)
 
-static void
+static int
 nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
 {
+
     uint16_t host_eth_type = ntohs(flower->key.eth_type);
     bool is_vlan = (host_eth_type == ETH_TYPE_VLAN);
+    int err;
+
+    /* need to parse acts first as some acts require changing the matching */
+    err  = nl_msg_put_flower_acts(request, flower);
+    if (err) {
+        return err;
+    }
 
     if (is_vlan) {
         host_eth_type = ntohs(flower->key.encap_eth_type);
@@ -1083,7 +1444,7 @@  nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
         nl_msg_put_flower_tunnel(request, flower);
     }
 
-    nl_msg_put_flower_acts(request, flower);
+    return 0;
 }
 
 int
@@ -1106,7 +1467,12 @@  tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
     nl_msg_put_string(&request, TCA_KIND, "flower");
     basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
     {
-        nl_msg_put_flower_options(&request, flower);
+        error = nl_msg_put_flower_options(&request, flower);
+
+        if (error) {
+            ofpbuf_uninit(&request);
+            return error;
+        }
     }
     nl_msg_end_nested(&request, basic_offset);
 
diff --git a/lib/tc.h b/lib/tc.h
index 6c69b79..7876051 100644
--- a/lib/tc.h
+++ b/lib/tc.h
@@ -96,6 +96,7 @@  struct tc_flower_key {
     struct {
         ovs_be32 ipv4_src;
         ovs_be32 ipv4_dst;
+        uint8_t rewrite_ttl;
     } ipv4;
     struct {
         struct in6_addr ipv6_src;
@@ -120,6 +121,14 @@  struct tc_flower {
     uint64_t lastused;
 
     struct {
+        bool rewrite;
+        struct tc_flower_key key;
+        struct tc_flower_key mask;
+    } rewrite;
+
+    uint32_t csum_update_flags;
+
+    struct {
         bool set;
         ovs_be64 id;
         ovs_be16 tp_src;
@@ -152,6 +161,13 @@  struct tc_flower {
     struct tc_cookie act_cookie;
 };
 
+/* assert that if we overflow with a masked write of uint32_t to the last byte
+ * of flower.rewrite we overflow inside struct flower.
+ * shouldn't happen unless someone moves rewrite to the end of flower */
+BUILD_ASSERT_DECL(offsetof(struct tc_flower, rewrite)
+                  + MEMBER_SIZEOF(struct tc_flower, rewrite)
+                  + sizeof(uint32_t) - 2 < sizeof(struct tc_flower));
+
 int tc_replace_flower(int ifindex, uint16_t prio, uint32_t handle,
                       struct tc_flower *flower);
 int tc_del_filter(int ifindex, int prio, int handle);