diff mbox series

[ovs-dev,net-next,v3] net: openvswitch: add hash info to upcall

Message ID 1573571327-6906-1-git-send-email-xiangxia.m.yue@gmail.com
State Awaiting Upstream
Headers show
Series [ovs-dev,net-next,v3] net: openvswitch: add hash info to upcall | expand

Commit Message

Tonghao Zhang Nov. 12, 2019, 3:08 p.m. UTC
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

When using the kernel datapath, the upcall doesn't
include the related skb hash info. That introduces
some problems, because the skb hash is important
in the kernel stack. For example, the VXLAN module uses
it to select the UDP src port. The tx queue selection
in the stack may also use the hash.

The hash is computed in different ways. It is random
for a TCP socket, and it may be computed in hardware
or in the software stack. Recalculating the hash is not easy.

Hash of TCP socket is computed:
  tcp_v4_connect
    -> sk_set_txhash (is random)

  __tcp_transmit_skb
    -> skb_set_hash_from_sk

For the first packet of a TCP session there will be one
upcall to ovs-vswitchd, without the skb hash information.
The remaining packets are processed in the Open vSwitch
module, with the hash kept. If this TCP session is forwarded
to the VXLAN module, then the UDP src port of the first TCP
packet differs from that of the remaining packets.

TCP packets may come to Open vSwitch from the host or from Docker containers.
To fix this, we store the hash info in the upcall and restore the hash
when the packet is sent back.

+---------------+          +-------------------------+
|   Docker/VMs  |          |     ovs-vswitchd        |
+----+----------+          +-+--------------------+--+
     |                       ^                    |
     |                       |                    |
     |                       |  upcall            v restore packet hash (not recalculate)
     |                     +-+--------------------+--+
     |  tap netdev         |                         |   vxlan module
     +--------------->     +-->  Open vSwitch ko     +-->
       or internal type    |                         |
                           +-------------------------+

Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
v3:
* add enum ovs_pkt_hash_types
* avoid calling skb_get_hash_raw twice.
* explain why we should fix this problem.
---
 include/uapi/linux/openvswitch.h |  2 ++
 net/openvswitch/datapath.c       | 30 +++++++++++++++++++++++++++++-
 net/openvswitch/datapath.h       | 12 ++++++++++++
 3 files changed, 43 insertions(+), 1 deletion(-)

Comments

Gregory Rose Nov. 12, 2019, 6:06 p.m. UTC | #1
On 11/12/2019 7:08 AM, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> When using the kernel datapath, the upcall don't
> include skb hash info relatived. That will introduce
> some problem, because the hash of skb is important
> in kernel stack. For example, VXLAN module uses
> it to select UDP src port. The tx queue selection
> may also use the hash in stack.
>
> Hash is computed in different ways. Hash is random
> for a TCP socket, and hash may be computed in hardware,
> or software stack. Recalculation hash is not easy.
>
> Hash of TCP socket is computed:
>    tcp_v4_connect
>      -> sk_set_txhash (is random)
>
>    __tcp_transmit_skb
>      -> skb_set_hash_from_sk
>
> There will be one upcall, without information of skb
> hash, to ovs-vswitchd, for the first packet of a TCP
> session. The rest packets will be processed in Open vSwitch
> modules, hash kept. If this tcp session is forward to
> VXLAN module, then the UDP src port of first tcp packet
> is different from rest packets.
>
> TCP packets may come from the host or dockers, to Open vSwitch.
> To fix it, we store the hash info to upcall, and restore hash
> when packets sent back.
>
> +---------------+          +-------------------------+
> |   Docker/VMs  |          |     ovs-vswitchd        |
> +----+----------+          +-+--------------------+--+
>       |                       ^                    |
>       |                       |                    |
>       |                       |  upcall            v restore packet hash (not recalculate)
>       |                     +-+--------------------+--+
>       |  tap netdev         |                         |   vxlan module
>       +--------------->     +-->  Open vSwitch ko     +-->
>         or internal type    |                         |
>                             +-------------------------+
>
> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> ---
> v3:
> * add enum ovs_pkt_hash_types
> * avoid duplicate call the skb_get_hash_raw.
> * explain why we should fix this problem.
> ---
>   include/uapi/linux/openvswitch.h |  2 ++
>   net/openvswitch/datapath.c       | 30 +++++++++++++++++++++++++++++-
>   net/openvswitch/datapath.h       | 12 ++++++++++++
>   3 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> index 1887a451c388..e65407c1f366 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -170,6 +170,7 @@ enum ovs_packet_cmd {
>    * output port is actually a tunnel port. Contains the output tunnel key
>    * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
>    * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
> + * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb).
>    * @OVS_PACKET_ATTR_LEN: Packet size before truncation.
>    * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
>    * size.
> @@ -190,6 +191,7 @@ enum ovs_packet_attr {
>   	OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
>   				       error logging should be suppressed. */
>   	OVS_PACKET_ATTR_MRU,	    /* Maximum received IP fragment size. */
> +	OVS_PACKET_ATTR_HASH,	    /* Packet hash. */
>   	OVS_PACKET_ATTR_LEN,		/* Packet size before truncation. */
>   	__OVS_PACKET_ATTR_MAX
>   };

Why do you add the new enum before the last entry OVS_PACKET_ATTR_LEN 
instead of
just adding it to the end of the list?

Just curious.

- Greg

> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 2088619c03f0..b556cf62b77c 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -350,7 +350,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
>   	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
>   		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
>   		+ nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
> -		+ nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
> +		+ nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */
> +		+ nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */
>   
>   	/* OVS_PACKET_ATTR_USERDATA */
>   	if (upcall_info->userdata)
> @@ -393,6 +394,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
>   	size_t len;
>   	unsigned int hlen;
>   	int err, dp_ifindex;
> +	u64 hash;
>   
>   	dp_ifindex = get_dpifindex(dp);
>   	if (!dp_ifindex)
> @@ -504,6 +506,23 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
>   		pad_packet(dp, user_skb);
>   	}
>   
> +	hash = skb_get_hash_raw(skb);
> +	if (hash) {
> +		if (skb->sw_hash)
> +			hash |= OVS_PACKET_HASH_SW_BIT;
> +
> +		if (skb->l4_hash)
> +			hash |= OVS_PACKET_HASH_L4_BIT;
> +
> +		if (nla_put(user_skb, OVS_PACKET_ATTR_HASH,
> +			    sizeof (u64), &hash)) {
> +			err = -ENOBUFS;
> +			goto out;
> +		}
> +
> +		pad_packet(dp, user_skb);
> +	}
> +
>   	/* Only reserve room for attribute header, packet data is added
>   	 * in skb_zerocopy() */
>   	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
> @@ -543,6 +562,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>   	struct datapath *dp;
>   	struct vport *input_vport;
>   	u16 mru = 0;
> +	u64 hash;
>   	int len;
>   	int err;
>   	bool log = !a[OVS_PACKET_ATTR_PROBE];
> @@ -568,6 +588,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>   	}
>   	OVS_CB(packet)->mru = mru;
>   
> +	if (a[OVS_PACKET_ATTR_HASH]) {
> +		hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]);
> +
> +		__skb_set_hash(packet, hash & 0xFFFFFFFFULL,
> +			       !!(hash & OVS_PACKET_HASH_SW_BIT),
> +			       !!(hash & OVS_PACKET_HASH_L4_BIT));
> +	}
> +
>   	/* Build an sw_flow for sending this packet. */
>   	flow = ovs_flow_alloc();
>   	err = PTR_ERR(flow);
> diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
> index 81e85dde8217..e239a46c2f94 100644
> --- a/net/openvswitch/datapath.h
> +++ b/net/openvswitch/datapath.h
> @@ -139,6 +139,18 @@ struct ovs_net {
>   	bool xt_label;
>   };
>   
> +/**
> + * enum ovs_pkt_hash_types - hash info to include with a packet
> + * to send to userspace.
> + * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack.
> + * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash
> + * over transport ports.
> + */
> +enum ovs_pkt_hash_types {
> +	OVS_PACKET_HASH_SW_BIT = (1ULL << 32),
> +	OVS_PACKET_HASH_L4_BIT = (1ULL << 33),
> +};
> +
>   extern unsigned int ovs_net_id;
>   void ovs_lock(void);
>   void ovs_unlock(void);
Tonghao Zhang Nov. 12, 2019, 6:39 p.m. UTC | #2
On Wed, Nov 13, 2019 at 2:06 AM Gregory Rose <gvrose8192@gmail.com> wrote:
>
>
> On 11/12/2019 7:08 AM, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > When using the kernel datapath, the upcall don't
> > include skb hash info relatived. That will introduce
> > some problem, because the hash of skb is important
> > in kernel stack. For example, VXLAN module uses
> > it to select UDP src port. The tx queue selection
> > may also use the hash in stack.
> >
> > Hash is computed in different ways. Hash is random
> > for a TCP socket, and hash may be computed in hardware,
> > or software stack. Recalculation hash is not easy.
> >
> > Hash of TCP socket is computed:
> >    tcp_v4_connect
> >      -> sk_set_txhash (is random)
> >
> >    __tcp_transmit_skb
> >      -> skb_set_hash_from_sk
> >
> > There will be one upcall, without information of skb
> > hash, to ovs-vswitchd, for the first packet of a TCP
> > session. The rest packets will be processed in Open vSwitch
> > modules, hash kept. If this tcp session is forward to
> > VXLAN module, then the UDP src port of first tcp packet
> > is different from rest packets.
> >
> > TCP packets may come from the host or dockers, to Open vSwitch.
> > To fix it, we store the hash info to upcall, and restore hash
> > when packets sent back.
> >
> > +---------------+          +-------------------------+
> > |   Docker/VMs  |          |     ovs-vswitchd        |
> > +----+----------+          +-+--------------------+--+
> >       |                       ^                    |
> >       |                       |                    |
> >       |                       |  upcall            v restore packet hash (not recalculate)
> >       |                     +-+--------------------+--+
> >       |  tap netdev         |                         |   vxlan module
> >       +--------------->     +-->  Open vSwitch ko     +-->
> >         or internal type    |                         |
> >                             +-------------------------+
> >
> > Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
> > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > ---
> > v3:
> > * add enum ovs_pkt_hash_types
> > * avoid duplicate call the skb_get_hash_raw.
> > * explain why we should fix this problem.
> > ---
> >   include/uapi/linux/openvswitch.h |  2 ++
> >   net/openvswitch/datapath.c       | 30 +++++++++++++++++++++++++++++-
> >   net/openvswitch/datapath.h       | 12 ++++++++++++
> >   3 files changed, 43 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> > index 1887a451c388..e65407c1f366 100644
> > --- a/include/uapi/linux/openvswitch.h
> > +++ b/include/uapi/linux/openvswitch.h
> > @@ -170,6 +170,7 @@ enum ovs_packet_cmd {
> >    * output port is actually a tunnel port. Contains the output tunnel key
> >    * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
> >    * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
> > + * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb).
> >    * @OVS_PACKET_ATTR_LEN: Packet size before truncation.
> >    * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
> >    * size.
> > @@ -190,6 +191,7 @@ enum ovs_packet_attr {
> >       OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
> >                                      error logging should be suppressed. */
> >       OVS_PACKET_ATTR_MRU,        /* Maximum received IP fragment size. */
> > +     OVS_PACKET_ATTR_HASH,       /* Packet hash. */
> >       OVS_PACKET_ATTR_LEN,            /* Packet size before truncation. */
> >       __OVS_PACKET_ATTR_MAX
> >   };
>
> Why do you add the new enum before the last entry OVS_PACKET_ATTR_LEN
> instead of
> just adding it to the end of the list?
It should be at the end of the list, but I ran "make check" without a patch
for ovs-vswitchd,
and there was no error. Is there no OVS_PACKET_ATTR_LEN test case?
I will change the order of the enum in the next version, thanks.
> Just curious.
>
> - Greg
>
> > diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> > index 2088619c03f0..b556cf62b77c 100644
> > --- a/net/openvswitch/datapath.c
> > +++ b/net/openvswitch/datapath.c
> > @@ -350,7 +350,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
> >       size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
> >               + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
> >               + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
> > -             + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
> > +             + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */
> > +             + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */
> >
> >       /* OVS_PACKET_ATTR_USERDATA */
> >       if (upcall_info->userdata)
> > @@ -393,6 +394,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
> >       size_t len;
> >       unsigned int hlen;
> >       int err, dp_ifindex;
> > +     u64 hash;
> >
> >       dp_ifindex = get_dpifindex(dp);
> >       if (!dp_ifindex)
> > @@ -504,6 +506,23 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
> >               pad_packet(dp, user_skb);
> >       }
> >
> > +     hash = skb_get_hash_raw(skb);
> > +     if (hash) {
> > +             if (skb->sw_hash)
> > +                     hash |= OVS_PACKET_HASH_SW_BIT;
> > +
> > +             if (skb->l4_hash)
> > +                     hash |= OVS_PACKET_HASH_L4_BIT;
> > +
> > +             if (nla_put(user_skb, OVS_PACKET_ATTR_HASH,
> > +                         sizeof (u64), &hash)) {
> > +                     err = -ENOBUFS;
> > +                     goto out;
> > +             }
> > +
> > +             pad_packet(dp, user_skb);
> > +     }
> > +
> >       /* Only reserve room for attribute header, packet data is added
> >        * in skb_zerocopy() */
> >       if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
> > @@ -543,6 +562,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
> >       struct datapath *dp;
> >       struct vport *input_vport;
> >       u16 mru = 0;
> > +     u64 hash;
> >       int len;
> >       int err;
> >       bool log = !a[OVS_PACKET_ATTR_PROBE];
> > @@ -568,6 +588,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
> >       }
> >       OVS_CB(packet)->mru = mru;
> >
> > +     if (a[OVS_PACKET_ATTR_HASH]) {
> > +             hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]);
> > +
> > +             __skb_set_hash(packet, hash & 0xFFFFFFFFULL,
> > +                            !!(hash & OVS_PACKET_HASH_SW_BIT),
> > +                            !!(hash & OVS_PACKET_HASH_L4_BIT));
> > +     }
> > +
> >       /* Build an sw_flow for sending this packet. */
> >       flow = ovs_flow_alloc();
> >       err = PTR_ERR(flow);
> > diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
> > index 81e85dde8217..e239a46c2f94 100644
> > --- a/net/openvswitch/datapath.h
> > +++ b/net/openvswitch/datapath.h
> > @@ -139,6 +139,18 @@ struct ovs_net {
> >       bool xt_label;
> >   };
> >
> > +/**
> > + * enum ovs_pkt_hash_types - hash info to include with a packet
> > + * to send to userspace.
> > + * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack.
> > + * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash
> > + * over transport ports.
> > + */
> > +enum ovs_pkt_hash_types {
> > +     OVS_PACKET_HASH_SW_BIT = (1ULL << 32),
> > +     OVS_PACKET_HASH_L4_BIT = (1ULL << 33),
> > +};
> > +
> >   extern unsigned int ovs_net_id;
> >   void ovs_lock(void);
> >   void ovs_unlock(void);
>
Pravin Shelar Nov. 13, 2019, 4:54 a.m. UTC | #3
On Tue, Nov 12, 2019 at 7:09 AM <xiangxia.m.yue@gmail.com> wrote:
>
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> When using the kernel datapath, the upcall don't
> include skb hash info relatived. That will introduce
> some problem, because the hash of skb is important
> in kernel stack. For example, VXLAN module uses
> it to select UDP src port. The tx queue selection
> may also use the hash in stack.
>
> Hash is computed in different ways. Hash is random
> for a TCP socket, and hash may be computed in hardware,
> or software stack. Recalculation hash is not easy.
>
> Hash of TCP socket is computed:
>   tcp_v4_connect
>     -> sk_set_txhash (is random)
>
>   __tcp_transmit_skb
>     -> skb_set_hash_from_sk
>
> There will be one upcall, without information of skb
> hash, to ovs-vswitchd, for the first packet of a TCP
> session. The rest packets will be processed in Open vSwitch
> modules, hash kept. If this tcp session is forward to
> VXLAN module, then the UDP src port of first tcp packet
> is different from rest packets.
>
> TCP packets may come from the host or dockers, to Open vSwitch.
> To fix it, we store the hash info to upcall, and restore hash
> when packets sent back.
>
> +---------------+          +-------------------------+
> |   Docker/VMs  |          |     ovs-vswitchd        |
> +----+----------+          +-+--------------------+--+
>      |                       ^                    |
>      |                       |                    |
>      |                       |  upcall            v restore packet hash (not recalculate)
>      |                     +-+--------------------+--+
>      |  tap netdev         |                         |   vxlan module
>      +--------------->     +-->  Open vSwitch ko     +-->
>        or internal type    |                         |
>                            +-------------------------+
>
> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> ---
> v3:
> * add enum ovs_pkt_hash_types
> * avoid duplicate call the skb_get_hash_raw.
> * explain why we should fix this problem.
> ---
>  include/uapi/linux/openvswitch.h |  2 ++
>  net/openvswitch/datapath.c       | 30 +++++++++++++++++++++++++++++-
>  net/openvswitch/datapath.h       | 12 ++++++++++++
>  3 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> index 1887a451c388..e65407c1f366 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -170,6 +170,7 @@ enum ovs_packet_cmd {
>   * output port is actually a tunnel port. Contains the output tunnel key
>   * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
>   * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
> + * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb).
>   * @OVS_PACKET_ATTR_LEN: Packet size before truncation.
>   * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
>   * size.
> @@ -190,6 +191,7 @@ enum ovs_packet_attr {
>         OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
>                                        error logging should be suppressed. */
>         OVS_PACKET_ATTR_MRU,        /* Maximum received IP fragment size. */
> +       OVS_PACKET_ATTR_HASH,       /* Packet hash. */
>         OVS_PACKET_ATTR_LEN,            /* Packet size before truncation. */
>         __OVS_PACKET_ATTR_MAX
>  };
I agree with Greg; the values of existing enums cannot be changed in UAPI.

> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 2088619c03f0..b556cf62b77c 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -350,7 +350,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
>         size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
>                 + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
>                 + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
> -               + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
> +               + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */
> +               + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */
>
>         /* OVS_PACKET_ATTR_USERDATA */
>         if (upcall_info->userdata)
> @@ -393,6 +394,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
>         size_t len;
>         unsigned int hlen;
>         int err, dp_ifindex;
> +       u64 hash;
>
>         dp_ifindex = get_dpifindex(dp);
>         if (!dp_ifindex)
> @@ -504,6 +506,23 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
>                 pad_packet(dp, user_skb);
>         }
>
> +       hash = skb_get_hash_raw(skb);
> +       if (hash) {
Zero is a valid skb hash. Due to this check, packets with a zero
hash would not get the same VXLAN source port number. This patch should
solve the issue for all values of the skb hash.




> +               if (skb->sw_hash)
> +                       hash |= OVS_PACKET_HASH_SW_BIT;
> +
> +               if (skb->l4_hash)
> +                       hash |= OVS_PACKET_HASH_L4_BIT;
> +
> +               if (nla_put(user_skb, OVS_PACKET_ATTR_HASH,
> +                           sizeof (u64), &hash)) {
> +                       err = -ENOBUFS;
> +                       goto out;
> +               }
> +
> +               pad_packet(dp, user_skb);
> +       }
> +
>         /* Only reserve room for attribute header, packet data is added
>          * in skb_zerocopy() */
>         if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
> @@ -543,6 +562,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>         struct datapath *dp;
>         struct vport *input_vport;
>         u16 mru = 0;
> +       u64 hash;
>         int len;
>         int err;
>         bool log = !a[OVS_PACKET_ATTR_PROBE];
> @@ -568,6 +588,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>         }
>         OVS_CB(packet)->mru = mru;
>
> +       if (a[OVS_PACKET_ATTR_HASH]) {
> +               hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]);
> +
> +               __skb_set_hash(packet, hash & 0xFFFFFFFFULL,
> +                              !!(hash & OVS_PACKET_HASH_SW_BIT),
> +                              !!(hash & OVS_PACKET_HASH_L4_BIT));
> +       }
> +
>         /* Build an sw_flow for sending this packet. */
>         flow = ovs_flow_alloc();
>         err = PTR_ERR(flow);
> diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
> index 81e85dde8217..e239a46c2f94 100644
> --- a/net/openvswitch/datapath.h
> +++ b/net/openvswitch/datapath.h
> @@ -139,6 +139,18 @@ struct ovs_net {
>         bool xt_label;
>  };
>
> +/**
> + * enum ovs_pkt_hash_types - hash info to include with a packet
> + * to send to userspace.
> + * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack.
> + * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash
> + * over transport ports.
> + */
> +enum ovs_pkt_hash_types {
> +       OVS_PACKET_HASH_SW_BIT = (1ULL << 32),
> +       OVS_PACKET_HASH_L4_BIT = (1ULL << 33),
> +};
> +


>  extern unsigned int ovs_net_id;
>  void ovs_lock(void);
>  void ovs_unlock(void);
> --
> 2.23.0
>
Tonghao Zhang Nov. 13, 2019, 6:34 a.m. UTC | #4
On Wed, Nov 13, 2019 at 12:54 PM Pravin Shelar <pshelar@ovn.org> wrote:
>
> On Tue, Nov 12, 2019 at 7:09 AM <xiangxia.m.yue@gmail.com> wrote:
> >
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > When using the kernel datapath, the upcall don't
> > include skb hash info relatived. That will introduce
> > some problem, because the hash of skb is important
> > in kernel stack. For example, VXLAN module uses
> > it to select UDP src port. The tx queue selection
> > may also use the hash in stack.
> >
> > Hash is computed in different ways. Hash is random
> > for a TCP socket, and hash may be computed in hardware,
> > or software stack. Recalculation hash is not easy.
> >
> > Hash of TCP socket is computed:
> >   tcp_v4_connect
> >     -> sk_set_txhash (is random)
> >
> >   __tcp_transmit_skb
> >     -> skb_set_hash_from_sk
> >
> > There will be one upcall, without information of skb
> > hash, to ovs-vswitchd, for the first packet of a TCP
> > session. The rest packets will be processed in Open vSwitch
> > modules, hash kept. If this tcp session is forward to
> > VXLAN module, then the UDP src port of first tcp packet
> > is different from rest packets.
> >
> > TCP packets may come from the host or dockers, to Open vSwitch.
> > To fix it, we store the hash info to upcall, and restore hash
> > when packets sent back.
> >
> > +---------------+          +-------------------------+
> > |   Docker/VMs  |          |     ovs-vswitchd        |
> > +----+----------+          +-+--------------------+--+
> >      |                       ^                    |
> >      |                       |                    |
> >      |                       |  upcall            v restore packet hash (not recalculate)
> >      |                     +-+--------------------+--+
> >      |  tap netdev         |                         |   vxlan module
> >      +--------------->     +-->  Open vSwitch ko     +-->
> >        or internal type    |                         |
> >                            +-------------------------+
> >
> > Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
> > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > ---
> > v3:
> > * add enum ovs_pkt_hash_types
> > * avoid duplicate call the skb_get_hash_raw.
> > * explain why we should fix this problem.
> > ---
> >  include/uapi/linux/openvswitch.h |  2 ++
> >  net/openvswitch/datapath.c       | 30 +++++++++++++++++++++++++++++-
> >  net/openvswitch/datapath.h       | 12 ++++++++++++
> >  3 files changed, 43 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> > index 1887a451c388..e65407c1f366 100644
> > --- a/include/uapi/linux/openvswitch.h
> > +++ b/include/uapi/linux/openvswitch.h
> > @@ -170,6 +170,7 @@ enum ovs_packet_cmd {
> >   * output port is actually a tunnel port. Contains the output tunnel key
> >   * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
> >   * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
> > + * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb).
> >   * @OVS_PACKET_ATTR_LEN: Packet size before truncation.
> >   * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
> >   * size.
> > @@ -190,6 +191,7 @@ enum ovs_packet_attr {
> >         OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
> >                                        error logging should be suppressed. */
> >         OVS_PACKET_ATTR_MRU,        /* Maximum received IP fragment size. */
> > +       OVS_PACKET_ATTR_HASH,       /* Packet hash. */
> >         OVS_PACKET_ATTR_LEN,            /* Packet size before truncation. */
> >         __OVS_PACKET_ATTR_MAX
> >  };
> I agree with Greg, value of existing enums can not be changed in UAPI.
>
> > diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> > index 2088619c03f0..b556cf62b77c 100644
> > --- a/net/openvswitch/datapath.c
> > +++ b/net/openvswitch/datapath.c
> > @@ -350,7 +350,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
> >         size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
> >                 + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
> >                 + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
> > -               + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
> > +               + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */
> > +               + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */
> >
> >         /* OVS_PACKET_ATTR_USERDATA */
> >         if (upcall_info->userdata)
> > @@ -393,6 +394,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
> >         size_t len;
> >         unsigned int hlen;
> >         int err, dp_ifindex;
> > +       u64 hash;
> >
> >         dp_ifindex = get_dpifindex(dp);
> >         if (!dp_ifindex)
> > @@ -504,6 +506,23 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
> >                 pad_packet(dp, user_skb);
> >         }
> >
> > +       hash = skb_get_hash_raw(skb);
> > +       if (hash) {
> Zero hash is valid hash of skb. due to this check packets with zero
> hash would not get same vxlan source port number. This patch should
> solve the issue for all values of skb hash.
I got it, thanks.
One question: should we call pad_packet() at all? The
nla_put_u16()/nla_put_u32()/nla_put() helpers
already reserve room with NLA_ALIGN. I think we can remove the pad_packet() calls
after setting
OVS_PACKET_ATTR_MRU/OVS_PACKET_ATTR_LEN.
>
>
>
> > +               if (skb->sw_hash)
> > +                       hash |= OVS_PACKET_HASH_SW_BIT;
> > +
> > +               if (skb->l4_hash)
> > +                       hash |= OVS_PACKET_HASH_L4_BIT;
> > +
> > +               if (nla_put(user_skb, OVS_PACKET_ATTR_HASH,
> > +                           sizeof (u64), &hash)) {
> > +                       err = -ENOBUFS;
> > +                       goto out;
> > +               }
> > +
> > +               pad_packet(dp, user_skb);
> > +       }
> > +
> >         /* Only reserve room for attribute header, packet data is added
> >          * in skb_zerocopy() */
> >         if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
> > @@ -543,6 +562,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
> >         struct datapath *dp;
> >         struct vport *input_vport;
> >         u16 mru = 0;
> > +       u64 hash;
> >         int len;
> >         int err;
> >         bool log = !a[OVS_PACKET_ATTR_PROBE];
> > @@ -568,6 +588,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
> >         }
> >         OVS_CB(packet)->mru = mru;
> >
> > +       if (a[OVS_PACKET_ATTR_HASH]) {
> > +               hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]);
> > +
> > +               __skb_set_hash(packet, hash & 0xFFFFFFFFULL,
> > +                              !!(hash & OVS_PACKET_HASH_SW_BIT),
> > +                              !!(hash & OVS_PACKET_HASH_L4_BIT));
> > +       }
> > +
> >         /* Build an sw_flow for sending this packet. */
> >         flow = ovs_flow_alloc();
> >         err = PTR_ERR(flow);
> > diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
> > index 81e85dde8217..e239a46c2f94 100644
> > --- a/net/openvswitch/datapath.h
> > +++ b/net/openvswitch/datapath.h
> > @@ -139,6 +139,18 @@ struct ovs_net {
> >         bool xt_label;
> >  };
> >
> > +/**
> > + * enum ovs_pkt_hash_types - hash info to include with a packet
> > + * to send to userspace.
> > + * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack.
> > + * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash
> > + * over transport ports.
> > + */
> > +enum ovs_pkt_hash_types {
> > +       OVS_PACKET_HASH_SW_BIT = (1ULL << 32),
> > +       OVS_PACKET_HASH_L4_BIT = (1ULL << 33),
> > +};
> > +
>
>
> >  extern unsigned int ovs_net_id;
> >  void ovs_lock(void);
> >  void ovs_unlock(void);
> > --
> > 2.23.0
> >
diff mbox series

Patch

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 1887a451c388..e65407c1f366 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -170,6 +170,7 @@  enum ovs_packet_cmd {
  * output port is actually a tunnel port. Contains the output tunnel key
  * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
  * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
+ * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb).
  * @OVS_PACKET_ATTR_LEN: Packet size before truncation.
  * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
  * size.
@@ -190,6 +191,7 @@  enum ovs_packet_attr {
 	OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
 				       error logging should be suppressed. */
 	OVS_PACKET_ATTR_MRU,	    /* Maximum received IP fragment size. */
+	OVS_PACKET_ATTR_HASH,	    /* Packet hash. */
 	OVS_PACKET_ATTR_LEN,		/* Packet size before truncation. */
 	__OVS_PACKET_ATTR_MAX
 };
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2088619c03f0..b556cf62b77c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -350,7 +350,8 @@  static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
 		+ nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
-		+ nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
+		+ nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */
+		+ nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */
 
 	/* OVS_PACKET_ATTR_USERDATA */
 	if (upcall_info->userdata)
@@ -393,6 +394,7 @@  static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	size_t len;
 	unsigned int hlen;
 	int err, dp_ifindex;
+	u64 hash;
 
 	dp_ifindex = get_dpifindex(dp);
 	if (!dp_ifindex)
@@ -504,6 +506,23 @@  static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 		pad_packet(dp, user_skb);
 	}
 
+	hash = skb_get_hash_raw(skb);
+	if (hash) {
+		if (skb->sw_hash)
+			hash |= OVS_PACKET_HASH_SW_BIT;
+
+		if (skb->l4_hash)
+			hash |= OVS_PACKET_HASH_L4_BIT;
+
+		if (nla_put(user_skb, OVS_PACKET_ATTR_HASH,
+			    sizeof (u64), &hash)) {
+			err = -ENOBUFS;
+			goto out;
+		}
+
+		pad_packet(dp, user_skb);
+	}
+
 	/* Only reserve room for attribute header, packet data is added
 	 * in skb_zerocopy() */
 	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
@@ -543,6 +562,7 @@  static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	struct vport *input_vport;
 	u16 mru = 0;
+	u64 hash;
 	int len;
 	int err;
 	bool log = !a[OVS_PACKET_ATTR_PROBE];
@@ -568,6 +588,14 @@  static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	}
 	OVS_CB(packet)->mru = mru;
 
+	if (a[OVS_PACKET_ATTR_HASH]) {
+		hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]);
+
+		__skb_set_hash(packet, hash & 0xFFFFFFFFULL,
+			       !!(hash & OVS_PACKET_HASH_SW_BIT),
+			       !!(hash & OVS_PACKET_HASH_L4_BIT));
+	}
+
 	/* Build an sw_flow for sending this packet. */
 	flow = ovs_flow_alloc();
 	err = PTR_ERR(flow);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 81e85dde8217..e239a46c2f94 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -139,6 +139,18 @@  struct ovs_net {
 	bool xt_label;
 };
 
+/**
+ * enum ovs_pkt_hash_types - flag bits carried above the 32-bit skb hash in
+ * the u64 OVS_PACKET_ATTR_HASH value exchanged with userspace.
+ * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack.
+ * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash
+ * over transport ports.
+ */
+enum ovs_pkt_hash_types {
+	OVS_PACKET_HASH_SW_BIT = (1ULL << 32),
+	OVS_PACKET_HASH_L4_BIT = (1ULL << 33),
+};
+
 extern unsigned int ovs_net_id;
 void ovs_lock(void);
 void ovs_unlock(void);