diff mbox series

[ovs-dev] flow: Consistent VXLAN UDP src ports for fragmented packets

Message ID 20230105132503.35383-1-hemanth.a@acldigital.com
State Changes Requested
Headers show
Series [ovs-dev] flow: Consistent VXLAN UDP src ports for fragmented packets | expand

Commit Message

Hemanth Aramadaka Jan. 5, 2023, 1:25 p.m. UTC
Issue:

The src-port for UDP is based on RSS hash in the packet metadata.
In case of packets coming from VM it will be 5-tuple, if available,
otherwise just IP addresses.If the VM fragments a large IP packet
and sends the fragments to ovs, only the first fragment will contain
the L4 header. Therefore, the first fragment and subsequent fragments
get different UDP src ports in the outgoing VXLAN header.This can
lead to fragment re-ordering in the fabric as packet will take
different paths.

Fix:

Intention of this is to avoid fragment packets taking different paths.
For example, due to presence of firewalls, fragment packets will take
different paths and will get dropped.To avoid this we ignore the L4
header during hash calculation only in the case of fragmented packets.

Signed-off-by: Hemanth Aramadaka <hemanth.a@acldigital.com>
---
 lib/flow.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

Comments

Simon Horman Jan. 20, 2023, 3:22 p.m. UTC | #1
On Thu, Jan 05, 2023 at 06:55:03PM +0530, Hemanth Aramadaka via dev wrote:
> Issue:
> 
> The src-port for UDP is based on RSS hash in the packet metadata.
> In case of packets coming from VM it will be 5-tuple, if available,
> otherwise just IP addresses.If the VM fragments a large IP packet
> and sends the fragments to ovs, only the first fragment will contain
> the L4 header. Therefore, the first fragment and subsequent fragments
> get different UDP src ports in the outgoing VXLAN header.This can
> lead to fragment re-ordering in the fabric as packet will take
> different paths.
> 
> Fix:
> 
> Intention of this is to avoid fragment packets taking different paths.
> For example, due to presence of firewalls, fragment packets will take
> different paths and will get dropped.To avoid this we ignore the L4
> header during hash calculation only in the case of fragmented packets.
> 
> Signed-off-by: Hemanth Aramadaka <hemanth.a@acldigital.com>
> ---
>  lib/flow.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/flow.c b/lib/flow.c
> index c3a3aa3ce..4f4197b1c 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -1018,7 +1018,9 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
>                      miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>                      miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>                      if (dl_type == htons(ETH_TYPE_IP)) {
> -                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                        if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
> +                            dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                        }
>                      } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>                          dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>                      }
> @@ -1033,7 +1035,9 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
>                  miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>                  miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>                  if (dl_type == htons(ETH_TYPE_IP)) {
> -                    dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                    if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
> +                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                    }
>                  } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>                      dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>                  }
> @@ -2248,7 +2252,7 @@ miniflow_hash_5tuple(const struct miniflow *flow, uint32_t basis)
>  
>      if (flow) {
>          ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type);
> -        uint8_t nw_proto;
> +        uint8_t nw_proto, nw_frag;
>  
>          if (dl_type == htons(ETH_TYPE_IPV6)) {
>              struct flowmap map = FLOWMAP_EMPTY_INITIALIZER;
> @@ -2270,6 +2274,14 @@ miniflow_hash_5tuple(const struct miniflow *flow, uint32_t basis)
>  
>          nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
>          hash = hash_add(hash, nw_proto);
> +
> +        /* Skip l4 header fields if IP packet is fragmented since
> +         * only first fragment will carry l4 header.
> +         */
> +        nw_frag = MINIFLOW_GET_U8(flow, nw_frag);
> +        if (nw_frag & FLOW_NW_FRAG_MASK) {
> +            goto out;
> +        }

Maybe I am reading things wrong, but it seems to me that
this change seems to effect all protocols, and in particular both
IPv4 and IPv6. Whereas the changes above to miniflow_extract()
only affect IPv6. Is this intentional?

>          if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
>              && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP
>              && nw_proto != IPPROTO_ICMPV6) {
> -- 
> 2.34.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Hemanth Aramadaka June 26, 2023, 6:59 a.m. UTC | #2
Hi Simon,

Sorry for the late response. Yes, the changes are specific to IPV6 protocol
only.

Thanks,
Hemanth.

-----Original Message-----
From: Simon Horman <simon.horman@corigine.com> 
Sent: Friday, January 20, 2023 8:52 PM
To: ovs-dev@openvswitch.org
Cc: Hemanth Aramadaka <hemanth.a@acldigital.com>
Subject: Re: [ovs-dev] [PATCH] flow: Consistent VXLAN UDP src ports for
fragmented packets

On Thu, Jan 05, 2023 at 06:55:03PM +0530, Hemanth Aramadaka via dev wrote:
> Issue:
> 
> The src-port for UDP is based on RSS hash in the packet metadata.
> In case of packets coming from VM it will be 5-tuple, if available, 
> otherwise just IP addresses.If the VM fragments a large IP packet and 
> sends the fragments to ovs, only the first fragment will contain the 
> L4 header. Therefore, the first fragment and subsequent fragments get 
> different UDP src ports in the outgoing VXLAN header.This can lead to 
> fragment re-ordering in the fabric as packet will take different 
> paths.
> 
> Fix:
> 
> Intention of this is to avoid fragment packets taking different paths.
> For example, due to presence of firewalls, fragment packets will take 
> different paths and will get dropped.To avoid this we ignore the L4 
> header during hash calculation only in the case of fragmented packets.
> 
> Signed-off-by: Hemanth Aramadaka <hemanth.a@acldigital.com>
> ---
>  lib/flow.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/flow.c b/lib/flow.c
> index c3a3aa3ce..4f4197b1c 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -1018,7 +1018,9 @@ miniflow_extract(struct dp_packet *packet, struct
miniflow *dst)
>                      miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>                      miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>                      if (dl_type == htons(ETH_TYPE_IP)) {
> -                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                        if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
> +
dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                        }
>                      } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>                          dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>                      }
> @@ -1033,7 +1035,9 @@ miniflow_extract(struct dp_packet *packet, struct
miniflow *dst)
>                  miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>                  miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>                  if (dl_type == htons(ETH_TYPE_IP)) {
> -                    dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                    if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
> +                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                    }
>                  } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>                      dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>                  }
> @@ -2248,7 +2252,7 @@ miniflow_hash_5tuple(const struct miniflow 
> *flow, uint32_t basis)
>  
>      if (flow) {
>          ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type);
> -        uint8_t nw_proto;
> +        uint8_t nw_proto, nw_frag;
>  
>          if (dl_type == htons(ETH_TYPE_IPV6)) {
>              struct flowmap map = FLOWMAP_EMPTY_INITIALIZER; @@ 
> -2270,6 +2274,14 @@ miniflow_hash_5tuple(const struct miniflow *flow, 
> uint32_t basis)
>  
>          nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
>          hash = hash_add(hash, nw_proto);
> +
> +        /* Skip l4 header fields if IP packet is fragmented since
> +         * only first fragment will carry l4 header.
> +         */
> +        nw_frag = MINIFLOW_GET_U8(flow, nw_frag);
> +        if (nw_frag & FLOW_NW_FRAG_MASK) {
> +            goto out;
> +        }

Maybe I am reading things wrong, but it seems to me that this change seems
to effect all protocols, and in particular both
IPv4 and IPv6. Whereas the changes above to miniflow_extract() only affect
IPv6. Is this intentional?

>          if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
>              && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP
>              && nw_proto != IPPROTO_ICMPV6) {
> --
> 2.34.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Ilya Maximets June 26, 2023, 2:29 p.m. UTC | #3
On 6/26/23 08:59, Hemanth Aramadaka via dev wrote:
> Hi Simon,
> 
> Sorry for the late response. Yes, the changes are specific to IPV6 protocol
> only.

Please, fix the flow_hash_5tuple() function as well.  All variants of the
same hash function should give the same result.  For some reason you just
removed the fix for this function from the patch instead of addressing a
review comment.

And add the unit test for the issue.

Thanks.
Best regards, Ilya Maximets.

> 
> Thanks,
> Hemanth.
> 
> -----Original Message-----
> From: Simon Horman <simon.horman@corigine.com> 
> Sent: Friday, January 20, 2023 8:52 PM
> To: ovs-dev@openvswitch.org
> Cc: Hemanth Aramadaka <hemanth.a@acldigital.com>
> Subject: Re: [ovs-dev] [PATCH] flow: Consistent VXLAN UDP src ports for
> fragmented packets
> 
> On Thu, Jan 05, 2023 at 06:55:03PM +0530, Hemanth Aramadaka via dev wrote:
>> Issue:
>>
>> The src-port for UDP is based on RSS hash in the packet metadata.
>> In case of packets coming from VM it will be 5-tuple, if available, 
>> otherwise just IP addresses.If the VM fragments a large IP packet and 
>> sends the fragments to ovs, only the first fragment will contain the 
>> L4 header. Therefore, the first fragment and subsequent fragments get 
>> different UDP src ports in the outgoing VXLAN header.This can lead to 
>> fragment re-ordering in the fabric as packet will take different 
>> paths.
>>
>> Fix:
>>
>> Intention of this is to avoid fragment packets taking different paths.
>> For example, due to presence of firewalls, fragment packets will take 
>> different paths and will get dropped.To avoid this we ignore the L4 
>> header during hash calculation only in the case of fragmented packets.
>>
>> Signed-off-by: Hemanth Aramadaka <hemanth.a@acldigital.com>
>> ---
>>  lib/flow.c | 18 +++++++++++++++---
>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/lib/flow.c b/lib/flow.c
>> index c3a3aa3ce..4f4197b1c 100644
>> --- a/lib/flow.c
>> +++ b/lib/flow.c
>> @@ -1018,7 +1018,9 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
>>                      miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>>                      miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>>                      if (dl_type == htons(ETH_TYPE_IP)) {
>> -                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>> +                        if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
>> +
> dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>> +                        }
>>                      } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>>                          dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>>                      }
>> @@ -1033,7 +1035,9 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
>>                  miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>>                  miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>>                  if (dl_type == htons(ETH_TYPE_IP)) {
>> -                    dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>> +                    if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
>> +                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>> +                    }
>>                  } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>>                      dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>>                  }
>> @@ -2248,7 +2252,7 @@ miniflow_hash_5tuple(const struct miniflow 
>> *flow, uint32_t basis)
>>  
>>      if (flow) {
>>          ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type);
>> -        uint8_t nw_proto;
>> +        uint8_t nw_proto, nw_frag;
>>  
>>          if (dl_type == htons(ETH_TYPE_IPV6)) {
>>              struct flowmap map = FLOWMAP_EMPTY_INITIALIZER; @@ 
>> -2270,6 +2274,14 @@ miniflow_hash_5tuple(const struct miniflow *flow, 
>> uint32_t basis)
>>  
>>          nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
>>          hash = hash_add(hash, nw_proto);
>> +
>> +        /* Skip l4 header fields if IP packet is fragmented since
>> +         * only first fragment will carry l4 header.
>> +         */
>> +        nw_frag = MINIFLOW_GET_U8(flow, nw_frag);
>> +        if (nw_frag & FLOW_NW_FRAG_MASK) {
>> +            goto out;
>> +        }
> 
> Maybe I am reading things wrong, but it seems to me that this change seems
> to effect all protocols, and in particular both
> IPv4 and IPv6. Whereas the changes above to miniflow_extract() only affect
> IPv6. Is this intentional?
> 
>>          if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
>>              && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP
>>              && nw_proto != IPPROTO_ICMPV6) {
>> --
>> 2.34.1
Hemanth Aramadaka Oct. 30, 2023, 10:03 a.m. UTC | #4
Hi Ilya,

The changes are not required for flow_hash_5tuple() function. I have added debug statements, and we are not hitting this function as part of this code flow. 

Thanks,
Hemanth.

-----Original Message-----
From: Ilya Maximets <i.maximets@ovn.org> 
Sent: Monday, June 26, 2023 7:59 PM
To: Hemanth Aramadaka <hemanth.a@acldigital.com>; 'Simon Horman' <simon.horman@corigine.com>; ovs-dev@openvswitch.org
Cc: i.maximets@ovn.org
Subject: Re: [ovs-dev] [PATCH] flow: Consistent VXLAN UDP src ports for fragmented packets

On 6/26/23 08:59, Hemanth Aramadaka via dev wrote:
> Hi Simon,
> 
> Sorry for the late response. Yes, the changes are specific to IPV6 
> protocol only.

Please, fix the flow_hash_5tuple() function as well.  All variants of the same hash function should give the same result.  For some reason you just removed the fix for this function from the patch instead of addressing a review comment.

And add the unit test for the issue.

Thanks.
Best regards, Ilya Maximets.

> 
> Thanks,
> Hemanth.
> 
> -----Original Message-----
> From: Simon Horman <simon.horman@corigine.com>
> Sent: Friday, January 20, 2023 8:52 PM
> To: ovs-dev@openvswitch.org
> Cc: Hemanth Aramadaka <hemanth.a@acldigital.com>
> Subject: Re: [ovs-dev] [PATCH] flow: Consistent VXLAN UDP src ports 
> for fragmented packets
> 
> On Thu, Jan 05, 2023 at 06:55:03PM +0530, Hemanth Aramadaka via dev wrote:
>> Issue:
>>
>> The src-port for UDP is based on RSS hash in the packet metadata.
>> In case of packets coming from VM it will be 5-tuple, if available, 
>> otherwise just IP addresses.If the VM fragments a large IP packet and 
>> sends the fragments to ovs, only the first fragment will contain the
>> L4 header. Therefore, the first fragment and subsequent fragments get 
>> different UDP src ports in the outgoing VXLAN header.This can lead to 
>> fragment re-ordering in the fabric as packet will take different 
>> paths.
>>
>> Fix:
>>
>> Intention of this is to avoid fragment packets taking different paths.
>> For example, due to presence of firewalls, fragment packets will take 
>> different paths and will get dropped.To avoid this we ignore the L4 
>> header during hash calculation only in the case of fragmented packets.
>>
>> Signed-off-by: Hemanth Aramadaka <hemanth.a@acldigital.com>
>> ---
>>  lib/flow.c | 18 +++++++++++++++---
>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/lib/flow.c b/lib/flow.c
>> index c3a3aa3ce..4f4197b1c 100644
>> --- a/lib/flow.c
>> +++ b/lib/flow.c
>> @@ -1018,7 +1018,9 @@ miniflow_extract(struct dp_packet *packet, 
>> struct
> miniflow *dst)
>>                      miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>>                      miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>>                      if (dl_type == htons(ETH_TYPE_IP)) {
>> -                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>> +                        if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
>> +
> dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>> +                        }
>>                      } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>>                          dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>>                      }
>> @@ -1033,7 +1035,9 @@ miniflow_extract(struct dp_packet *packet, 
>> struct
> miniflow *dst)
>>                  miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>>                  miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>>                  if (dl_type == htons(ETH_TYPE_IP)) {
>> -                    dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>> +                    if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
>> +                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>> +                    }
>>                  } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>>                      dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>>                  }
>> @@ -2248,7 +2252,7 @@ miniflow_hash_5tuple(const struct miniflow 
>> *flow, uint32_t basis)
>>  
>>      if (flow) {
>>          ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type);
>> -        uint8_t nw_proto;
>> +        uint8_t nw_proto, nw_frag;
>>  
>>          if (dl_type == htons(ETH_TYPE_IPV6)) {
>>              struct flowmap map = FLOWMAP_EMPTY_INITIALIZER; @@
>> -2270,6 +2274,14 @@ miniflow_hash_5tuple(const struct miniflow *flow, 
>> uint32_t basis)
>>  
>>          nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
>>          hash = hash_add(hash, nw_proto);
>> +
>> +        /* Skip l4 header fields if IP packet is fragmented since
>> +         * only first fragment will carry l4 header.
>> +         */
>> +        nw_frag = MINIFLOW_GET_U8(flow, nw_frag);
>> +        if (nw_frag & FLOW_NW_FRAG_MASK) {
>> +            goto out;
>> +        }
> 
> Maybe I am reading things wrong, but it seems to me that this change 
> seems to effect all protocols, and in particular both
> IPv4 and IPv6. Whereas the changes above to miniflow_extract() only 
> affect IPv6. Is this intentional?
> 
>>          if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
>>              && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP
>>              && nw_proto != IPPROTO_ICMPV6) {
>> --
>> 2.34.1
Ilya Maximets Oct. 30, 2023, 11:48 a.m. UTC | #5
On 10/30/23 11:03, Hemanth Aramadaka wrote:
> Hi Ilya,
> 
> The changes are not required for flow_hash_5tuple() function. I have added debug statements, and we are not hitting this function as part of this code flow. 

It doesn't matter if you hit this particular function in your scenario
or not.  All the hash functions must produce the same result.

Best regards, Ilya Maximets.

> 
> Thanks,
> Hemanth.
> 
> -----Original Message-----
> From: Ilya Maximets <i.maximets@ovn.org> 
> Sent: Monday, June 26, 2023 7:59 PM
> To: Hemanth Aramadaka <hemanth.a@acldigital.com>; 'Simon Horman' <simon.horman@corigine.com>; ovs-dev@openvswitch.org
> Cc: i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH] flow: Consistent VXLAN UDP src ports for fragmented packets
> 
> On 6/26/23 08:59, Hemanth Aramadaka via dev wrote:
>> Hi Simon,
>>
>> Sorry for the late response. Yes, the changes are specific to IPV6 
>> protocol only.
> 
> Please, fix the flow_hash_5tuple() function as well.  All variants of the same hash function should give the same result.  For some reason you just removed the fix for this function from the patch instead of addressing a review comment.
> 
> And add the unit test for the issue.
> 
> Thanks.
> Best regards, Ilya Maximets.
> 
>>
>> Thanks,
>> Hemanth.
>>
>> -----Original Message-----
>> From: Simon Horman <simon.horman@corigine.com>
>> Sent: Friday, January 20, 2023 8:52 PM
>> To: ovs-dev@openvswitch.org
>> Cc: Hemanth Aramadaka <hemanth.a@acldigital.com>
>> Subject: Re: [ovs-dev] [PATCH] flow: Consistent VXLAN UDP src ports 
>> for fragmented packets
>>
>> On Thu, Jan 05, 2023 at 06:55:03PM +0530, Hemanth Aramadaka via dev wrote:
>>> Issue:
>>>
>>> The src-port for UDP is based on RSS hash in the packet metadata.
>>> In case of packets coming from VM it will be 5-tuple, if available, 
>>> otherwise just IP addresses.If the VM fragments a large IP packet and 
>>> sends the fragments to ovs, only the first fragment will contain the
>>> L4 header. Therefore, the first fragment and subsequent fragments get 
>>> different UDP src ports in the outgoing VXLAN header.This can lead to 
>>> fragment re-ordering in the fabric as packet will take different 
>>> paths.
>>>
>>> Fix:
>>>
>>> Intention of this is to avoid fragment packets taking different paths.
>>> For example, due to presence of firewalls, fragment packets will take 
>>> different paths and will get dropped.To avoid this we ignore the L4 
>>> header during hash calculation only in the case of fragmented packets.
>>>
>>> Signed-off-by: Hemanth Aramadaka <hemanth.a@acldigital.com>
>>> ---
>>>  lib/flow.c | 18 +++++++++++++++---
>>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/lib/flow.c b/lib/flow.c
>>> index c3a3aa3ce..4f4197b1c 100644
>>> --- a/lib/flow.c
>>> +++ b/lib/flow.c
>>> @@ -1018,7 +1018,9 @@ miniflow_extract(struct dp_packet *packet, 
>>> struct
>> miniflow *dst)
>>>                      miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>>>                      miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>>>                      if (dl_type == htons(ETH_TYPE_IP)) {
>>> -                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>>> +                        if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
>>> +
>> dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>>> +                        }
>>>                      } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>>>                          dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>>>                      }
>>> @@ -1033,7 +1035,9 @@ miniflow_extract(struct dp_packet *packet, 
>>> struct
>> miniflow *dst)
>>>                  miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>>>                  miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>>>                  if (dl_type == htons(ETH_TYPE_IP)) {
>>> -                    dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>>> +                    if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
>>> +                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
>>> +                    }
>>>                  } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>>>                      dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>>>                  }
>>> @@ -2248,7 +2252,7 @@ miniflow_hash_5tuple(const struct miniflow 
>>> *flow, uint32_t basis)
>>>  
>>>      if (flow) {
>>>          ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type);
>>> -        uint8_t nw_proto;
>>> +        uint8_t nw_proto, nw_frag;
>>>  
>>>          if (dl_type == htons(ETH_TYPE_IPV6)) {
>>>              struct flowmap map = FLOWMAP_EMPTY_INITIALIZER; @@
>>> -2270,6 +2274,14 @@ miniflow_hash_5tuple(const struct miniflow *flow, 
>>> uint32_t basis)
>>>  
>>>          nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
>>>          hash = hash_add(hash, nw_proto);
>>> +
>>> +        /* Skip l4 header fields if IP packet is fragmented since
>>> +         * only first fragment will carry l4 header.
>>> +         */
>>> +        nw_frag = MINIFLOW_GET_U8(flow, nw_frag);
>>> +        if (nw_frag & FLOW_NW_FRAG_MASK) {
>>> +            goto out;
>>> +        }
>>
>> Maybe I am reading things wrong, but it seems to me that this change 
>> seems to effect all protocols, and in particular both
>> IPv4 and IPv6. Whereas the changes above to miniflow_extract() only 
>> affect IPv6. Is this intentional?
>>
>>>          if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
>>>              && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP
>>>              && nw_proto != IPPROTO_ICMPV6) {
>>> --
>>> 2.34.1
>
diff mbox series

Patch

diff --git a/lib/flow.c b/lib/flow.c
index c3a3aa3ce..4f4197b1c 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -1018,7 +1018,9 @@  miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
                     miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
                     miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
                     if (dl_type == htons(ETH_TYPE_IP)) {
-                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
+                        if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
+                            dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
+                        }
                     } else if (dl_type == htons(ETH_TYPE_IPV6)) {
                         dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
                     }
@@ -1033,7 +1035,9 @@  miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
                 miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
                 miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
                 if (dl_type == htons(ETH_TYPE_IP)) {
-                    dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
+                    if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
+                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
+                    }
                 } else if (dl_type == htons(ETH_TYPE_IPV6)) {
                     dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
                 }
@@ -2248,7 +2252,7 @@  miniflow_hash_5tuple(const struct miniflow *flow, uint32_t basis)
 
     if (flow) {
         ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type);
-        uint8_t nw_proto;
+        uint8_t nw_proto, nw_frag;
 
         if (dl_type == htons(ETH_TYPE_IPV6)) {
             struct flowmap map = FLOWMAP_EMPTY_INITIALIZER;
@@ -2270,6 +2274,14 @@  miniflow_hash_5tuple(const struct miniflow *flow, uint32_t basis)
 
         nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
         hash = hash_add(hash, nw_proto);
+
+        /* Skip l4 header fields if IP packet is fragmented since
+         * only first fragment will carry l4 header.
+         */
+        nw_frag = MINIFLOW_GET_U8(flow, nw_frag);
+        if (nw_frag & FLOW_NW_FRAG_MASK) {
+            goto out;
+        }
         if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
             && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP
             && nw_proto != IPPROTO_ICMPV6) {