diff mbox series

[ovs-dev,v2] openvswitch: Trim off padding before L3+ netfilter processing

Message ID 1513869437-20059-1-git-send-email-eswierk@skyportsystems.com
State Not Applicable
Headers show
Series [ovs-dev,v2] openvswitch: Trim off padding before L3+ netfilter processing | expand

Commit Message

Li,Rongqing via dev Dec. 21, 2017, 3:17 p.m. UTC
IPv4 and IPv6 packets may arrive with lower-layer padding that is not
included in the L3 length. For example, a short IPv4 packet may have
up to 6 bytes of padding following the IP payload when received on an
Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
packet to ip_hdr->tot_len before invoking netfilter hooks (including
conntrack and nat).

In the IPv6 receive path, ip6_rcv() does the same using
ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
length before invoking NF_INET_PRE_ROUTING hooks.

In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
the L3 header but does not trim it to the L3 length before calling
nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
encounters a packet with lower-layer padding, nf_checksum() fails and
logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
affect the checksum, the length in the IP pseudoheader does. That
length is based on skb->len, and without trimming, it doesn't match
the length the sender used when computing the checksum.

The assumption throughout nf_conntrack and nf_nat is that skb->len
reflects the length of the L3 header and payload, so there is no need
to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.

This change brings OVS into line with other netfilter users, trimming
IPv4 and IPv6 packets prior to L3+ netfilter processing.

Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
---
v2:
- Trim packet in nat receive path as well as conntrack
- Free skb on error
---
 net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

Comments

Pravin Shelar Dec. 22, 2017, 11:31 p.m. UTC | #1
On Thu, Dec 21, 2017 at 7:17 AM, Ed Swierk <eswierk@skyportsystems.com> wrote:
> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
> included in the L3 length. For example, a short IPv4 packet may have
> up to 6 bytes of padding following the IP payload when received on an
> Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
> packet to ip_hdr->tot_len before invoking netfilter hooks (including
> conntrack and nat).
>
> In the IPv6 receive path, ip6_rcv() does the same using
> ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
> br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
> length before invoking NF_INET_PRE_ROUTING hooks.
>
> In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
> the L3 header but does not trim it to the L3 length before calling
> nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
> encounters a packet with lower-layer padding, nf_checksum() fails and
> logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
> affect the checksum, the length in the IP pseudoheader does. That
> length is based on skb->len, and without trimming, it doesn't match
> the length the sender used when computing the checksum.
>
> The assumption throughout nf_conntrack and nf_nat is that skb->len
> reflects the length of the L3 header and payload, so there is no need
> to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.
>
> This change brings OVS into line with other netfilter users, trimming
> IPv4 and IPv6 packets prior to L3+ netfilter processing.
>
> Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
> ---
> v2:
> - Trim packet in nat receive path as well as conntrack
> - Free skb on error
> ---
>  net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
>  1 file changed, 34 insertions(+)
>
> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> index b27c5c6..1bdc78f 100644
> --- a/net/openvswitch/conntrack.c
> +++ b/net/openvswitch/conntrack.c
> @@ -703,6 +703,33 @@ static bool skb_nfct_cached(struct net *net,
>         return ct_executed;
>  }
>
> +/* Trim the skb to the L3 length. Assumes the skb is already pulled to
> + * the L3 header. The skb is freed on error.
> + */
> +static int skb_trim_l3(struct sk_buff *skb)
> +{
> +       unsigned int nh_len;
> +       int err;
> +
> +       switch (skb->protocol) {
> +       case htons(ETH_P_IP):
> +               nh_len = ntohs(ip_hdr(skb)->tot_len);
> +               break;
> +       case htons(ETH_P_IPV6):
> +               nh_len = ntohs(ipv6_hdr(skb)->payload_len)
> +                       + sizeof(struct ipv6hdr);
> +               break;
> +       default:
> +               nh_len = skb->len;
> +       }
> +
> +       err = pskb_trim_rcsum(skb, nh_len);
> +       if (err)
This should is unlikely.
> +               kfree_skb(skb);
> +
> +       return err;
> +}
> +
This looks like a generic function, it probably does not belong to OVS
code base.

>  #ifdef CONFIG_NF_NAT_NEEDED
>  /* Modelled after nf_nat_ipv[46]_fn().
>   * range is only used for new, uninitialized NAT state.
> @@ -715,8 +742,12 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
>  {
>         int hooknum, nh_off, err = NF_ACCEPT;
>
> +       /* The nat module expects to be working at L3. */
>         nh_off = skb_network_offset(skb);
>         skb_pull_rcsum(skb, nh_off);
> +       err = skb_trim_l3(skb);
> +       if (err)
> +               return err;
>
ct-nat is executed within ct action, so I do not see why you you call
skb-trim again from ovs_ct_nat_execute().
ovs_ct_execute() trim should take care of the skb.

>         /* See HOOK2MANIP(). */
>         if (maniptype == NF_NAT_MANIP_SRC)
> @@ -1111,6 +1142,9 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
>         /* The conntrack module expects to be working at L3. */
>         nh_ofs = skb_network_offset(skb);
>         skb_pull_rcsum(skb, nh_ofs);
> +       err = skb_trim_l3(skb);
> +       if (err)
> +               return err;
>
>         if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
>                 err = handle_fragments(net, key, info->zone.id, skb);
> --
> 1.9.1
>
Li,Rongqing via dev Dec. 23, 2017, 12:39 a.m. UTC | #2
On Fri, Dec 22, 2017 at 3:31 PM, Pravin Shelar <pshelar@ovn.org> wrote:
> On Thu, Dec 21, 2017 at 7:17 AM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
>> included in the L3 length. For example, a short IPv4 packet may have
>> up to 6 bytes of padding following the IP payload when received on an
>> Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
>> packet to ip_hdr->tot_len before invoking netfilter hooks (including
>> conntrack and nat).
>>
>> In the IPv6 receive path, ip6_rcv() does the same using
>> ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
>> br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
>> length before invoking NF_INET_PRE_ROUTING hooks.
>>
>> In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
>> the L3 header but does not trim it to the L3 length before calling
>> nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
>> encounters a packet with lower-layer padding, nf_checksum() fails and
>> logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
>> affect the checksum, the length in the IP pseudoheader does. That
>> length is based on skb->len, and without trimming, it doesn't match
>> the length the sender used when computing the checksum.
>>
>> The assumption throughout nf_conntrack and nf_nat is that skb->len
>> reflects the length of the L3 header and payload, so there is no need
>> to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.
>>
>> This change brings OVS into line with other netfilter users, trimming
>> IPv4 and IPv6 packets prior to L3+ netfilter processing.
>>
>> Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
>> ---
>> v2:
>> - Trim packet in nat receive path as well as conntrack
>> - Free skb on error
>> ---
>>  net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
>>  1 file changed, 34 insertions(+)
>>
>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>> index b27c5c6..1bdc78f 100644
>> --- a/net/openvswitch/conntrack.c
>> +++ b/net/openvswitch/conntrack.c
>> @@ -703,6 +703,33 @@ static bool skb_nfct_cached(struct net *net,
>>         return ct_executed;
>>  }
>>
>> +/* Trim the skb to the L3 length. Assumes the skb is already pulled to
>> + * the L3 header. The skb is freed on error.
>> + */
>> +static int skb_trim_l3(struct sk_buff *skb)
>> +{
>> +       unsigned int nh_len;
>> +       int err;
>> +
>> +       switch (skb->protocol) {
>> +       case htons(ETH_P_IP):
>> +               nh_len = ntohs(ip_hdr(skb)->tot_len);
>> +               break;
>> +       case htons(ETH_P_IPV6):
>> +               nh_len = ntohs(ipv6_hdr(skb)->payload_len)
>> +                       + sizeof(struct ipv6hdr);
>> +               break;
>> +       default:
>> +               nh_len = skb->len;
>> +       }
>> +
>> +       err = pskb_trim_rcsum(skb, nh_len);
>> +       if (err)
> This should is unlikely.

I'll add unlikely().

>> +               kfree_skb(skb);
>> +
>> +       return err;
>> +}
>> +
> This looks like a generic function, it probably does not belong to OVS
> code base.

Indeed. I'll move it to skbuff.c, unless you have a better idea.

>>  #ifdef CONFIG_NF_NAT_NEEDED
>>  /* Modelled after nf_nat_ipv[46]_fn().
>>   * range is only used for new, uninitialized NAT state.
>> @@ -715,8 +742,12 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
>>  {
>>         int hooknum, nh_off, err = NF_ACCEPT;
>>
>> +       /* The nat module expects to be working at L3. */
>>         nh_off = skb_network_offset(skb);
>>         skb_pull_rcsum(skb, nh_off);
>> +       err = skb_trim_l3(skb);
>> +       if (err)
>> +               return err;
>>
> ct-nat is executed within ct action, so I do not see why you you call
> skb-trim again from ovs_ct_nat_execute().
> ovs_ct_execute() trim should take care of the skb.

I see. Doesn't that mean that skb_pull_rcsum() is also unnecessary in
ovs_ct_nat_execute(), as ovs_ct_execute() has already pulled the skb
to the L3 header?

>>         /* See HOOK2MANIP(). */
>>         if (maniptype == NF_NAT_MANIP_SRC)
>> @@ -1111,6 +1142,9 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
>>         /* The conntrack module expects to be working at L3. */
>>         nh_ofs = skb_network_offset(skb);
>>         skb_pull_rcsum(skb, nh_ofs);
>> +       err = skb_trim_l3(skb);
>> +       if (err)
>> +               return err;
>>
>>         if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
>>                 err = handle_fragments(net, key, info->zone.id, skb);
>> --
>> 1.9.1
>>
Pravin Shelar Jan. 3, 2018, 6:21 a.m. UTC | #3
On Fri, Dec 22, 2017 at 4:39 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
> On Fri, Dec 22, 2017 at 3:31 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>> On Thu, Dec 21, 2017 at 7:17 AM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>>> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
>>> included in the L3 length. For example, a short IPv4 packet may have
>>> up to 6 bytes of padding following the IP payload when received on an
>>> Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
>>> packet to ip_hdr->tot_len before invoking netfilter hooks (including
>>> conntrack and nat).
>>>
>>> In the IPv6 receive path, ip6_rcv() does the same using
>>> ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
>>> br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
>>> length before invoking NF_INET_PRE_ROUTING hooks.
>>>
>>> In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
>>> the L3 header but does not trim it to the L3 length before calling
>>> nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
>>> encounters a packet with lower-layer padding, nf_checksum() fails and
>>> logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
>>> affect the checksum, the length in the IP pseudoheader does. That
>>> length is based on skb->len, and without trimming, it doesn't match
>>> the length the sender used when computing the checksum.
>>>
>>> The assumption throughout nf_conntrack and nf_nat is that skb->len
>>> reflects the length of the L3 header and payload, so there is no need
>>> to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.
>>>
>>> This change brings OVS into line with other netfilter users, trimming
>>> IPv4 and IPv6 packets prior to L3+ netfilter processing.
>>>
>>> Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
>>> ---
>>> v2:
>>> - Trim packet in nat receive path as well as conntrack
>>> - Free skb on error
>>> ---
>>>  net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
>>>  1 file changed, 34 insertions(+)
>>>
>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>>> index b27c5c6..1bdc78f 100644
>>> --- a/net/openvswitch/conntrack.c
>>> +++ b/net/openvswitch/conntrack.c
>>> @@ -703,6 +703,33 @@ static bool skb_nfct_cached(struct net *net,
>>>         return ct_executed;
>>>  }
>>>
>>> +/* Trim the skb to the L3 length. Assumes the skb is already pulled to
>>> + * the L3 header. The skb is freed on error.
>>> + */
>>> +static int skb_trim_l3(struct sk_buff *skb)
>>> +{
>>> +       unsigned int nh_len;
>>> +       int err;
>>> +
>>> +       switch (skb->protocol) {
>>> +       case htons(ETH_P_IP):
>>> +               nh_len = ntohs(ip_hdr(skb)->tot_len);
>>> +               break;
>>> +       case htons(ETH_P_IPV6):
>>> +               nh_len = ntohs(ipv6_hdr(skb)->payload_len)
>>> +                       + sizeof(struct ipv6hdr);
>>> +               break;
>>> +       default:
>>> +               nh_len = skb->len;
>>> +       }
>>> +
>>> +       err = pskb_trim_rcsum(skb, nh_len);
>>> +       if (err)
>> This should is unlikely.
>
> I'll add unlikely().
>
>>> +               kfree_skb(skb);
>>> +
>>> +       return err;
>>> +}
>>> +
>> This looks like a generic function, it probably does not belong to OVS
>> code base.
>
> Indeed. I'll move it to skbuff.c, unless you have a better idea.
>
>>>  #ifdef CONFIG_NF_NAT_NEEDED
>>>  /* Modelled after nf_nat_ipv[46]_fn().
>>>   * range is only used for new, uninitialized NAT state.
>>> @@ -715,8 +742,12 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
>>>  {
>>>         int hooknum, nh_off, err = NF_ACCEPT;
>>>
>>> +       /* The nat module expects to be working at L3. */
>>>         nh_off = skb_network_offset(skb);
>>>         skb_pull_rcsum(skb, nh_off);
>>> +       err = skb_trim_l3(skb);
>>> +       if (err)
>>> +               return err;
>>>
>> ct-nat is executed within ct action, so I do not see why you you call
>> skb-trim again from ovs_ct_nat_execute().
>> ovs_ct_execute() trim should take care of the skb.
>
> I see. Doesn't that mean that skb_pull_rcsum() is also unnecessary in
> ovs_ct_nat_execute(), as ovs_ct_execute() has already pulled the skb
> to the L3 header?
>

Yes, It looks redundant. but lets address it in separate patch.
Li,Rongqing via dev Jan. 4, 2018, 3:49 a.m. UTC | #4
On Fri, Dec 22, 2017 at 3:31 PM, Pravin Shelar <pshelar@ovn.org> wrote:
> On Thu, Dec 21, 2017 at 7:17 AM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
>> included in the L3 length. For example, a short IPv4 packet may have
>> up to 6 bytes of padding following the IP payload when received on an
>> Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
>> packet to ip_hdr->tot_len before invoking netfilter hooks (including
>> conntrack and nat).
>>
>> In the IPv6 receive path, ip6_rcv() does the same using
>> ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
>> br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
>> length before invoking NF_INET_PRE_ROUTING hooks.
>>
>> In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
>> the L3 header but does not trim it to the L3 length before calling
>> nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
>> encounters a packet with lower-layer padding, nf_checksum() fails and
>> logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
>> affect the checksum, the length in the IP pseudoheader does. That
>> length is based on skb->len, and without trimming, it doesn't match
>> the length the sender used when computing the checksum.
>>
>> The assumption throughout nf_conntrack and nf_nat is that skb->len
>> reflects the length of the L3 header and payload, so there is no need
>> to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.
>>
>> This change brings OVS into line with other netfilter users, trimming
>> IPv4 and IPv6 packets prior to L3+ netfilter processing.
>>
>> Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
>> ---
>> v2:
>> - Trim packet in nat receive path as well as conntrack
>> - Free skb on error
>> ---
>>  net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
>>  1 file changed, 34 insertions(+)
>>
>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>> index b27c5c6..1bdc78f 100644
>> --- a/net/openvswitch/conntrack.c
>> +++ b/net/openvswitch/conntrack.c
>> @@ -703,6 +703,33 @@ static bool skb_nfct_cached(struct net *net,
>>         return ct_executed;
>>  }
>>
>> +/* Trim the skb to the L3 length. Assumes the skb is already pulled to
>> + * the L3 header. The skb is freed on error.
>> + */
>> +static int skb_trim_l3(struct sk_buff *skb)
>> +{
>> +       unsigned int nh_len;
>> +       int err;
>> +
>> +       switch (skb->protocol) {
>> +       case htons(ETH_P_IP):
>> +               nh_len = ntohs(ip_hdr(skb)->tot_len);
>> +               break;
>> +       case htons(ETH_P_IPV6):
>> +               nh_len = ntohs(ipv6_hdr(skb)->payload_len)
>> +                       + sizeof(struct ipv6hdr);
>> +               break;
>> +       default:
>> +               nh_len = skb->len;
>> +       }
>> +
>> +       err = pskb_trim_rcsum(skb, nh_len);
>> +       if (err)
> This should is unlikely.
>> +               kfree_skb(skb);
>> +
>> +       return err;
>> +}
>> +
> This looks like a generic function, it probably does not belong to OVS
> code base.

It occurs to me that skb_trim_l3() can't just reach into ip_hdr(skb)
before calling pskb_may_pull(skb, sizeof(struct iphdr)) to make sure
the IP header is actually there; and for IPv4 it should validate the
IP header checksum, including options. Once we add all these steps,
skb_trim_l3() starts to look an awful lot like br_validate_ipv4() and
br_validate_ipv6(). And those in turn are eerily similar to ip_rcv()
and ip6_rcv(). It would be nice to avoid duplicating this logic yet
again.

What if we turn br_validate_ipv4() and br_validate_ipv6() into generic
functions and call them from both br_netfilter and ovs_ct--should
there be any fundamental difference between these two receive paths,
at least for L3+ conntrack processing?

For example, currently br_netfilter updates the
IPSTATS_MIB_INTRUNCATEDPKTS and IPSTATS_MIB_INDISCARDS counters. It
would be easy to make this conditional in a generic function, if we
still don't want ovs_ct to update those counters.

>>  #ifdef CONFIG_NF_NAT_NEEDED
>>  /* Modelled after nf_nat_ipv[46]_fn().
>>   * range is only used for new, uninitialized NAT state.
>> @@ -715,8 +742,12 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
>>  {
>>         int hooknum, nh_off, err = NF_ACCEPT;
>>
>> +       /* The nat module expects to be working at L3. */
>>         nh_off = skb_network_offset(skb);
>>         skb_pull_rcsum(skb, nh_off);
>> +       err = skb_trim_l3(skb);
>> +       if (err)
>> +               return err;
>>
> ct-nat is executed within ct action, so I do not see why you you call
> skb-trim again from ovs_ct_nat_execute().
> ovs_ct_execute() trim should take care of the skb.
>
>>         /* See HOOK2MANIP(). */
>>         if (maniptype == NF_NAT_MANIP_SRC)
>> @@ -1111,6 +1142,9 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
>>         /* The conntrack module expects to be working at L3. */
>>         nh_ofs = skb_network_offset(skb);
>>         skb_pull_rcsum(skb, nh_ofs);
>> +       err = skb_trim_l3(skb);
>> +       if (err)
>> +               return err;
>>
>>         if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
>>                 err = handle_fragments(net, key, info->zone.id, skb);
>> --
>> 1.9.1
>>
Pravin Shelar Jan. 5, 2018, 3:36 a.m. UTC | #5
On Wed, Jan 3, 2018 at 7:49 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
> On Fri, Dec 22, 2017 at 3:31 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>> On Thu, Dec 21, 2017 at 7:17 AM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>>> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
>>> included in the L3 length. For example, a short IPv4 packet may have
>>> up to 6 bytes of padding following the IP payload when received on an
>>> Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
>>> packet to ip_hdr->tot_len before invoking netfilter hooks (including
>>> conntrack and nat).
>>>
>>> In the IPv6 receive path, ip6_rcv() does the same using
>>> ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
>>> br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
>>> length before invoking NF_INET_PRE_ROUTING hooks.
>>>
>>> In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
>>> the L3 header but does not trim it to the L3 length before calling
>>> nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
>>> encounters a packet with lower-layer padding, nf_checksum() fails and
>>> logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
>>> affect the checksum, the length in the IP pseudoheader does. That
>>> length is based on skb->len, and without trimming, it doesn't match
>>> the length the sender used when computing the checksum.
>>>
>>> The assumption throughout nf_conntrack and nf_nat is that skb->len
>>> reflects the length of the L3 header and payload, so there is no need
>>> to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.
>>>
>>> This change brings OVS into line with other netfilter users, trimming
>>> IPv4 and IPv6 packets prior to L3+ netfilter processing.
>>>
>>> Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
>>> ---
>>> v2:
>>> - Trim packet in nat receive path as well as conntrack
>>> - Free skb on error
>>> ---
>>>  net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
>>>  1 file changed, 34 insertions(+)
>>>
>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>>> index b27c5c6..1bdc78f 100644
>>> --- a/net/openvswitch/conntrack.c
>>> +++ b/net/openvswitch/conntrack.c
>>> @@ -703,6 +703,33 @@ static bool skb_nfct_cached(struct net *net,
>>>         return ct_executed;
>>>  }
>>>
>>> +/* Trim the skb to the L3 length. Assumes the skb is already pulled to
>>> + * the L3 header. The skb is freed on error.
>>> + */
>>> +static int skb_trim_l3(struct sk_buff *skb)
>>> +{
>>> +       unsigned int nh_len;
>>> +       int err;
>>> +
>>> +       switch (skb->protocol) {
>>> +       case htons(ETH_P_IP):
>>> +               nh_len = ntohs(ip_hdr(skb)->tot_len);
>>> +               break;
>>> +       case htons(ETH_P_IPV6):
>>> +               nh_len = ntohs(ipv6_hdr(skb)->payload_len)
>>> +                       + sizeof(struct ipv6hdr);
>>> +               break;
>>> +       default:
>>> +               nh_len = skb->len;
>>> +       }
>>> +
>>> +       err = pskb_trim_rcsum(skb, nh_len);
>>> +       if (err)
>> This should is unlikely.
>>> +               kfree_skb(skb);
>>> +
>>> +       return err;
>>> +}
>>> +
>> This looks like a generic function, it probably does not belong to OVS
>> code base.
>
> It occurs to me that skb_trim_l3() can't just reach into ip_hdr(skb)
> before calling pskb_may_pull(skb, sizeof(struct iphdr)) to make sure
> the IP header is actually there; and for IPv4 it should validate the
> IP header checksum, including options. Once we add all these steps,
> skb_trim_l3() starts to look an awful lot like br_validate_ipv4() and
> br_validate_ipv6(). And those in turn are eerily similar to ip_rcv()
> and ip6_rcv(). It would be nice to avoid duplicating this logic yet
> again.
>
OVS already pull all required headers in skb linear data, so no need
to redo all of it. only check required is the ip-checksum validation.
I think we could avoid it in most of cases by checking skb length to
ipheader length before verifying the ip header-checksum.
Li,Rongqing via dev Jan. 5, 2018, 6:14 p.m. UTC | #6
On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar <pshelar@ovn.org> wrote:
> On Wed, Jan 3, 2018 at 7:49 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>> On Fri, Dec 22, 2017 at 3:31 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>>> On Thu, Dec 21, 2017 at 7:17 AM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>>>> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
>>>> included in the L3 length. For example, a short IPv4 packet may have
>>>> up to 6 bytes of padding following the IP payload when received on an
>>>> Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
>>>> packet to ip_hdr->tot_len before invoking netfilter hooks (including
>>>> conntrack and nat).
>>>>
>>>> In the IPv6 receive path, ip6_rcv() does the same using
>>>> ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
>>>> br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
>>>> length before invoking NF_INET_PRE_ROUTING hooks.
>>>>
>>>> In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
>>>> the L3 header but does not trim it to the L3 length before calling
>>>> nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
>>>> encounters a packet with lower-layer padding, nf_checksum() fails and
>>>> logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
>>>> affect the checksum, the length in the IP pseudoheader does. That
>>>> length is based on skb->len, and without trimming, it doesn't match
>>>> the length the sender used when computing the checksum.
>>>>
>>>> The assumption throughout nf_conntrack and nf_nat is that skb->len
>>>> reflects the length of the L3 header and payload, so there is no need
>>>> to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.
>>>>
>>>> This change brings OVS into line with other netfilter users, trimming
>>>> IPv4 and IPv6 packets prior to L3+ netfilter processing.
>>>>
>>>> Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
>>>> ---
>>>> v2:
>>>> - Trim packet in nat receive path as well as conntrack
>>>> - Free skb on error
>>>> ---
>>>>  net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 34 insertions(+)
>>>>
>>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>>>> index b27c5c6..1bdc78f 100644
>>>> --- a/net/openvswitch/conntrack.c
>>>> +++ b/net/openvswitch/conntrack.c
>>>> @@ -703,6 +703,33 @@ static bool skb_nfct_cached(struct net *net,
>>>>         return ct_executed;
>>>>  }
>>>>
>>>> +/* Trim the skb to the L3 length. Assumes the skb is already pulled to
>>>> + * the L3 header. The skb is freed on error.
>>>> + */
>>>> +static int skb_trim_l3(struct sk_buff *skb)
>>>> +{
>>>> +       unsigned int nh_len;
>>>> +       int err;
>>>> +
>>>> +       switch (skb->protocol) {
>>>> +       case htons(ETH_P_IP):
>>>> +               nh_len = ntohs(ip_hdr(skb)->tot_len);
>>>> +               break;
>>>> +       case htons(ETH_P_IPV6):
>>>> +               nh_len = ntohs(ipv6_hdr(skb)->payload_len)
>>>> +                       + sizeof(struct ipv6hdr);
>>>> +               break;
>>>> +       default:
>>>> +               nh_len = skb->len;
>>>> +       }
>>>> +
>>>> +       err = pskb_trim_rcsum(skb, nh_len);
>>>> +       if (err)
>>> This should is unlikely.
>>>> +               kfree_skb(skb);
>>>> +
>>>> +       return err;
>>>> +}
>>>> +
>>> This looks like a generic function, it probably does not belong to OVS
>>> code base.
>>
>> It occurs to me that skb_trim_l3() can't just reach into ip_hdr(skb)
>> before calling pskb_may_pull(skb, sizeof(struct iphdr)) to make sure
>> the IP header is actually there; and for IPv4 it should validate the
>> IP header checksum, including options. Once we add all these steps,
>> skb_trim_l3() starts to look an awful lot like br_validate_ipv4() and
>> br_validate_ipv6(). And those in turn are eerily similar to ip_rcv()
>> and ip6_rcv(). It would be nice to avoid duplicating this logic yet
>> again.
>>
> OVS already pull all required headers in skb linear data, so no need
> to redo all of it. only check required is the ip-checksum validation.
> I think we could avoid it in most of cases by checking skb length to
> ipheader length before verifying the ip header-checksum.

Shouldn't the IP header checksum be verified even earlier, like in
key_extract(), before actually using any of the fields in the IP
header?

And since key_extract() is already inspecting the IP/IPv6 header, it
would be a convenient spot to check whether the skb->len matches. If
there's a difference, it could record the number of bytes to trim in
an ovs_skb_cb field. Then ovs_ct_execute() would look at this field
and trim the skb only if necessary.
Li,Rongqing via dev Jan. 5, 2018, 11:20 p.m. UTC | #7
On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk <eswierk@skyportsystems.com>
wrote:
> On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>> OVS already pull all required headers in skb linear data, so no need
>> to redo all of it. only check required is the ip-checksum validation.
>> I think we could avoid it in most of cases by checking skb length to
>> ipheader length before verifying the ip header-checksum.
>
> Shouldn't the IP header checksum be verified even earlier, like in
> key_extract(), before actually using any of the fields in the IP
> header?

Something like this for verifying the IP header checksum (not tested):

--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -203,6 +203,7 @@ static int check_iphdr(struct sk_buff *skb)
 {
  unsigned int nh_ofs = skb_network_offset(skb);
  unsigned int ip_len;
+ const struct iphdr *nh;
  int err;

  err = check_header(skb, nh_ofs + sizeof(struct iphdr));
@@ -214,6 +215,13 @@ static int check_iphdr(struct sk_buff *skb)
       skb->len < nh_ofs + ip_len))
  return -EINVAL;

+ if (unlikely(!pskb_may_pull(skb, nh_ofs + ip_len)))
+ return -ENOMEM;
+
+ nh = ip_hdr(skb);
+ if (unlikely(ip_fast_csum((u8 *)nh, nh->ihl)))
+ return -EINVAL;
+
  skb_set_transport_header(skb, nh_ofs + ip_len);
  return 0;
 }

> And since key_extract() is already inspecting the IP/IPv6 header, it
> would be a convenient spot to check whether the skb->len matches. If
> there's a difference, it could record the number of bytes to trim in
> an ovs_skb_cb field. Then ovs_ct_execute() would look at this field
> and trim the skb only if necessary.

And something like this for trimming the padding (also not tested):

--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1112,6 +1112,14 @@ int ovs_ct_execute(struct net *net, struct sk_buff
*skb,
  nh_ofs = skb_network_offset(skb);
  skb_pull_rcsum(skb, nh_ofs);

+ if (unlikely(OVS_CB(skb)->padlen)) {
+ err = pskb_trim_rcsum(skb, skb->len - OVS_CB(skb)->padlen);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+ }
+
  if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
  err = handle_fragments(net, key, info->zone.id, skb);
  if (err)
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 523d655..eab29fa 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -106,12 +106,14 @@ struct datapath {
  * fragmented.
  * @acts_origlen: The netlink size of the flow actions applied to this skb.
  * @cutlen: The number of bytes from the packet end to be removed.
+ * @padlen: The number of padding bytes to be ignored in L3+ processing.
  */
 struct ovs_skb_cb {
  struct vport *input_vport;
  u16 mru;
  u16 acts_origlen;
  u32 cutlen;
+ u32                     padlen;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 987a97f..c749dfd 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -223,6 +223,7 @@ static int check_iphdr(struct sk_buff *skb)
  return -EINVAL;

  skb_set_transport_header(skb, nh_ofs + ip_len);
+ OVS_CB(skb)->padlen = skb->len - nh_ofs - ntohs(nh->tot_len);
  return 0;
 }

@@ -305,6 +306,7 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct
sw_flow_key *key)

  nh_len = payload_ofs - nh_ofs;
  skb_set_transport_header(skb, nh_ofs + nh_len);
+ OVS_CB(skb)->padlen = skb->len - payload_ofs - ntohs(nh->payload_len);
  key->ip.proto = nexthdr;
  return nh_len;
 }
Pravin Shelar Jan. 6, 2018, 6:17 a.m. UTC | #8
On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk <eswierk@skyportsystems.com>
> wrote:
>> On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>>> OVS already pull all required headers in skb linear data, so no need
>>> to redo all of it. only check required is the ip-checksum validation.
>>> I think we could avoid it in most of cases by checking skb length to
>>> ipheader length before verifying the ip header-checksum.
>>
>> Shouldn't the IP header checksum be verified even earlier, like in
>> key_extract(), before actually using any of the fields in the IP
>> header?
>
> Something like this for verifying the IP header checksum (not tested):
>
AFAIU openflow does not need this verification, so it is not required
in flow extract.
Li,Rongqing via dev Jan. 6, 2018, 6:59 a.m. UTC | #9
On Jan 5, 2018 22:17, "Pravin Shelar" <pshelar@ovn.org> wrote:

On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk <eswierk@skyportsystems.com>
wrote:
> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk <eswierk@skyportsystems.com>
> wrote:
>> On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>>> OVS already pull all required headers in skb linear data, so no need
>>> to redo all of it. only check required is the ip-checksum validation.
>>> I think we could avoid it in most of cases by checking skb length to
>>> ipheader length before verifying the ip header-checksum.
>>
>> Shouldn't the IP header checksum be verified even earlier, like in
>> key_extract(), before actually using any of the fields in the IP
>> header?
>
> Something like this for verifying the IP header checksum (not tested):
>
AFAIU openflow does not need this verification, so it is not required
in flow extract.


Okay. How about my proposed trimming implementation, caching the pad length
in the ovs cb?
Pravin Shelar Jan. 6, 2018, 6:57 p.m. UTC | #10
On Fri, Jan 5, 2018 at 10:59 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>
>
> On Jan 5, 2018 22:17, "Pravin Shelar" <pshelar@ovn.org> wrote:
>
> On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk <eswierk@skyportsystems.com>
> wrote:
>> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk <eswierk@skyportsystems.com>
>> wrote:
>>> On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>>>> OVS already pull all required headers in skb linear data, so no need
>>>> to redo all of it. only check required is the ip-checksum validation.
>>>> I think we could avoid it in most of cases by checking skb length to
>>>> ipheader length before verifying the ip header-checksum.
>>>
>>> Shouldn't the IP header checksum be verified even earlier, like in
>>> key_extract(), before actually using any of the fields in the IP
>>> header?
>>
>> Something like this for verifying the IP header checksum (not tested):
>>
> AFAIU openflow does not need this verification, so it is not required
> in flow extract.
>
>
> Okay. How about my proposed trimming implementation, caching the pad length
> in the ovs cb?
>
Caching the length is not that simple, OVS actions can change the
length. Keeping it consistent with packet would be more work, so lets
calculate it in ovs-ct function.
Pravin Shelar Jan. 9, 2018, 12:05 a.m. UTC | #11
On Sat, Jan 6, 2018 at 10:57 AM, Pravin Shelar <pshelar@ovn.org> wrote:
> On Fri, Jan 5, 2018 at 10:59 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>>
>>
>> On Jan 5, 2018 22:17, "Pravin Shelar" <pshelar@ovn.org> wrote:
>>
>> On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk <eswierk@skyportsystems.com>
>> wrote:
>>> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk <eswierk@skyportsystems.com>
>>> wrote:
>>>> On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>>>>> OVS already pull all required headers in skb linear data, so no need
>>>>> to redo all of it. only check required is the ip-checksum validation.
>>>>> I think we could avoid it in most of cases by checking skb length to
>>>>> ipheader length before verifying the ip header-checksum.
>>>>
>>>> Shouldn't the IP header checksum be verified even earlier, like in
>>>> key_extract(), before actually using any of the fields in the IP
>>>> header?
>>>
>>> Something like this for verifying the IP header checksum (not tested):
>>>
>> AFAIU openflow does not need this verification, so it is not required
>> in flow extract.
>>
>>
>> Okay. How about my proposed trimming implementation, caching the pad length
>> in the ovs cb?
>>
> Caching the length is not that simple, OVS actions can change the
> length. Keeping it consistent with packet would be more work, so lets
> calculate it in ovs-ct function.

You could make it specific for skb-len-trimming, something like
boolean flag. so that it is easy to reason with.
Li,Rongqing via dev Jan. 9, 2018, 3:02 a.m. UTC | #12
On 1/6/18 10:57, Pravin Shelar wrote:
> On Fri, Jan 5, 2018 at 10:59 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>>
>>
>> On Jan 5, 2018 22:17, "Pravin Shelar" <pshelar@ovn.org> wrote:
>>
>> On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk <eswierk@skyportsystems.com>
>> wrote:
>>> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk <eswierk@skyportsystems.com>
>>> wrote:
>>>> On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>>>>> OVS already pull all required headers in skb linear data, so no need
>>>>> to redo all of it. only check required is the ip-checksum validation.
>>>>> I think we could avoid it in most of cases by checking skb length to
>>>>> ipheader length before verifying the ip header-checksum.
>>>>
>>>> Shouldn't the IP header checksum be verified even earlier, like in
>>>> key_extract(), before actually using any of the fields in the IP
>>>> header?
>>>
>>> Something like this for verifying the IP header checksum (not tested):
>>>
>> AFAIU openflow does not need this verification, so it is not required
>> in flow extract.
>>
>>
>> Okay. How about my proposed trimming implementation, caching the pad length
>> in the ovs cb?
>>
> Caching the length is not that simple, OVS actions can change the
> length. Keeping it consistent with packet would be more work, so lets
> calculate it in ovs-ct function.
> 

Something like this?

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a38c80e..282325d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4084,6 +4084,8 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
 				     unsigned int transport_len,
 				     __sum16(*skb_chkf)(struct sk_buff *skb));
 
+int skb_network_trim(struct sk_buff *skb);
+
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
  * @skb: skb to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 08f5740..c68e927 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4740,6 +4740,41 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_checksum_trimmed);
 
+/**
+ * skb_network_trim - trim skb to length specified by the network header
+ * @skb: the skb to trim
+ *
+ * Trims the skb to the length specified by the network header,
+ * removing any trailing padding. Leaves the skb alone if the protocol
+ * is not IP or IPv6. Frees the skb on error.
+ * 
+ * Caller needs to pull the skb to the network header.
+ */
+int skb_network_trim(struct sk_buff *skb)
+{
+	unsigned int len;
+	int err;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		len = ntohs(ip_hdr(skb)->tot_len);
+		break;
+	case htons(ETH_P_IPV6):
+		len = sizeof(struct ipv6hdr)
+			+ ntohs(ipv6_hdr(skb)->payload_len);
+		break;
+	default:
+		len = skb->len;
+	}
+
+	err = pskb_trim_rcsum(skb, len);
+	if (unlikely(err))
+		kfree_skb(skb);
+
+	return err;
+}
+EXPORT_SYMBOL(skb_network_trim);
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb)
 {
 	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index b27c5c6..73418d3 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1112,6 +1112,10 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 	nh_ofs = skb_network_offset(skb);
 	skb_pull_rcsum(skb, nh_ofs);
 
+	err = skb_network_trim(skb);
+	if (err)
+		return err;
+
 	if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
 		err = handle_fragments(net, key, info->zone.id, skb);
 		if (err)
Pravin Shelar Jan. 9, 2018, 10:06 p.m. UTC | #13
On Mon, Jan 8, 2018 at 7:02 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
> On 1/6/18 10:57, Pravin Shelar wrote:
>> On Fri, Jan 5, 2018 at 10:59 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>>>
>>>
>>> On Jan 5, 2018 22:17, "Pravin Shelar" <pshelar@ovn.org> wrote:
>>>
>>> On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk <eswierk@skyportsystems.com>
>>> wrote:
>>>> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk <eswierk@skyportsystems.com>
>>>> wrote:
>>>>> On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>>>>>> OVS already pull all required headers in skb linear data, so no need
>>>>>> to redo all of it. only check required is the ip-checksum validation.
>>>>>> I think we could avoid it in most of cases by checking skb length to
>>>>>> ipheader length before verifying the ip header-checksum.
>>>>>
>>>>> Shouldn't the IP header checksum be verified even earlier, like in
>>>>> key_extract(), before actually using any of the fields in the IP
>>>>> header?
>>>>
>>>> Something like this for verifying the IP header checksum (not tested):
>>>>
>>> AFAIU openflow does not need this verification, so it is not required
>>> in flow extract.
>>>
>>>
>>> Okay. How about my proposed trimming implementation, caching the pad length
>>> in the ovs cb?
>>>
>> Caching the length is not that simple, OVS actions can change the
>> length. Keeping it consistent with packet would be more work, so lets
>> calculate it in ovs-ct function.
>>
>
> Something like this?
>
Sure, Can you submit formal patch?

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index a38c80e..282325d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -4084,6 +4084,8 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
>                                      unsigned int transport_len,
>                                      __sum16(*skb_chkf)(struct sk_buff *skb));
>
> +int skb_network_trim(struct sk_buff *skb);
> +
>  /**
>   * skb_head_is_locked - Determine if the skb->head is locked down
>   * @skb: skb to check
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 08f5740..c68e927 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -4740,6 +4740,41 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
>  }
>  EXPORT_SYMBOL(skb_checksum_trimmed);
>
> +/**
> + * skb_network_trim - trim skb to length specified by the network header
> + * @skb: the skb to trim
> + *
> + * Trims the skb to the length specified by the network header,
> + * removing any trailing padding. Leaves the skb alone if the protocol
> + * is not IP or IPv6. Frees the skb on error.
> + *
> + * Caller needs to pull the skb to the network header.
> + */
> +int skb_network_trim(struct sk_buff *skb)
> +{
> +       unsigned int len;
> +       int err;
> +
> +       switch (skb->protocol) {
> +       case htons(ETH_P_IP):
> +               len = ntohs(ip_hdr(skb)->tot_len);
> +               break;
> +       case htons(ETH_P_IPV6):
> +               len = sizeof(struct ipv6hdr)
> +                       + ntohs(ipv6_hdr(skb)->payload_len);
> +               break;
> +       default:
> +               len = skb->len;
> +       }
> +
> +       err = pskb_trim_rcsum(skb, len);
> +       if (unlikely(err))
> +               kfree_skb(skb);
> +
> +       return err;
> +}
> +EXPORT_SYMBOL(skb_network_trim);
> +
>  void __skb_warn_lro_forwarding(const struct sk_buff *skb)
>  {
>         net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> index b27c5c6..73418d3 100644
> --- a/net/openvswitch/conntrack.c
> +++ b/net/openvswitch/conntrack.c
> @@ -1112,6 +1112,10 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
>         nh_ofs = skb_network_offset(skb);
>         skb_pull_rcsum(skb, nh_ofs);
>
> +       err = skb_network_trim(skb);
> +       if (err)
> +               return err;
> +
>         if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
>                 err = handle_fragments(net, key, info->zone.id, skb);
>                 if (err)
diff mbox series

Patch

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index b27c5c6..1bdc78f 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -703,6 +703,33 @@  static bool skb_nfct_cached(struct net *net,
 	return ct_executed;
 }
 
+/* Trim the skb to the L3 length. Assumes the skb is already pulled to
+ * the L3 header. The skb is freed on error.
+ */
+static int skb_trim_l3(struct sk_buff *skb)
+{
+	unsigned int nh_len;
+	int err;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		nh_len = ntohs(ip_hdr(skb)->tot_len);
+		break;
+	case htons(ETH_P_IPV6):
+		nh_len = ntohs(ipv6_hdr(skb)->payload_len)
+			+ sizeof(struct ipv6hdr);
+		break;
+	default:
+		nh_len = skb->len;
+	}
+
+	err = pskb_trim_rcsum(skb, nh_len);
+	if (err)
+		kfree_skb(skb);
+
+	return err;
+}
+
 #ifdef CONFIG_NF_NAT_NEEDED
 /* Modelled after nf_nat_ipv[46]_fn().
  * range is only used for new, uninitialized NAT state.
@@ -715,8 +742,12 @@  static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
 {
 	int hooknum, nh_off, err = NF_ACCEPT;
 
+	/* The nat module expects to be working at L3. */
 	nh_off = skb_network_offset(skb);
 	skb_pull_rcsum(skb, nh_off);
+	err = skb_trim_l3(skb);
+	if (err)
+		return err;
 
 	/* See HOOK2MANIP(). */
 	if (maniptype == NF_NAT_MANIP_SRC)
@@ -1111,6 +1142,9 @@  int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 	/* The conntrack module expects to be working at L3. */
 	nh_ofs = skb_network_offset(skb);
 	skb_pull_rcsum(skb, nh_ofs);
+	err = skb_trim_l3(skb);
+	if (err)
+		return err;
 
 	if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
 		err = handle_fragments(net, key, info->zone.id, skb);