diff mbox series

[net] net: Do not clear the socket TX queue in sock_orphan()

Message ID 1592748544-41555-1-git-send-email-tariqt@mellanox.com
State Superseded
Delegated to: David Miller
Headers show
Series [net] net: Do not clear the socket TX queue in sock_orphan() | expand

Commit Message

Tariq Toukan June 21, 2020, 2:09 p.m. UTC
sock_orphan() call to sk_set_socket() implies clearing the sock TX queue.
This might cause unexpected out-of-order transmit, as outstanding packets
can pick a different TX queue and bypass the ones already queued.
This is undesired in general. More specifically, it breaks the in-order
scheduling property guarantee for device-offloaded TLS sockets.

Introduce a function variation __sk_set_socket() that does not clear
the TX queue, and call it from sock_orphan().
All other callers of sk_set_socket() do not operate on an active socket,
so they do not need this change.

Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Boris Pismenny <borisp@mellanox.com>
---
 include/net/sock.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

Please queue for -stable.

Comments

Eric Dumazet June 22, 2020, 3:53 p.m. UTC | #1
On 6/21/20 7:09 AM, Tariq Toukan wrote:
> sock_orphan() call to sk_set_socket() implies clearing the sock TX queue.
> This might cause unexpected out-of-order transmit, as outstanding packets
> can pick a different TX queue and bypass the ones already queued.
> This is undesired in general. More specifically, it breaks the in-order
> scheduling property guarantee for device-offloaded TLS sockets.
> 
> Introduce a function variation __sk_set_socket() that does not clear
> the TX queue, and call it from sock_orphan().
> All other callers of sk_set_socket() do not operate on an active socket,
> so they do not need this change.
> 
> Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping")
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> Reviewed-by: Boris Pismenny <borisp@mellanox.com>
> ---
>  include/net/sock.h | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> Please queue for -stable.
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index c53cc42b5ab9..23e43f3d79f0 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk)
>  }
>  #endif
>  
> +static inline void __sk_set_socket(struct sock *sk, struct socket *sock)
> +{
> +	sk->sk_socket = sock;
> +}
> +
>  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>  {
>  	sk_tx_queue_clear(sk);
> -	sk->sk_socket = sock;
> +	__sk_set_socket(sk, sock);
>  }
>  

Hmm...

I think we should have a single sk_set_socket() call, and remove
the sk_tx_queue_clear() from it.

sk_tx_queue_clear() should be put where it is needed, instead of being hidden
in sk_set_socket()

diff --git a/include/net/sock.h b/include/net/sock.h
index c53cc42b5ab92d0062519e60435b85c75564a967..3428619faae4340485b200f49d9cce4fb09086b3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk)
 
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
-       sk_tx_queue_clear(sk);
        sk->sk_socket = sock;
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c4acf1f0220b1f925ebcfaa847632ec0dbe0b9b..134de0d37f77ba781b2b3341c94a97a1b2d57a2d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                cgroup_sk_alloc(&sk->sk_cgrp_data);
                sock_update_classid(&sk->sk_cgrp_data);
                sock_update_netprioidx(&sk->sk_cgrp_data);
+               sk_tx_queue_clear(sk);
        }
 
        return sk;
@@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                 */
                sk_refcnt_debug_inc(newsk);
                sk_set_socket(newsk, NULL);
+               sk_tx_queue_clear(newsk);
                RCU_INIT_POINTER(newsk->sk_wq, NULL);
 
                if (newsk->sk_prot->sockets_allocated)
Tariq Toukan June 22, 2020, 4:24 p.m. UTC | #2
On 6/22/2020 6:53 PM, Eric Dumazet wrote:
> 
> 
> On 6/21/20 7:09 AM, Tariq Toukan wrote:
>> sock_orphan() call to sk_set_socket() implies clearing the sock TX queue.
>> This might cause unexpected out-of-order transmit, as outstanding packets
>> can pick a different TX queue and bypass the ones already queued.
>> This is undesired in general. More specifically, it breaks the in-order
>> scheduling property guarantee for device-offloaded TLS sockets.
>>
>> Introduce a function variation __sk_set_socket() that does not clear
>> the TX queue, and call it from sock_orphan().
>> All other callers of sk_set_socket() do not operate on an active socket,
>> so they do not need this change.
>>
>> Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping")
>> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
>> Reviewed-by: Boris Pismenny <borisp@mellanox.com>
>> ---
>>   include/net/sock.h | 9 +++++++--
>>   1 file changed, 7 insertions(+), 2 deletions(-)
>>
>> Please queue for -stable.
>>
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index c53cc42b5ab9..23e43f3d79f0 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk)
>>   }
>>   #endif
>>   
>> +static inline void __sk_set_socket(struct sock *sk, struct socket *sock)
>> +{
>> +	sk->sk_socket = sock;
>> +}
>> +
>>   static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>   {
>>   	sk_tx_queue_clear(sk);
>> -	sk->sk_socket = sock;
>> +	__sk_set_socket(sk, sock);
>>   }
>>   
> 
> Hmm...
> 
> I think we should have a single sk_set_socket() call, and remove
> the sk_tx_queue_clear() from it.
> 
> sk_tx_queue_clear() should be put where it is needed, instead of being hidden
> in sk_set_socket()
> 

Thanks Eric, sounds good to me, I will respin, just have some questions 
below.

> diff --git a/include/net/sock.h b/include/net/sock.h
> index c53cc42b5ab92d0062519e60435b85c75564a967..3428619faae4340485b200f49d9cce4fb09086b3 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk)
>   
>   static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>   {
> -       sk_tx_queue_clear(sk);
>          sk->sk_socket = sock;
>   }
>   
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 6c4acf1f0220b1f925ebcfaa847632ec0dbe0b9b..134de0d37f77ba781b2b3341c94a97a1b2d57a2d 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
>                  cgroup_sk_alloc(&sk->sk_cgrp_data);
>                  sock_update_classid(&sk->sk_cgrp_data);
>                  sock_update_netprioidx(&sk->sk_cgrp_data);
> +               sk_tx_queue_clear(sk);

Why add it here?
I don't see a call to sk_set_socket().

>          }
>   
>          return sk;
> @@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>                   */
>                  sk_refcnt_debug_inc(newsk);
>                  sk_set_socket(newsk, NULL);
> +               sk_tx_queue_clear(newsk);
>                  RCU_INIT_POINTER(newsk->sk_wq, NULL);
>   
>                  if (newsk->sk_prot->sockets_allocated)
> 

So in addition to sock_orphan(), now sk_tx_queue_clear() won't be called 
also from:
1. sock_graft(): Looks fine to me.
2. sock_init_data(): I think we should add an explicit call to 
sk_tx_queue_clear() here. The one for RX sk_rx_queue_clear() is already 
there.
3. mptcp_sock_graft(): Looks fine to me.

Regards,
Tariq
Eric Dumazet June 22, 2020, 4:56 p.m. UTC | #3
On 6/22/20 9:24 AM, Tariq Toukan wrote:
> 
> 
> On 6/22/2020 6:53 PM, Eric Dumazet wrote:
>>
>>
>> On 6/21/20 7:09 AM, Tariq Toukan wrote:
>>> sock_orphan() call to sk_set_socket() implies clearing the sock TX queue.
>>> This might cause unexpected out-of-order transmit, as outstanding packets
>>> can pick a different TX queue and bypass the ones already queued.
>>> This is undesired in general. More specifically, it breaks the in-order
>>> scheduling property guarantee for device-offloaded TLS sockets.
>>>
>>> Introduce a function variation __sk_set_socket() that does not clear
>>> the TX queue, and call it from sock_orphan().
>>> All other callers of sk_set_socket() do not operate on an active socket,
>>> so they do not need this change.
>>>
>>> Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping")
>>> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
>>> Reviewed-by: Boris Pismenny <borisp@mellanox.com>
>>> ---
>>>   include/net/sock.h | 9 +++++++--
>>>   1 file changed, 7 insertions(+), 2 deletions(-)
>>>
>>> Please queue for -stable.
>>>
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index c53cc42b5ab9..23e43f3d79f0 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk)
>>>   }
>>>   #endif
>>>   +static inline void __sk_set_socket(struct sock *sk, struct socket *sock)
>>> +{
>>> +    sk->sk_socket = sock;
>>> +}
>>> +
>>>   static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>>   {
>>>       sk_tx_queue_clear(sk);
>>> -    sk->sk_socket = sock;
>>> +    __sk_set_socket(sk, sock);
>>>   }
>>>   
>>
>> Hmm...
>>
>> I think we should have a single sk_set_socket() call, and remove
>> the sk_tx_queue_clear() from it.
>>
>> sk_tx_queue_clear() should be put where it is needed, instead of being hidden
>> in sk_set_socket()
>>
> 
> Thanks Eric, sounds good to me, I will respin, just have some questions below.
> 
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index c53cc42b5ab92d0062519e60435b85c75564a967..3428619faae4340485b200f49d9cce4fb09086b3 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk)
>>     static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>   {
>> -       sk_tx_queue_clear(sk);
>>          sk->sk_socket = sock;
>>   }
>>   diff --git a/net/core/sock.c b/net/core/sock.c
>> index 6c4acf1f0220b1f925ebcfaa847632ec0dbe0b9b..134de0d37f77ba781b2b3341c94a97a1b2d57a2d 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
>>                  cgroup_sk_alloc(&sk->sk_cgrp_data);
>>                  sock_update_classid(&sk->sk_cgrp_data);
>>                  sock_update_netprioidx(&sk->sk_cgrp_data);
>> +               sk_tx_queue_clear(sk);
> 
> Why add it here?
> I don't see a call to sk_set_socket().

Yes, but the intent is to set the initial value of sk->sk_tx_queue_mapping (USHRT_MAX)
when socket object is allocated/populated, not later in some unrelated paths.

We move into one location all the initializers.
(Most fields initial value is 0, so we do not clear them a second time)

> 
>>          }
>>            return sk;
>> @@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>>                   */
>>                  sk_refcnt_debug_inc(newsk);
>>                  sk_set_socket(newsk, NULL);
>> +               sk_tx_queue_clear(newsk);
>>                  RCU_INIT_POINTER(newsk->sk_wq, NULL);
>>                    if (newsk->sk_prot->sockets_allocated)
>>
> 
> So in addition to sock_orphan(), now sk_tx_queue_clear() won't be called also from:
> 1. sock_graft(): Looks fine to me.
> 2. sock_init_data(): I think we should add an explicit call to sk_tx_queue_clear() here. The one for RX sk_rx_queue_clear() is already there.

Why ? Initial value found is socket should be just fine.

If not, normal skb->ooo_okay should prevail, if protocols really care about OOO.

> 3. mptcp_sock_graft(): Looks fine to me.
> 
> Regards,
> Tariq
Tariq Toukan June 22, 2020, 6:25 p.m. UTC | #4
On 6/22/2020 7:56 PM, Eric Dumazet wrote:
> 
> 
> On 6/22/20 9:24 AM, Tariq Toukan wrote:
>>
>>
>> On 6/22/2020 6:53 PM, Eric Dumazet wrote:
>>>
>>>
>>> On 6/21/20 7:09 AM, Tariq Toukan wrote:
>>>> sock_orphan() call to sk_set_socket() implies clearing the sock TX queue.
>>>> This might cause unexpected out-of-order transmit, as outstanding packets
>>>> can pick a different TX queue and bypass the ones already queued.
>>>> This is undesired in general. More specifically, it breaks the in-order
>>>> scheduling property guarantee for device-offloaded TLS sockets.
>>>>
>>>> Introduce a function variation __sk_set_socket() that does not clear
>>>> the TX queue, and call it from sock_orphan().
>>>> All other callers of sk_set_socket() do not operate on an active socket,
>>>> so they do not need this change.
>>>>
>>>> Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping")
>>>> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
>>>> Reviewed-by: Boris Pismenny <borisp@mellanox.com>
>>>> ---
>>>>    include/net/sock.h | 9 +++++++--
>>>>    1 file changed, 7 insertions(+), 2 deletions(-)
>>>>
>>>> Please queue for -stable.
>>>>
>>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>>> index c53cc42b5ab9..23e43f3d79f0 100644
>>>> --- a/include/net/sock.h
>>>> +++ b/include/net/sock.h
>>>> @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk)
>>>>    }
>>>>    #endif
>>>>    +static inline void __sk_set_socket(struct sock *sk, struct socket *sock)
>>>> +{
>>>> +    sk->sk_socket = sock;
>>>> +}
>>>> +
>>>>    static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>>>    {
>>>>        sk_tx_queue_clear(sk);
>>>> -    sk->sk_socket = sock;
>>>> +    __sk_set_socket(sk, sock);
>>>>    }
>>>>    
>>>
>>> Hmm...
>>>
>>> I think we should have a single sk_set_socket() call, and remove
>>> the sk_tx_queue_clear() from it.
>>>
>>> sk_tx_queue_clear() should be put where it is needed, instead of being hidden
>>> in sk_set_socket()
>>>
>>
>> Thanks Eric, sounds good to me, I will respin, just have some questions below.
>>
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index c53cc42b5ab92d0062519e60435b85c75564a967..3428619faae4340485b200f49d9cce4fb09086b3 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk)
>>>      static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>>>    {
>>> -       sk_tx_queue_clear(sk);
>>>           sk->sk_socket = sock;
>>>    }
>>>    diff --git a/net/core/sock.c b/net/core/sock.c
>>> index 6c4acf1f0220b1f925ebcfaa847632ec0dbe0b9b..134de0d37f77ba781b2b3341c94a97a1b2d57a2d 100644
>>> --- a/net/core/sock.c
>>> +++ b/net/core/sock.c
>>> @@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
>>>                   cgroup_sk_alloc(&sk->sk_cgrp_data);
>>>                   sock_update_classid(&sk->sk_cgrp_data);
>>>                   sock_update_netprioidx(&sk->sk_cgrp_data);
>>> +               sk_tx_queue_clear(sk);
>>
>> Why add it here?
>> I don't see a call to sk_set_socket().
> 
> Yes, but the intent is to set the initial value of sk->sk_tx_queue_mapping (USHRT_MAX)
> when socket object is allocated/populated, not later in some unrelated paths.
> 
> We move into one location all the initializers.
> (Most fields initial value is 0, so we do not clear them a second time)
> 
>>
>>>           }
>>>             return sk;
>>> @@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>>>                    */
>>>                   sk_refcnt_debug_inc(newsk);
>>>                   sk_set_socket(newsk, NULL);
>>> +               sk_tx_queue_clear(newsk);
>>>                   RCU_INIT_POINTER(newsk->sk_wq, NULL);
>>>                     if (newsk->sk_prot->sockets_allocated)
>>>
>>
>> So in addition to sock_orphan(), now sk_tx_queue_clear() won't be called also from:
>> 1. sock_graft(): Looks fine to me.
>> 2. sock_init_data(): I think we should add an explicit call to sk_tx_queue_clear() here. The one for RX sk_rx_queue_clear() is already there.
> 
> Why ? Initial value found is socket should be just fine.
> 
> If not, normal skb->ooo_okay should prevail, if protocols really care about OOO.
> 
>> 3. mptcp_sock_graft(): Looks fine to me.
>>
>> Regards,
>> Tariq

Thanks.
I prepared the patch, will test it against the bug repro I have and submit.

Tariq
diff mbox series

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index c53cc42b5ab9..23e43f3d79f0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1846,10 +1846,15 @@  static inline int sk_rx_queue_get(const struct sock *sk)
 }
 #endif
 
+static inline void __sk_set_socket(struct sock *sk, struct socket *sock)
+{
+	sk->sk_socket = sock;
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
 	sk_tx_queue_clear(sk);
-	sk->sk_socket = sock;
+	__sk_set_socket(sk, sock);
 }
 
 static inline wait_queue_head_t *sk_sleep(struct sock *sk)
@@ -1868,7 +1873,7 @@  static inline void sock_orphan(struct sock *sk)
 {
 	write_lock_bh(&sk->sk_callback_lock);
 	sock_set_flag(sk, SOCK_DEAD);
-	sk_set_socket(sk, NULL);
+	__sk_set_socket(sk, NULL);
 	sk->sk_wq  = NULL;
 	write_unlock_bh(&sk->sk_callback_lock);
 }