[v2,net,3/3] wireguard: send: account for mtu=0 devices
diff mbox series

Message ID 20200214173407.52521-4-Jason@zx2c4.com
State Superseded, archived
Delegated to: David Miller
Headers show
Series
  • wireguard fixes for 5.6-rc2
Related show

Commit Message

Jason A. Donenfeld Feb. 14, 2020, 5:34 p.m. UTC
It turns out there's an easy way to get packets queued up while still
having an MTU of zero, and that's via persistent keep alive. This commit
makes sure that in whatever condition, we don't wind up dividing by
zero. Note that an MTU of zero for a wireguard interface is something
quasi-valid, so I don't think the correct fix is to limit it via
min_mtu. This can be reproduced easily with:

ip link add wg0 type wireguard
ip link add wg1 type wireguard
ip link set wg0 up mtu 0
ip link set wg1 up
wg set wg0 private-key <(wg genkey)
wg set wg1 listen-port 1 private-key <(wg genkey) peer $(wg show wg0 public-key)
wg set wg0 peer $(wg show wg1 public-key) persistent-keepalive 1 endpoint 127.0.0.1:1

However, while min_mtu=0 seems fine, it makes sense to restrict the
max_mtu. This commit also restricts the maximum MTU to the greatest
number for which rounding up to the padding multiple won't overflow a
signed integer. Packets this large were always rejected anyway
eventually, due to checks deeper in, but it seems more sound not to even
let the administrator configure something that won't work anyway.

We use this opportunity to clean up this function a bit so that it's
clear which paths we're expecting.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Eric Dumazet <edumazet@google.com>
---
 drivers/net/wireguard/device.c |  7 ++++---
 drivers/net/wireguard/send.c   | 16 +++++++++++-----
 2 files changed, 15 insertions(+), 8 deletions(-)

Comments

Eric Dumazet Feb. 14, 2020, 5:56 p.m. UTC | #1
On 2/14/20 9:34 AM, Jason A. Donenfeld wrote:
> It turns out there's an easy way to get packets queued up while still
> having an MTU of zero, and that's via persistent keep alive. This commit
> makes sure that in whatever condition, we don't wind up dividing by
> zero. Note that an MTU of zero for a wireguard interface is something
> quasi-valid, so I don't think the correct fix is to limit it via
> min_mtu. This can be reproduced easily with:
> 
> ip link add wg0 type wireguard
> ip link add wg1 type wireguard
> ip link set wg0 up mtu 0
> ip link set wg1 up
> wg set wg0 private-key <(wg genkey)
> wg set wg1 listen-port 1 private-key <(wg genkey) peer $(wg show wg0 public-key)
> wg set wg0 peer $(wg show wg1 public-key) persistent-keepalive 1 endpoint 127.0.0.1:1
> 
> However, while min_mtu=0 seems fine, it makes sense to restrict the
> max_mtu. This commit also restricts the maximum MTU to the greatest
> number for which rounding up to the padding multiple won't overflow a
> signed integer. Packets this large were always rejected anyway
> eventually, due to checks deeper in, but it seems more sound not to even
> let the administrator configure something that won't work anyway.
> 
> We use this opportunity to clean up this function a bit so that it's
> clear which paths we're expecting.
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Cc: Eric Dumazet <edumazet@google.com>
> ---
>  drivers/net/wireguard/device.c |  7 ++++---
>  drivers/net/wireguard/send.c   | 16 +++++++++++-----
>  2 files changed, 15 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c
> index 43db442b1373..cdc96968b0f4 100644
> --- a/drivers/net/wireguard/device.c
> +++ b/drivers/net/wireguard/device.c
> @@ -258,6 +258,8 @@ static void wg_setup(struct net_device *dev)
>  	enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM |
>  				    NETIF_F_SG | NETIF_F_GSO |
>  				    NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA };
> +	const int overhead = MESSAGE_MINIMUM_LENGTH + sizeof(struct udphdr) +
> +			     max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
>  
>  	dev->netdev_ops = &netdev_ops;
>  	dev->hard_header_len = 0;
> @@ -271,9 +273,8 @@ static void wg_setup(struct net_device *dev)
>  	dev->features |= WG_NETDEV_FEATURES;
>  	dev->hw_features |= WG_NETDEV_FEATURES;
>  	dev->hw_enc_features |= WG_NETDEV_FEATURES;
> -	dev->mtu = ETH_DATA_LEN - MESSAGE_MINIMUM_LENGTH -
> -		   sizeof(struct udphdr) -
> -		   max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
> +	dev->mtu = ETH_DATA_LEN - overhead;
> +	dev->max_mtu = round_down(INT_MAX, MESSAGE_PADDING_MULTIPLE) - overhead;
>  
>  	SET_NETDEV_DEVTYPE(dev, &device_type);
>  
> diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c
> index c13260563446..2a9990ab66cd 100644
> --- a/drivers/net/wireguard/send.c
> +++ b/drivers/net/wireguard/send.c
> @@ -143,16 +143,22 @@ static void keep_key_fresh(struct wg_peer *peer)
>  
>  static unsigned int calculate_skb_padding(struct sk_buff *skb)
>  {
> +	unsigned int padded_size, last_unit = skb->len;
> +
> +	if (unlikely(!PACKET_CB(skb)->mtu))
> +		return -last_unit % MESSAGE_PADDING_MULTIPLE;

My brain hurts.

> +
>  	/* We do this modulo business with the MTU, just in case the networking
>  	 * layer gives us a packet that's bigger than the MTU. In that case, we
>  	 * wouldn't want the final subtraction to overflow in the case of the
> -	 * padded_size being clamped.
> +	 * padded_size being clamped. Fortunately, that's very rarely the case,
> +	 * so we optimize for that not happening.
>  	 */
> -	unsigned int last_unit = skb->len % PACKET_CB(skb)->mtu;
> -	unsigned int padded_size = ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE);
> +	if (unlikely(last_unit > PACKET_CB(skb)->mtu))
> +		last_unit %= PACKET_CB(skb)->mtu;
>  
> -	if (padded_size > PACKET_CB(skb)->mtu)
> -		padded_size = PACKET_CB(skb)->mtu;
> +	padded_size = min(PACKET_CB(skb)->mtu,
> +			  ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE));
>  	return padded_size - last_unit;
>  }
>  


Oh dear, can you describe what you expect of a wireguard device with mtu == 0 or mtu == 1?

Why not simply disallow silly configurations, instead of adding convoluted tests in the fast path?

We are speaking of tunnels adding quite a lot of headers, so we better not try to make them
work on networks with tiny mtu. Just say no to syzbot.
Jason A. Donenfeld Feb. 14, 2020, 6:15 p.m. UTC | #2
On Fri, Feb 14, 2020 at 6:56 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Oh dear, can you describe what do you expect of a wireguard device with mtu == 0 or mtu == 1
>
> Why simply not allowing silly configurations, instead of convoluted tests in fast path ?
>
> We are speaking of tunnels adding quite a lot of headers, so we better not try to make them
> work on networks with tiny mtu. Just say no to syzbot.

The idea was that wireguard might still be useful for the persistent
keepalive stuff. This branch becomes very cold very fast, so I don't
think it makes a difference performance wise, but if you feel strongly
about it, I can get rid of it and set a non-zero min_mtu that's the
smallest thing wireguard's xmit semantics will accept. It sounds like
you'd prefer that?
Eric Dumazet Feb. 14, 2020, 6:22 p.m. UTC | #3
On 2/14/20 10:15 AM, Jason A. Donenfeld wrote:
> On Fri, Feb 14, 2020 at 6:56 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> Oh dear, can you describe what do you expect of a wireguard device with mtu == 0 or mtu == 1
>>
>> Why simply not allowing silly configurations, instead of convoluted tests in fast path ?
>>
>> We are speaking of tunnels adding quite a lot of headers, so we better not try to make them
>> work on networks with tiny mtu. Just say no to syzbot.
> 
> The idea was that wireguard might still be useful for the persistent
> keepalive stuff. This branch becomes very cold very fast, so I don't
> think it makes a difference performance wise, but if you feel strongly
> about it, I can get rid of it and set a non-zero min_mtu that's the
> smallest thing wireguard's xmit semantics will accept. It sounds like
> you'd prefer that?
> 
Well, if you believe that wireguard in persistent keepalive
has some value on its own, I guess that we will have to support this mode.

Some legacy devices can have arbitrary mtu, and this has caused headaches.
I was hoping that for brand new devices, we could have saner limits.

About setting max_mtu to roughly INT_MAX, does it mean wireguard will attempt
to send UDP datagrams bigger than 64K ? Where is the segmentation done ?
Jason A. Donenfeld Feb. 14, 2020, 6:37 p.m. UTC | #4
On 2/14/20, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
> On 2/14/20 10:15 AM, Jason A. Donenfeld wrote:
>> On Fri, Feb 14, 2020 at 6:56 PM Eric Dumazet <eric.dumazet@gmail.com>
>> wrote:
>>> Oh dear, can you describe what do you expect of a wireguard device with
>>> mtu == 0 or mtu == 1
>>>
>>> Why simply not allowing silly configurations, instead of convoluted tests
>>> in fast path ?
>>>
>>> We are speaking of tunnels adding quite a lot of headers, so we better
>>> not try to make them
>>> work on networks with tiny mtu. Just say no to syzbot.
>>
>> The idea was that wireguard might still be useful for the persistent
>> keepalive stuff. This branch becomes very cold very fast, so I don't
>> think it makes a difference performance wise, but if you feel strongly
>> about it, I can get rid of it and set a non-zero min_mtu that's the
>> smallest thing wireguard's xmit semantics will accept. It sounds like
>> you'd prefer that?
>>
> Well, if you believe that wireguard in persistent keepalive
> has some value on its own, I guess that we will have to support this mode.

Alright.

>
> Some legacy devices can have arbitrary mtu, and this has caused headaches.
> I was hoping that for brand new devices, we could have saner limits.
>
> About setting max_mtu to ~MAX_INT, does it mean wireguard will attempt
> to send UDP datagrams bigger than 64K ? Where is the segmentation done ?

Before passing packets off to the udp tunnel api, we indicate that we
support ip segmentation, and then it gets handled and fragmented
deeper down. Check out socket.c. This winds up being sometimes useful
for some odd people when it's faster to encrypt longer packets on
networks with no loss. I can't say I generally recommend people go
that route, but some report benefitting from it.


>
Eric Dumazet Feb. 14, 2020, 6:53 p.m. UTC | #5
On 2/14/20 10:37 AM, Jason A. Donenfeld wrote:
> On 2/14/20, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>>
>> On 2/14/20 10:15 AM, Jason A. Donenfeld wrote:
>>> On Fri, Feb 14, 2020 at 6:56 PM Eric Dumazet <eric.dumazet@gmail.com>
>>> wrote:
>>>> Oh dear, can you describe what do you expect of a wireguard device with
>>>> mtu == 0 or mtu == 1
>>>>
>>>> Why simply not allowing silly configurations, instead of convoluted tests
>>>> in fast path ?
>>>>
>>>> We are speaking of tunnels adding quite a lot of headers, so we better
>>>> not try to make them
>>>> work on networks with tiny mtu. Just say no to syzbot.
>>>
>>> The idea was that wireguard might still be useful for the persistent
>>> keepalive stuff. This branch becomes very cold very fast, so I don't
>>> think it makes a difference performance wise, but if you feel strongly
>>> about it, I can get rid of it and set a non-zero min_mtu that's the
>>> smallest thing wireguard's xmit semantics will accept. It sounds like
>>> you'd prefer that?
>>>
>> Well, if you believe that wireguard in persistent keepalive
>> has some value on its own, I guess that we will have to support this mode.
> 
> Alright.
> 
>>
>> Some legacy devices can have arbitrary mtu, and this has caused headaches.
>> I was hoping that for brand new devices, we could have saner limits.
>>
>> About setting max_mtu to ~MAX_INT, does it mean wireguard will attempt
>> to send UDP datagrams bigger than 64K ? Where is the segmentation done ?
> 
> The before passings off to the udp tunnel api, we indicate that we
> support ip segmentation, and then it gets handled and fragmented
> deeper down. Check out socket.c. 

Okay. Speaking of socket.c, I found this wg_socket_reinit() snippet :

synchronize_rcu();
synchronize_net();

Which makes little sense. Please add a comment explaining why these two
calls are needed.
Jason A. Donenfeld Feb. 14, 2020, 9:57 p.m. UTC | #6
Hey Eric,

On Fri, Feb 14, 2020 at 7:53 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > The before passings off to the udp tunnel api, we indicate that we
> > support ip segmentation, and then it gets handled and fragmented
> > deeper down. Check out socket.c.
>
> Okay. Speaking of socket.c, I found this wg_socket_reinit() snippet :
>
> synchronize_rcu();
> synchronize_net();
>
> Which makes little sense. Please add a comment explaining why these two
> calls are needed.

Thanks, I appreciate your scrutiny here. Right again, you are. It
looks like that was added in 2017 after observing the pattern in other
drivers and seeing the documentation comment, "Wait for packets
currently being received to be done." That sounds like an important
thing to do before tearing down a socket. But here it makes no sense
at all, since synchronize_net() is just a wrapper around
synchronize_rcu() (and sometimes _expedited). And here, the
synchronize_rcu() usage makes sense to have, since this is as boring
of an rcu pattern as can be:

mutex_lock()
old = rcu_dereference_protected(x->y)
rcu_assign(x->y, new)
mutex_unlock()
synchronize_rcu()
free_it(old)

Straight out of the documentation. Having the extra synchronize_net()
in there adds nothing at all. I'll send a v3 of this 5.6-rc2 cleanup
series containing that removal.

Jason
Eric Dumazet Feb. 14, 2020, 10:30 p.m. UTC | #7
On 2/14/20 1:57 PM, Jason A. Donenfeld wrote:

> 
> Thanks, I appreciate your scrutiny here. Right again, you are. It
> looks like that was added in 2017 after observing the pattern in other
> drivers and seeing the documentation comment, "Wait for packets
> currently being received to be done." That sounds like an important
> thing to do before tearing down a socket. But here it makes no sense
> at all, since synchronize_net() is just a wrapper around
> synchronize_rcu() (and sometimes _expedited). And here, the
> synchronize_rcu() usage makes sense to have, since this is as boring
> of an rcu pattern as can be:
> 
> mutex_lock()
> old = rcu_dereference_protected(x->y)
> rcu_assign(x->y, new)
> mutex_unlock()
> synchronize_rcu()
> free_it(old)
> 
> Straight out of the documentation. Having the extra synchronize_net()
> in there adds nothing at all. I'll send a v3 of this 5.6-rc2 cleanup
> series containing that removal.
> 

Also note that UDP sockets have SOCK_RCU_FREE flag set, so core
networking also respects one RCU grace period before freeing them.

It is possible that no extra synchronize_{net|rcu}() call is needed,
but this is left as an exercise for future kernels :)
Jason A. Donenfeld Feb. 14, 2020, 10:53 p.m. UTC | #8
On Fri, Feb 14, 2020 at 11:30 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Also note that UDP sockets have SOCK_RCU_FREE flag set, so core
> networking also respect one RCU grace period before freeing them.

       if (use_call_rcu)
               call_rcu(&sk->sk_rcu, __sk_destruct);
       else
               __sk_destruct(&sk->sk_rcu);

Ah, that's handy indeed.

> It is possible that no extra synchronize_{net|rcu}() call is needed,
> but this is left as an exercise for future kernels :)

Cool, yea, sounds like something I should play with for 5.7.

Sending v3 out in a few minutes.

Patch
diff mbox series

diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c
index 43db442b1373..cdc96968b0f4 100644
--- a/drivers/net/wireguard/device.c
+++ b/drivers/net/wireguard/device.c
@@ -258,6 +258,8 @@  static void wg_setup(struct net_device *dev)
 	enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM |
 				    NETIF_F_SG | NETIF_F_GSO |
 				    NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA };
+	const int overhead = MESSAGE_MINIMUM_LENGTH + sizeof(struct udphdr) +
+			     max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
 
 	dev->netdev_ops = &netdev_ops;
 	dev->hard_header_len = 0;
@@ -271,9 +273,8 @@  static void wg_setup(struct net_device *dev)
 	dev->features |= WG_NETDEV_FEATURES;
 	dev->hw_features |= WG_NETDEV_FEATURES;
 	dev->hw_enc_features |= WG_NETDEV_FEATURES;
-	dev->mtu = ETH_DATA_LEN - MESSAGE_MINIMUM_LENGTH -
-		   sizeof(struct udphdr) -
-		   max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
+	dev->mtu = ETH_DATA_LEN - overhead;
+	dev->max_mtu = round_down(INT_MAX, MESSAGE_PADDING_MULTIPLE) - overhead;
 
 	SET_NETDEV_DEVTYPE(dev, &device_type);
 
diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c
index c13260563446..2a9990ab66cd 100644
--- a/drivers/net/wireguard/send.c
+++ b/drivers/net/wireguard/send.c
@@ -143,16 +143,22 @@  static void keep_key_fresh(struct wg_peer *peer)
 
 static unsigned int calculate_skb_padding(struct sk_buff *skb)
 {
+	unsigned int padded_size, last_unit = skb->len;
+
+	if (unlikely(!PACKET_CB(skb)->mtu))
+		return -last_unit % MESSAGE_PADDING_MULTIPLE;
+
 	/* We do this modulo business with the MTU, just in case the networking
 	 * layer gives us a packet that's bigger than the MTU. In that case, we
 	 * wouldn't want the final subtraction to overflow in the case of the
-	 * padded_size being clamped.
+	 * padded_size being clamped. Fortunately, that's very rarely the case,
+	 * so we optimize for that not happening.
 	 */
-	unsigned int last_unit = skb->len % PACKET_CB(skb)->mtu;
-	unsigned int padded_size = ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE);
+	if (unlikely(last_unit > PACKET_CB(skb)->mtu))
+		last_unit %= PACKET_CB(skb)->mtu;
 
-	if (padded_size > PACKET_CB(skb)->mtu)
-		padded_size = PACKET_CB(skb)->mtu;
+	padded_size = min(PACKET_CB(skb)->mtu,
+			  ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE));
 	return padded_size - last_unit;
 }