Message ID | 1503751671.11498.25.camel@edumazet-glaptop3.roam.corp.google.com |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
On 08/26/2017 05:47 AM, Eric Dumazet wrote: > On Fri, 2017-08-25 at 21:19 -0700, David Miller wrote: > >> Agreed, but the ARP resolution queue really needs to scale its backlog >> to the physical technology it is attached to. > Yes, last time (in 2011) we increased the old limit of 3 packets :/ > > We probably should match sysctl_wmem_max so that a single socket > provider would hit its sk_sndbuf limit Before: /proc/sys/net/ipv4/neigh/eth0/unres_qlen:34 /proc/sys/net/ipv4/neigh/eth0/unres_qlen_bytes:65536 /proc/sys/net/ipv4/neigh/gphy/unres_qlen:34 /proc/sys/net/ipv4/neigh/gphy/unres_qlen_bytes:65536 After: /proc/sys/net/ipv4/neigh/eth0/unres_qlen:106 /proc/sys/net/ipv4/neigh/eth0/unres_qlen_bytes:229376 /proc/sys/net/ipv4/neigh/gphy/unres_qlen:106 /proc/sys/net/ipv4/neigh/gphy/unres_qlen_bytes:229376 and this does help a lot with the test case reported over an hour, only 2 packets lost: # perf record -a -g -e skb:kfree_skb iperf -c 192.168.1.23 -b 900M -t 3600 -u ------------------------------------------------------------ Client connecting to 192.168.1.23, UDP port 5001 Sending 1470 byte datagrams, IPG target: 13.07 us (kalman adjust) UDP buffer size: 224 KByte (default) ------------------------------------------------------------ [ 4] local 192.168.1.66 port 48209 connected with 192.168.1.23 port 5001 write failed: Invalid argument [ ID] Interval Transfer Bandwidth [ 4] 0.0-404.9 sec 4.51 GBytes 95.7 Mbits/sec [ 4] Sent 3294727 datagrams [ 4] Server Report: [ 4] 0.0-405.1 sec 4.51 GBytes 95.6 Mbits/sec 14.979 ms 2/3294728 (6.1e-05%) Thanks Eric! 
> > Something like : > > diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt > index 6b0bc0f715346a097a6df46e2ba2771359abcd23..7777dceb78107c0019fb39d5b69be1959005b78e 100644 > --- a/Documentation/networking/ip-sysctl.txt > +++ b/Documentation/networking/ip-sysctl.txt > @@ -109,7 +109,8 @@ neigh/default/unres_qlen_bytes - INTEGER > queued for each unresolved address by other network layers. > (added in linux 3.3) > Setting negative value is meaningless and will return error. > - Default: 65536 Bytes(64KB) > + Default: SK_WMEM_MAX, enough to store 256 packets of medium size > + (less than 256 bytes per packet) > > neigh/default/unres_qlen - INTEGER > The maximum number of packets which may be queued for each > diff --git a/include/net/sock.h b/include/net/sock.h > index 1c2912d433e81b10f3fdc87bcfcbb091570edc03..03a362568357acc7278a318423dd3873103f90ca 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -2368,6 +2368,16 @@ bool sk_net_capable(const struct sock *sk, int cap); > > void sk_get_meminfo(const struct sock *sk, u32 *meminfo); > > +/* Take into consideration the size of the struct sk_buff overhead in the > + * determination of these values, since that is non-constant across > + * platforms. This makes socket queueing behavior and performance > + * not depend upon such differences. 
> + */ > +#define _SK_MEM_PACKETS 256 > +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) > +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) > +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) > + > extern __u32 sysctl_wmem_max; > extern __u32 sysctl_rmem_max; > > diff --git a/net/core/sock.c b/net/core/sock.c > index dfdd14cac775e9bfcee0085ee32ffcd0ab28b67b..9b7b6bbb2a23e7652a1f34a305f29d49de00bc8c 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -307,16 +307,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX]; > static struct lock_class_key af_elock_keys[AF_MAX]; > static struct lock_class_key af_kern_callback_keys[AF_MAX]; > > -/* Take into consideration the size of the struct sk_buff overhead in the > - * determination of these values, since that is non-constant across > - * platforms. This makes socket queueing behavior and performance > - * not depend upon such differences. > - */ > -#define _SK_MEM_PACKETS 256 > -#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) > -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) > -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) > - > /* Run time adjustable parameters. 
*/ > __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; > EXPORT_SYMBOL(sysctl_wmem_max); > diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c > index 21dedf6fd0f76dec22b2b3685beb89cfefea7ded..22bf0b95d6edc3c27ef3a99d27cb70a1551e3e0e 100644 > --- a/net/decnet/dn_neigh.c > +++ b/net/decnet/dn_neigh.c > @@ -94,7 +94,7 @@ struct neigh_table dn_neigh_table = { > [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, > [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, > [NEIGH_VAR_GC_STALETIME] = 60 * HZ, > - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64*1024, > + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, > [NEIGH_VAR_PROXY_QLEN] = 0, > [NEIGH_VAR_ANYCAST_DELAY] = 0, > [NEIGH_VAR_PROXY_DELAY] = 0, > diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c > index 8b52179ddc6e54eabf6d3c2ed0132083228680bb..7c45b8896709815c5dde5972fd57cb5c3bcb2648 100644 > --- a/net/ipv4/arp.c > +++ b/net/ipv4/arp.c > @@ -171,7 +171,7 @@ struct neigh_table arp_tbl = { > [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, > [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, > [NEIGH_VAR_GC_STALETIME] = 60 * HZ, > - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, > + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, > [NEIGH_VAR_PROXY_QLEN] = 64, > [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, > [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, > diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c > index 5e338eb89509b1df6ebd060f8bd19fcb4b86fe05..266a530414d7be4f1e7be922e465bbab46f7cbac 100644 > --- a/net/ipv6/ndisc.c > +++ b/net/ipv6/ndisc.c > @@ -127,7 +127,7 @@ struct neigh_table nd_tbl = { > [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, > [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, > [NEIGH_VAR_GC_STALETIME] = 60 * HZ, > - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, > + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, > [NEIGH_VAR_PROXY_QLEN] = 64, > [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, > [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, > >
On 08/26/2017 11:56 AM, Florian Fainelli wrote: > > > On 08/26/2017 05:47 AM, Eric Dumazet wrote: >> On Fri, 2017-08-25 at 21:19 -0700, David Miller wrote: >> >>> Agreed, but the ARP resolution queue really needs to scale its backlog >>> to the physical technology it is attached to. >> Yes, last time (in 2011) we increased the old limit of 3 packets :/ >> >> We probably should match sysctl_wmem_max so that a single socket >> provider would hit its sk_sndbuf limit Eric, do you want to post this as a formal patch? I don't think I understand these tunables enough to provide a good commit message anyways. Thanks! > > Before: > /proc/sys/net/ipv4/neigh/eth0/unres_qlen:34 > /proc/sys/net/ipv4/neigh/eth0/unres_qlen_bytes:65536 > /proc/sys/net/ipv4/neigh/gphy/unres_qlen:34 > /proc/sys/net/ipv4/neigh/gphy/unres_qlen_bytes:65536 > > After: > /proc/sys/net/ipv4/neigh/eth0/unres_qlen:106 > /proc/sys/net/ipv4/neigh/eth0/unres_qlen_bytes:229376 > /proc/sys/net/ipv4/neigh/gphy/unres_qlen:106 > /proc/sys/net/ipv4/neigh/gphy/unres_qlen_bytes:229376 > > and this does help a lot with the test case reported over an hour, only > 2 packets lost: > > # perf record -a -g -e skb:kfree_skb iperf -c 192.168.1.23 -b 900M -t > 3600 -u > ------------------------------------------------------------ > Client connecting to 192.168.1.23, UDP port 5001 > Sending 1470 byte datagrams, IPG target: 13.07 us (kalman adjust) > UDP buffer size: 224 KByte (default) > ------------------------------------------------------------ > [ 4] local 192.168.1.66 port 48209 connected with 192.168.1.23 port 5001 > write failed: Invalid argument > [ ID] Interval Transfer Bandwidth > [ 4] 0.0-404.9 sec 4.51 GBytes 95.7 Mbits/sec > [ 4] Sent 3294727 datagrams > [ 4] Server Report: > [ 4] 0.0-405.1 sec 4.51 GBytes 95.6 Mbits/sec 14.979 ms > 2/3294728 (6.1e-05%) > > Thanks Eric! 
> >> >> Something like : >> >> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt >> index 6b0bc0f715346a097a6df46e2ba2771359abcd23..7777dceb78107c0019fb39d5b69be1959005b78e 100644 >> --- a/Documentation/networking/ip-sysctl.txt >> +++ b/Documentation/networking/ip-sysctl.txt >> @@ -109,7 +109,8 @@ neigh/default/unres_qlen_bytes - INTEGER >> queued for each unresolved address by other network layers. >> (added in linux 3.3) >> Setting negative value is meaningless and will return error. >> - Default: 65536 Bytes(64KB) >> + Default: SK_WMEM_MAX, enough to store 256 packets of medium size >> + (less than 256 bytes per packet) >> >> neigh/default/unres_qlen - INTEGER >> The maximum number of packets which may be queued for each >> diff --git a/include/net/sock.h b/include/net/sock.h >> index 1c2912d433e81b10f3fdc87bcfcbb091570edc03..03a362568357acc7278a318423dd3873103f90ca 100644 >> --- a/include/net/sock.h >> +++ b/include/net/sock.h >> @@ -2368,6 +2368,16 @@ bool sk_net_capable(const struct sock *sk, int cap); >> >> void sk_get_meminfo(const struct sock *sk, u32 *meminfo); >> >> +/* Take into consideration the size of the struct sk_buff overhead in the >> + * determination of these values, since that is non-constant across >> + * platforms. This makes socket queueing behavior and performance >> + * not depend upon such differences. 
>> + */ >> +#define _SK_MEM_PACKETS 256 >> +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) >> +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) >> +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) >> + >> extern __u32 sysctl_wmem_max; >> extern __u32 sysctl_rmem_max; >> >> diff --git a/net/core/sock.c b/net/core/sock.c >> index dfdd14cac775e9bfcee0085ee32ffcd0ab28b67b..9b7b6bbb2a23e7652a1f34a305f29d49de00bc8c 100644 >> --- a/net/core/sock.c >> +++ b/net/core/sock.c >> @@ -307,16 +307,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX]; >> static struct lock_class_key af_elock_keys[AF_MAX]; >> static struct lock_class_key af_kern_callback_keys[AF_MAX]; >> >> -/* Take into consideration the size of the struct sk_buff overhead in the >> - * determination of these values, since that is non-constant across >> - * platforms. This makes socket queueing behavior and performance >> - * not depend upon such differences. >> - */ >> -#define _SK_MEM_PACKETS 256 >> -#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) >> -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) >> -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) >> - >> /* Run time adjustable parameters. 
*/ >> __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; >> EXPORT_SYMBOL(sysctl_wmem_max); >> diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c >> index 21dedf6fd0f76dec22b2b3685beb89cfefea7ded..22bf0b95d6edc3c27ef3a99d27cb70a1551e3e0e 100644 >> --- a/net/decnet/dn_neigh.c >> +++ b/net/decnet/dn_neigh.c >> @@ -94,7 +94,7 @@ struct neigh_table dn_neigh_table = { >> [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, >> [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, >> [NEIGH_VAR_GC_STALETIME] = 60 * HZ, >> - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64*1024, >> + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, >> [NEIGH_VAR_PROXY_QLEN] = 0, >> [NEIGH_VAR_ANYCAST_DELAY] = 0, >> [NEIGH_VAR_PROXY_DELAY] = 0, >> diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c >> index 8b52179ddc6e54eabf6d3c2ed0132083228680bb..7c45b8896709815c5dde5972fd57cb5c3bcb2648 100644 >> --- a/net/ipv4/arp.c >> +++ b/net/ipv4/arp.c >> @@ -171,7 +171,7 @@ struct neigh_table arp_tbl = { >> [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, >> [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, >> [NEIGH_VAR_GC_STALETIME] = 60 * HZ, >> - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, >> + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, >> [NEIGH_VAR_PROXY_QLEN] = 64, >> [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, >> [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, >> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c >> index 5e338eb89509b1df6ebd060f8bd19fcb4b86fe05..266a530414d7be4f1e7be922e465bbab46f7cbac 100644 >> --- a/net/ipv6/ndisc.c >> +++ b/net/ipv6/ndisc.c >> @@ -127,7 +127,7 @@ struct neigh_table nd_tbl = { >> [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, >> [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, >> [NEIGH_VAR_GC_STALETIME] = 60 * HZ, >> - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, >> + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, >> [NEIGH_VAR_PROXY_QLEN] = 64, >> [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, >> [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, >> >> >
On Tue, 2017-08-29 at 10:53 -0700, Florian Fainelli wrote: > On 08/26/2017 11:56 AM, Florian Fainelli wrote: > > > > > > On 08/26/2017 05:47 AM, Eric Dumazet wrote: > >> On Fri, 2017-08-25 at 21:19 -0700, David Miller wrote: > >> > >>> Agreed, but the ARP resolution queue really needs to scale its backlog > >>> to the physical technology it is attached to. > >> Yes, last time (in 2011) we increased the old limit of 3 packets :/ > >> > >> We probably should match sysctl_wmem_max so that a single socket > >> provider would hit its sk_sndbuf limit > > Eric, do you want to post this as a formal patch? I don't think I > understand these tunables enough to provide a good commit message > anyways. Thanks! I will post it today. I was out of the office yesterday, rafting on the south fork of the American River ;) This will target net-next. Thanks.
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 6b0bc0f715346a097a6df46e2ba2771359abcd23..7777dceb78107c0019fb39d5b69be1959005b78e 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -109,7 +109,8 @@ neigh/default/unres_qlen_bytes - INTEGER queued for each unresolved address by other network layers. (added in linux 3.3) Setting negative value is meaningless and will return error. - Default: 65536 Bytes(64KB) + Default: SK_WMEM_MAX, enough to store 256 packets of medium size + (less than 256 bytes per packet) neigh/default/unres_qlen - INTEGER The maximum number of packets which may be queued for each diff --git a/include/net/sock.h b/include/net/sock.h index 1c2912d433e81b10f3fdc87bcfcbb091570edc03..03a362568357acc7278a318423dd3873103f90ca 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2368,6 +2368,16 @@ bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); +/* Take into consideration the size of the struct sk_buff overhead in the + * determination of these values, since that is non-constant across + * platforms. This makes socket queueing behavior and performance + * not depend upon such differences. 
+ */ +#define _SK_MEM_PACKETS 256 +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; diff --git a/net/core/sock.c b/net/core/sock.c index dfdd14cac775e9bfcee0085ee32ffcd0ab28b67b..9b7b6bbb2a23e7652a1f34a305f29d49de00bc8c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -307,16 +307,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; -/* Take into consideration the size of the struct sk_buff overhead in the - * determination of these values, since that is non-constant across - * platforms. This makes socket queueing behavior and performance - * not depend upon such differences. - */ -#define _SK_MEM_PACKETS 256 -#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) - /* Run time adjustable parameters. 
*/ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 21dedf6fd0f76dec22b2b3685beb89cfefea7ded..22bf0b95d6edc3c27ef3a99d27cb70a1551e3e0e 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -94,7 +94,7 @@ struct neigh_table dn_neigh_table = { [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64*1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 0, [NEIGH_VAR_ANYCAST_DELAY] = 0, [NEIGH_VAR_PROXY_DELAY] = 0, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 8b52179ddc6e54eabf6d3c2ed0132083228680bb..7c45b8896709815c5dde5972fd57cb5c3bcb2648 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -171,7 +171,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 5e338eb89509b1df6ebd060f8bd19fcb4b86fe05..266a530414d7be4f1e7be922e465bbab46f7cbac 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -127,7 +127,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,