diff mbox

[v2] net: Allow no-cache copy from user on transmit

Message ID alpine.DEB.2.00.1103231003120.16608@pokey.mtv.corp.google.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Tom Herbert March 23, 2011, 5:10 p.m. UTC
This patch uses __copy_from_user_nocache (from skb_copy_to_page)
on transmit to bypass data cache for a performance improvement.
This functionality is configurable per device using ethtool, the
device must also be doing TX csum offload to enable.  It seems
reasonable to set this when the netdevice does not copy or
otherwise touch the data.

This patch was tested using 200 instances of netperf TCP_RR with
1400 byte request and one byte reply.  Platform is 16 core AMD x86.

No-cache copy disabled:
   672703 tps, 97.13% utilization
   50/90/99% latency:244.31 484.205 1028.41

No-cache copy enabled:
   702113 tps, 96.16% utilization,
   50/90/99% latency 238.56 467.56 956.955

Using 14000 byte request and response sizes demonstrate the
effects more dramatically:

No-cache copy disabled:
   79571 tps, 34.34 %utlization
   50/90/95% latency 1584.46 2319.59 5001.76

No-cache copy enabled:
   83856 tps, 34.81% utilization
   50/90/95% latency 2508.42 2622.62 2735.88

Note especially the effect on tail latency (95th percentile).

This seems to provide a nice performance improvement and is
consistent in the tests I ran.  Presumably, this would provide
the greatest benfits in the presence of an application workload
stressing the cache and a lot of transmit data happening.  I don't
yet see a downside to using this.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/linux/netdevice.h |   10 ++++++++--
 include/net/sock.h        |    5 +++++
 net/core/dev.c            |    2 +-
 net/core/ethtool.c        |    2 +-
 4 files changed, 15 insertions(+), 4 deletions(-)

Comments

=?ISO-8859-2?Q?Micha=B3_Miros=B3aw?= March 23, 2011, 6:42 p.m. UTC | #1
2011/3/23 Tom Herbert <therbert@google.com>:
> This patch uses __copy_from_user_nocache (from skb_copy_to_page)
> on transmit to bypass data cache for a performance improvement.
> This functionality is configurable per device using ethtool, the
> device must also be doing TX csum offload to enable.  It seems
> reasonable to set this when the netdevice does not copy or
> otherwise touch the data.
>
> This patch was tested using 200 instances of netperf TCP_RR with
> 1400 byte request and one byte reply.  Platform is 16 core AMD x86.
>
> No-cache copy disabled:
>   672703 tps, 97.13% utilization
>   50/90/99% latency:244.31 484.205 1028.41
>
> No-cache copy enabled:
>   702113 tps, 96.16% utilization,
>   50/90/99% latency 238.56 467.56 956.955
>
> Using 14000 byte request and response sizes demonstrate the
> effects more dramatically:
>
> No-cache copy disabled:
>   79571 tps, 34.34 %utlization
>   50/90/95% latency 1584.46 2319.59 5001.76
>
> No-cache copy enabled:
>   83856 tps, 34.81% utilization
>   50/90/95% latency 2508.42 2622.62 2735.88
>
> Note especially the effect on tail latency (95th percentile).
>
> This seems to provide a nice performance improvement and is
> consistent in the tests I ran.  Presumably, this would provide
> the greatest benfits in the presence of an application workload
> stressing the cache and a lot of transmit data happening.  I don't
> yet see a downside to using this.
>
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>  include/linux/netdevice.h |   10 ++++++++--
>  include/net/sock.h        |    5 +++++
>  net/core/dev.c            |    2 +-
>  net/core/ethtool.c        |    2 +-
>  4 files changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 5eeb2cd..52d444f 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1066,6 +1066,7 @@ struct net_device {
>  #define NETIF_F_NTUPLE         (1 << 27) /* N-tuple filters supported */
>  #define NETIF_F_RXHASH         (1 << 28) /* Receive hashing offload */
>  #define NETIF_F_RXCSUM         (1 << 29) /* Receive checksumming offload */
> +#define NETIF_F_NOCACHE_COPY   (1 << 30) /* Use no-cache copyfromuser */
>
>        /* Segmentation offload features */
>  #define NETIF_F_GSO_SHIFT      16
> @@ -1081,7 +1082,7 @@ struct net_device {
>        /* = all defined minus driver/device-class-related */
>  #define NETIF_F_NEVER_CHANGE   (NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \
>                                  NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)
> -#define NETIF_F_ETHTOOL_BITS   (0x3f3fffff & ~NETIF_F_NEVER_CHANGE)
> +#define NETIF_F_ETHTOOL_BITS   (0x7f3fffff & ~NETIF_F_NEVER_CHANGE)
>
>        /* List of features with software fallbacks. */
>  #define NETIF_F_GSO_SOFTWARE   (NETIF_F_TSO | NETIF_F_TSO_ECN | \
> @@ -1108,7 +1109,12 @@ struct net_device {
>                                 NETIF_F_FRAGLIST)
>
>        /* changeable features with no special hardware requirements */
> -#define NETIF_F_SOFT_FEATURES  (NETIF_F_GSO | NETIF_F_GRO)
> +#define NETIF_F_SOFT_FEATURES  (NETIF_F_GSO | NETIF_F_GRO |    \
> +                                NETIF_F_NOCACHE_COPY)
> +
> +       /* soft features automatically enabled */
> +#define NETIF_F_SOFT_FEAT_ENAB (NETIF_F_GSO | NETIF_F_GRO)
> +
>
>        /* Interface index. Unique device identifier    */
>        int                     ifindex;
> diff --git a/include/net/sock.h b/include/net/sock.h
> index da0534d..74ce586 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1401,6 +1401,11 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from,
>                if (err)
>                        return err;
>                skb->csum = csum_block_add(skb->csum, csum, skb->len);
> +       } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
> +               if (!access_ok(VERIFY_READ, from, copy) ||
> +                   __copy_from_user_nocache(page_address(page) + off,
> +                                               from, copy))
> +                       return -EFAULT;
>        } else if (copy_from_user(page_address(page) + off, from, copy))
>                return -EFAULT;
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 0b88eba..c3ed95e 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5435,7 +5435,7 @@ int register_netdevice(struct net_device *dev)
>         * software offloads (GSO and GRO).
>         */
>        dev->hw_features |= NETIF_F_SOFT_FEATURES;
> -       dev->features |= NETIF_F_SOFT_FEATURES;
> +       dev->features |= NETIF_F_SOFT_FEAT_ENAB;
>        dev->wanted_features = dev->features & dev->hw_features;
>
>        /* Avoid warning from netdev_fix_features() for GSO without SG */
> diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> index c1a71bb..40b6fe0 100644
> --- a/net/core/ethtool.c
> +++ b/net/core/ethtool.c
> @@ -344,7 +344,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS
>        /* NETIF_F_NTUPLE */          "rx-ntuple-filter",
>        /* NETIF_F_RXHASH */          "rx-hashing",
>        /* NETIF_F_RXCSUM */          "rx-checksum",
> -       "",
> +       /* NETIF_F_NOCACHE_COPY */    "tx-nocache-copy"
>        "",
>  };

I would rather see it enabled by default, including "hacks" in
register_netdev() like for GSO. Otherwise not much people will test
this. There should also be constraints for it in
netdev_fix_features().

BTW, what happens if this is used on e.g. bridge device or veth and
later packet ends up going to device which needs to do checksumming in
software?

Best Regards,
Michał Mirosław
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones March 23, 2011, 6:51 p.m. UTC | #2
On Wed, 2011-03-23 at 10:10 -0700, Tom Herbert wrote:
> This patch uses __copy_from_user_nocache (from skb_copy_to_page)
> on transmit to bypass data cache for a performance improvement.
> This functionality is configurable per device using ethtool, the
> device must also be doing TX csum offload to enable.  It seems
> reasonable to set this when the netdevice does not copy or
> otherwise touch the data.
> 
> This patch was tested using 200 instances of netperf TCP_RR with
> 1400 byte request and one byte reply.  Platform is 16 core AMD x86.
> 
> No-cache copy disabled:
>    672703 tps, 97.13% utilization
>    50/90/99% latency:244.31 484.205 1028.41
> 
> No-cache copy enabled:
>    702113 tps, 96.16% utilization,
>    50/90/99% latency 238.56 467.56 956.955
> 
> Using 14000 byte request and response sizes demonstrate the
> effects more dramatically:
> 
> No-cache copy disabled:
>    79571 tps, 34.34 %utlization
>    50/90/95% latency 1584.46 2319.59 5001.76
> 
> No-cache copy enabled:
>    83856 tps, 34.81% utilization
>    50/90/95% latency 2508.42 2622.62 2735.88
> 
> Note especially the effect on tail latency (95th percentile).
> 
> This seems to provide a nice performance improvement and is
> consistent in the tests I ran.  Presumably, this would provide
> the greatest benfits in the presence of an application workload
> stressing the cache and a lot of transmit data happening.  I don't
> yet see a downside to using this.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>

Having raised the question about tying it to CKO and seeing it addressed
to my satisfaction I'll go ahead and:

Acked-by: Rick Jones <rick.jones2@hp.com>

rick jones

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger March 23, 2011, 7:25 p.m. UTC | #3
On Wed, 23 Mar 2011 19:42:20 +0100
Michał Mirosław <mirqus@gmail.com> wrote:

> 2011/3/23 Tom Herbert <therbert@google.com>:
> > This patch uses __copy_from_user_nocache (from skb_copy_to_page)
> > on transmit to bypass data cache for a performance improvement.
> > This functionality is configurable per device using ethtool, the
> > device must also be doing TX csum offload to enable.  It seems
> > reasonable to set this when the netdevice does not copy or
> > otherwise touch the data.
> >
> > This patch was tested using 200 instances of netperf TCP_RR with
> > 1400 byte request and one byte reply.  Platform is 16 core AMD x86.
> >
> > No-cache copy disabled:
> >   672703 tps, 97.13% utilization
> >   50/90/99% latency:244.31 484.205 1028.41
> >
> > No-cache copy enabled:
> >   702113 tps, 96.16% utilization,
> >   50/90/99% latency 238.56 467.56 956.955
> >
> > Using 14000 byte request and response sizes demonstrate the
> > effects more dramatically:
> >
> > No-cache copy disabled:
> >   79571 tps, 34.34 %utlization
> >   50/90/95% latency 1584.46 2319.59 5001.76
> >
> > No-cache copy enabled:
> >   83856 tps, 34.81% utilization
> >   50/90/95% latency 2508.42 2622.62 2735.88
> >
> > Note especially the effect on tail latency (95th percentile).
> >
> > This seems to provide a nice performance improvement and is
> > consistent in the tests I ran.  Presumably, this would provide
> > the greatest benfits in the presence of an application workload
> > stressing the cache and a lot of transmit data happening.  I don't
> > yet see a downside to using this.
> >
> > Signed-off-by: Tom Herbert <therbert@google.com>
> > ---
> >  include/linux/netdevice.h |   10 ++++++++--
> >  include/net/sock.h        |    5 +++++
> >  net/core/dev.c            |    2 +-
> >  net/core/ethtool.c        |    2 +-
> >  4 files changed, 15 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 5eeb2cd..52d444f 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -1066,6 +1066,7 @@ struct net_device {
> >  #define NETIF_F_NTUPLE         (1 << 27) /* N-tuple filters supported */
> >  #define NETIF_F_RXHASH         (1 << 28) /* Receive hashing offload */
> >  #define NETIF_F_RXCSUM         (1 << 29) /* Receive checksumming offload */
> > +#define NETIF_F_NOCACHE_COPY   (1 << 30) /* Use no-cache copyfromuser */
> >
> >        /* Segmentation offload features */
> >  #define NETIF_F_GSO_SHIFT      16
> > @@ -1081,7 +1082,7 @@ struct net_device {
> >        /* = all defined minus driver/device-class-related */
> >  #define NETIF_F_NEVER_CHANGE   (NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \
> >                                  NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)
> > -#define NETIF_F_ETHTOOL_BITS   (0x3f3fffff & ~NETIF_F_NEVER_CHANGE)
> > +#define NETIF_F_ETHTOOL_BITS   (0x7f3fffff & ~NETIF_F_NEVER_CHANGE)
> >
> >        /* List of features with software fallbacks. */
> >  #define NETIF_F_GSO_SOFTWARE   (NETIF_F_TSO | NETIF_F_TSO_ECN | \
> > @@ -1108,7 +1109,12 @@ struct net_device {
> >                                 NETIF_F_FRAGLIST)
> >
> >        /* changeable features with no special hardware requirements */
> > -#define NETIF_F_SOFT_FEATURES  (NETIF_F_GSO | NETIF_F_GRO)
> > +#define NETIF_F_SOFT_FEATURES  (NETIF_F_GSO | NETIF_F_GRO |    \
> > +                                NETIF_F_NOCACHE_COPY)
> > +
> > +       /* soft features automatically enabled */
> > +#define NETIF_F_SOFT_FEAT_ENAB (NETIF_F_GSO | NETIF_F_GRO)
> > +
> >
> >        /* Interface index. Unique device identifier    */
> >        int                     ifindex;
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index da0534d..74ce586 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -1401,6 +1401,11 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from,
> >                if (err)
> >                        return err;
> >                skb->csum = csum_block_add(skb->csum, csum, skb->len);
> > +       } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
> > +               if (!access_ok(VERIFY_READ, from, copy) ||
> > +                   __copy_from_user_nocache(page_address(page) + off,
> > +                                               from, copy))
> > +                       return -EFAULT;
> >        } else if (copy_from_user(page_address(page) + off, from, copy))
> >                return -EFAULT;
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 0b88eba..c3ed95e 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -5435,7 +5435,7 @@ int register_netdevice(struct net_device *dev)
> >         * software offloads (GSO and GRO).
> >         */
> >        dev->hw_features |= NETIF_F_SOFT_FEATURES;
> > -       dev->features |= NETIF_F_SOFT_FEATURES;
> > +       dev->features |= NETIF_F_SOFT_FEAT_ENAB;
> >        dev->wanted_features = dev->features & dev->hw_features;
> >
> >        /* Avoid warning from netdev_fix_features() for GSO without SG */
> > diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> > index c1a71bb..40b6fe0 100644
> > --- a/net/core/ethtool.c
> > +++ b/net/core/ethtool.c
> > @@ -344,7 +344,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS
> >        /* NETIF_F_NTUPLE */          "rx-ntuple-filter",
> >        /* NETIF_F_RXHASH */          "rx-hashing",
> >        /* NETIF_F_RXCSUM */          "rx-checksum",
> > -       "",
> > +       /* NETIF_F_NOCACHE_COPY */    "tx-nocache-copy"
> >        "",
> >  };
> 
> I would rather see it enabled by default, including "hacks" in
> register_netdev() like for GSO. Otherwise not much people will test
> this. There should also be constraints for it in
> netdev_fix_features().
> 
> BTW, what happens if this is used on e.g. bridge device or veth and
> later packet ends up going to device which needs to do checksumming in
> software?

The configuration via device and ethtool seems problematic for general use
in a distro. Nice for testing, but not really matching the architecture
issues.

Isn't nocache DMA a function of the I/O architecture not a function
of the device driver? Shouldn't it be handled at PCI level somehow
with considerations of CPU arch and quirks? Doesn't it make sense for
non-network traffic as well.

Hate to hold up a good optimization while waiting for a general
solution, but commiting to an API prematurely would be bad as well.
Tom Herbert March 23, 2011, 7:48 p.m. UTC | #4
> Isn't nocache DMA a function of the I/O architecture not a function
> of the device driver? Shouldn't it be handled at PCI level somehow
> with considerations of CPU arch and quirks? Doesn't it make sense for
> non-network traffic as well.
>
> Hate to hold up a good optimization while waiting for a general
> solution, but commiting to an API prematurely would be bad as well.
>

There's an implicit assumption in the patch that if somewhere below
the copyfromuser the data is touched it would be more efficient to
copy through the cache than bypass it.  Whether the data is touched is
an attribute of the device and hence the per device control.

Now that I think about it, I don't really know what the actual
performance impact is to bypass cache copy and then touch the data.
I'd be interested to know if anyone has data on that, else I'll try to
contrive some benchmark numbers.

Tom

>
>
> --
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5eeb2cd..52d444f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1066,6 +1066,7 @@  struct net_device {
 #define NETIF_F_NTUPLE		(1 << 27) /* N-tuple filters supported */
 #define NETIF_F_RXHASH		(1 << 28) /* Receive hashing offload */
 #define NETIF_F_RXCSUM		(1 << 29) /* Receive checksumming offload */
+#define NETIF_F_NOCACHE_COPY	(1 << 30) /* Use no-cache copyfromuser */
 
 	/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
@@ -1081,7 +1082,7 @@  struct net_device {
 	/* = all defined minus driver/device-class-related */
 #define NETIF_F_NEVER_CHANGE	(NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \
 				  NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)
-#define NETIF_F_ETHTOOL_BITS	(0x3f3fffff & ~NETIF_F_NEVER_CHANGE)
+#define NETIF_F_ETHTOOL_BITS	(0x7f3fffff & ~NETIF_F_NEVER_CHANGE)
 
 	/* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
@@ -1108,7 +1109,12 @@  struct net_device {
 				 NETIF_F_FRAGLIST)
 
 	/* changeable features with no special hardware requirements */
-#define NETIF_F_SOFT_FEATURES	(NETIF_F_GSO | NETIF_F_GRO)
+#define NETIF_F_SOFT_FEATURES	(NETIF_F_GSO | NETIF_F_GRO |	\
+				 NETIF_F_NOCACHE_COPY)
+
+	/* soft features automatically enabled */
+#define NETIF_F_SOFT_FEAT_ENAB	(NETIF_F_GSO | NETIF_F_GRO)
+
 
 	/* Interface index. Unique device identifier	*/
 	int			ifindex;
diff --git a/include/net/sock.h b/include/net/sock.h
index da0534d..74ce586 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1401,6 +1401,11 @@  static inline int skb_copy_to_page(struct sock *sk, char __user *from,
 		if (err)
 			return err;
 		skb->csum = csum_block_add(skb->csum, csum, skb->len);
+	} else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
+		if (!access_ok(VERIFY_READ, from, copy) ||
+		    __copy_from_user_nocache(page_address(page) + off,
+						from, copy))
+			return -EFAULT;
 	} else if (copy_from_user(page_address(page) + off, from, copy))
 		return -EFAULT;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 0b88eba..c3ed95e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5435,7 +5435,7 @@  int register_netdevice(struct net_device *dev)
 	 * software offloads (GSO and GRO).
 	 */
 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
-	dev->features |= NETIF_F_SOFT_FEATURES;
+	dev->features |= NETIF_F_SOFT_FEAT_ENAB;
 	dev->wanted_features = dev->features & dev->hw_features;
 
 	/* Avoid warning from netdev_fix_features() for GSO without SG */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index c1a71bb..40b6fe0 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -344,7 +344,7 @@  static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS
 	/* NETIF_F_NTUPLE */          "rx-ntuple-filter",
 	/* NETIF_F_RXHASH */          "rx-hashing",
 	/* NETIF_F_RXCSUM */          "rx-checksum",
-	"",
+	/* NETIF_F_NOCACHE_COPY */    "tx-nocache-copy"
 	"",
 };