[ovs-dev,v2] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side

Submitted by Gao Zhenyu on Aug. 3, 2017, 9:20 a.m.

Details

Message ID 20170803092051.32273-1-sysugaozhenyu@gmail.com
State New
Headers show

Commit Message

Gao Zhenyu Aug. 3, 2017, 9:20 a.m.
Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
So L4 packets's cksum were calculated in VM side but performance is not
good.
Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
makes virtio-net frontend-driver support NETIF_F_SG as well

Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
---
v1--->v2: sync up to latest ovs and remove some useless mbuf operations in v1

Here is some performance number:

Setup:

 qperf client
+---------+
|   VM    |
+---------+
     |
     |                          qperf server
+--------------+              +------------+
| vswitch+dpdk |              | bare-metal |
+--------------+              +------------+
       |                            |
       |                            |
      pNic---------PhysicalSwitch----

do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx on' in VM side.
                      It offload cksum job to ovs-dpdk side.

do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off' in VM side.
                VM calculate cksum for tcp/udp packets.

We can see huge improvment in TCP throughput if we leverage ovs-dpdk cksum.

[root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01 tcp_bw tcp_lat udp_bw udp_lat
  do cksum in ovs-dpdk          do cksum in VM             without this patch
tcp_bw:
    bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95 MB/sec
tcp_bw:
    bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98 MB/sec
tcp_bw:
    bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19 MB/sec
tcp_bw:
    bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7 MB/sec
tcp_bw:
    bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7 MB/sec
tcp_bw:
    bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9 MB/sec
tcp_bw:
    bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1 MB/sec
tcp_bw: 
    bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec
tcp_bw:
    bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec
tcp_bw:
    bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec
tcp_bw:
    bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec
tcp_bw:
    bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec
tcp_bw:
    bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec
tcp_bw: 
    bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec
tcp_bw:
    bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec
tcp_bw:
    bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec
tcp_bw:
    bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec
tcp_lat:
    latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
tcp_lat:
    latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5 us
tcp_lat:
    latency  =  29 us         latency  =  29.3 us       latency  =  29.6 us
tcp_lat:
    latency  =  29 us         latency  =  29.4 us       latency  =  29.5 us
tcp_lat:
    latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
tcp_lat:
    latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7 us
tcp_lat:
    latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us
tcp_lat:
    latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2 us
tcp_lat:
    latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us
tcp_lat:
    latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2 us
tcp_lat:
    latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5 us
tcp_lat:
    latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6 us
tcp_lat:
     latency  =  47.6 us      latency  =  48 us         latency  =  48.1 us
tcp_lat:
    latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8 us
tcp_lat:
    latency  =  82.8 us       latency  =  82.3 us       latency  =  116 us
tcp_lat:
    latency  =  94.8 us       latency  =  94.2 us       latency  =  134 us
tcp_lat:
    latency  =  167 us        latency  =  197 us        latency  =  172 us
udp_bw:
    send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403 KB/sec
    recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400 KB/sec
udp_bw:
    send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810 KB/sec
    recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807 KB/sec
udp_bw:
    send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63 MB/sec
    recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63 MB/sec
udp_bw:
    send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26 MB/sec
    recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82 MB/sec
udp_bw:
    send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45 MB/sec
    recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45 MB/sec
udp_bw:
    send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13 MB/sec
    recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13 MB/sec
udp_bw:
    send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9 MB/sec
    recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7 MB/sec
udp_bw:
    send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52 MB/sec
    recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2 MB/sec
udp_bw:
    send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103 MB/sec
    recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100 MB/sec
udp_bw:
    send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205 MB/sec
    recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202 MB/sec
udp_bw:
    send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401 MB/sec
    recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358 MB/sec
udp_bw:
    send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557 MB/sec
    recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362 MB/sec
udp_bw:
    send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863 MB/sec
    recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584 MB/sec
udp_bw:
    send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08 GB/sec
    recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793 MB/sec
udp_bw: 
    send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19 GB/sec
    recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673 MB/sec
udp_bw:
    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec
    recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762 MB/sec
udp_bw:
    send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec
    recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec
udp_lat:
    latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4 us
udp_lat:
    latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7 us
udp_lat:
    latency  =  27 us         latency  =  26.7 us       latency  =  26.6 us
udp_lat:
    latency  =  27 us         latency  =  26.9 us       latency  =  26.7 us
udp_lat:
    latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3 us
udp_lat:
    latency  =  28.1 us       latency  =  28 us         latency  =  28 us
udp_lat:
     latency  =  29.4 us      latency  =  29.2 us       latency  =  29.2 us
udp_lat:
    latency  =  31 us         latency  =  31 us         latency  =  30.8 us
udp_lat:
    latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3 us
udp_lat:
    latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5 us
udp_lat:
    latency  =  64.9 us       latency  =  65 us         latency  =  65 us
udp_lat:
    latency  =  72.3 us       latency  =  72 us         latency  =  72 us
udp_lat:
    latency  =  121 us        latency  =  122 us        latency  =  122 us
udp_lat: 
    latency  =  0 ns          latency  =  0 ns          latency  =  0 ns


 lib/netdev-dpdk.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 4 deletions(-)

Patch hide | download patch | download mbox

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1d82bca..88b4c45 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -31,6 +31,7 @@ 
 #include <rte_errno.h>
 #include <rte_eth_ring.h>
 #include <rte_ethdev.h>
+#include <rte_ip.h>
 #include <rte_malloc.h>
 #include <rte_mbuf.h>
 #include <rte_meter.h>
@@ -991,8 +992,7 @@  netdev_dpdk_vhost_construct(struct netdev *netdev)
 
     err = rte_vhost_driver_disable_features(dev->vhost_id,
                                 1ULL << VIRTIO_NET_F_HOST_TSO4
-                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                | 1ULL << VIRTIO_NET_F_CSUM);
+                                | 1ULL << VIRTIO_NET_F_HOST_TSO6);
     if (err) {
         VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                  "port: %s\n", name);
@@ -1454,6 +1454,76 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+static inline void
+netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
+                       uint8_t l4_proto, bool is_ipv4)
+{
+    void *l3hdr = (void *)(data + pkt->l3_ofs);
+
+    if (l4_proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt->l4_ofs);
+
+        tcp_hdr->tcp_csum = 0;
+        if (is_ipv4) {
+            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
+        } else {
+            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
+        }
+    } else if (l4_proto == IPPROTO_UDP) {
+        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt->l4_ofs);
+        /* do not recalculate udp cksum if it was 0 */
+        if (udp_hdr->udp_csum != 0) {
+            udp_hdr->udp_csum = 0;
+            if (is_ipv4) {
+                /*do not calculate udp cksum if it was a fragment IP*/
+                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
+                                      fragment_offset)) {
+                    return;
+                }
+
+                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
+            } else {
+                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
+            }
+        }
+    }
+}
+
+static inline void
+netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
+{
+    int i;
+
+    for (i = 0; i < pkt_cnt; i++) {
+        ovs_be16 dl_type;
+        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
+        const char *data = dp_packet_data(pkt);
+        void *l3hdr = (char *)(data + pkt->l3_ofs);
+
+        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
+            continue;
+        }
+        /* This take a assumption that it should be a vhost packet if this
+         * packet was allocated by DPDK pool and try sending to pNic. */
+        if (pkt->source == DPBUF_DPDK &&
+            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
+            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need cksum
+            continue;
+        }
+
+        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
+        if (dl_type == htons(ETH_TYPE_IP)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,
+                                   true);
+        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv6_hdr *)l3hdr)->proto,
+                                   false);
+        }
+    }
+}
+
 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
  * 'pkts', even in case of failure.
  *
@@ -1895,6 +1965,8 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
         return;
     }
 
+    netdev_prepare_tx_csum(batch->packets, batch->count);
+
     if (OVS_UNLIKELY(concurrent_txq)) {
         qid = qid % dev->up.n_txq;
         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
@@ -3285,8 +3357,7 @@  netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
 
         err = rte_vhost_driver_disable_features(dev->vhost_id,
                                     1ULL << VIRTIO_NET_F_HOST_TSO4
-                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                    | 1ULL << VIRTIO_NET_F_CSUM);
+                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6);
         if (err) {
             VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                      "client port: %s\n", dev->up.name);