diff mbox series

[ovs-dev,v4] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side

Message ID 20170902050234.17169-1-sysugaozhenyu@gmail.com
State Changes Requested
Headers show
Series [ovs-dev,v4] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side | expand

Commit Message

Gao Zhenyu Sept. 2, 2017, 5:02 a.m. UTC
Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
So L4 packets' cksums are calculated on the VM side, but performance is
not good.
Implementing tcp/udp tx cksum on the ovs-dpdk side improves throughput in
the VM->phy->phy->VM case. It also lets the virtio-net frontend driver
support NETIF_F_SG (scatter-gather).

Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
---

Here is some performance number:

Setup:

 qperf client
+---------+
|   VM    |
+---------+
     |
     |                          qperf server
+--------------+              +------------+
| vswitch+dpdk |              | bare-metal |
+--------------+              +------------+
       |                            |
       |                            |
      pNic---------PhysicalSwitch----

do cksum in ovs-dpdk: Apply this patch and execute 'ethtool -K eth0 tx on' on the VM side.
                      This offloads the cksum job to the ovs-dpdk side.

do cksum in VM: Apply this patch and execute 'ethtool -K eth0 tx off' on the VM side.
                The VM calculates the cksum for tcp/udp packets.

We can see a huge improvement in TCP throughput if we leverage ovs-dpdk cksum.

[root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01 tcp_bw tcp_lat udp_bw udp_lat
  do cksum in ovs-dpdk          do cksum in VM             without this patch
tcp_bw:        
    bw  =  1.9 MB/sec         bw  =  1.92 MB/sec        bw  =  1.95 MB/sec
tcp_bw:        
    bw  =  3.97 MB/sec        bw  =  3.99 MB/sec        bw  =  3.98 MB/sec
tcp_bw:        
    bw  =  7.75 MB/sec        bw  =  7.79 MB/sec        bw  =  7.89 MB/sec
tcp_bw:        
    bw  =  14.7 MB/sec        bw  =  14.7 MB/sec        bw  =  14.9 MB/sec
tcp_bw:        
    bw  =  27.7 MB/sec        bw  =  27.4 MB/sec        bw  =  28 MB/sec
tcp_bw:        
    bw  =  51.1 MB/sec        bw  =  51.3 MB/sec        bw  =  51.8 MB/sec
tcp_bw:        
    bw  =  86.2 MB/sec        bw  =  84.4 MB/sec        bw  =  87.6 MB/sec
tcp_bw:        
    bw  =  141 MB/sec         bw  =  142 MB/sec        bw  =  141 MB/sec
tcp_bw:        
    bw  =  203 MB/sec         bw  =  201 MB/sec        bw  =  211 MB/sec
tcp_bw:        
    bw  =  267 MB/sec         bw  =  250 MB/sec        bw  =  260 MB/sec
tcp_bw:        
    bw  =  324 MB/sec         bw  =  295 MB/sec        bw  =  302 MB/sec
tcp_bw:        
    bw  =  397 MB/sec         bw  =  363 MB/sec        bw  =  347 MB/sec
tcp_bw:        
    bw  =  765 MB/sec         bw  =  510 MB/sec        bw  =  383 MB/sec
tcp_bw:        
    bw  =  850 MB/sec         bw  =  710 MB/sec        bw  =  417 MB/sec
tcp_bw:        
    bw  =  1.09 GB/sec        bw  =  860 MB/sec        bw  =  444 MB/sec
tcp_bw:        
    bw  =  1.17 GB/sec        bw  =  979 MB/sec        bw  =  447 MB/sec
tcp_bw:        
    bw  =  1.17 GB/sec        bw  =  1.07 GB/sec       bw  =  462 MB/sec
tcp_lat:        
    latency  =  29.1 us       latency  =  28.7 us        latency  =  29.1 us
tcp_lat:        
    latency  =  29 us         latency  =  28.8 us        latency  =  29 us
tcp_lat:        
    latency  =  29 us         latency  =  28.8 us        latency  =  29 us
tcp_lat:        
    latency  =  29 us         latency  =  28.9 us        latency  =  29 us
tcp_lat:        
    latency  =  29.2 us       latency  =  28.9 us        latency  =  29.1 us
tcp_lat:        
    latency  =  29.1 us       latency  =  29.1 us        latency  =  29.1 us
tcp_lat:        
    latency  =  29.5 us       latency  =  29.5 us        latency  =  29.5 us
tcp_lat:        
    latency  =  29.8 us       latency  =  29.8 us        latency  =  29.9 us
tcp_lat:        
    latency  =  30.7 us       latency  =  30.7 us        latency  =  30.7 us
tcp_lat:        
    latency  =  47.1 us       latency  =  46.2 us        latency  =  47.1 us
tcp_lat:        
    latency  =  52.1 us       latency  =  52.3 us        latency  =  53.3 us
tcp_lat:        
    latency  =  44 us         latency  =  43.8 us        latency  =  43.2 us
tcp_lat:        
    latency  =  50 us         latency  =  46.6 us        latency  =  47.8 us
tcp_lat:        
     latency  =  79.2 us      latency  =  77.9 us        latency  =  78.9 us
tcp_lat:        
    latency  =  82.3 us       latency  =  81.7 us        latency  =  82.2 us
tcp_lat:        
    latency  =  96.7 us       latency  =  90.8 us        latency  =  127 us
tcp_lat:        
    latency  =  215 us        latency  =  177 us        latency  =  225 us
udp_bw:        
    send_bw  =  422 KB/sec        send_bw  =  415 KB/sec        send_bw  =  405 KB/sec
    recv_bw  =  402 KB/sec        recv_bw  =  404 KB/sec        recv_bw  =  403 KB/sec
udp_bw:        
    send_bw  =  845 KB/sec        send_bw  =  835 KB/sec        send_bw  =  802 KB/sec
    recv_bw  =  831 KB/sec        recv_bw  =  804 KB/sec        recv_bw  =  802 KB/sec
udp_bw:        
    send_bw  =  1.69 MB/sec       send_bw  =  1.66 MB/sec        send_bw  =  1.62 MB/sec
    recv_bw  =  1.45 MB/sec       recv_bw  =  1.63 MB/sec        recv_bw  =   1.6 MB/sec
udp_bw:        
    send_bw  =  3.38 MB/sec       send_bw  =  3.33 MB/sec        send_bw  =  3.24 MB/sec
    recv_bw  =  3.32 MB/sec       recv_bw  =  3.25 MB/sec        recv_bw  =  3.24 MB/sec
udp_bw:        
    send_bw  =  6.76 MB/sec       send_bw  =  6.63 MB/sec        send_bw  =  6.47 MB/sec
    recv_bw  =  6.42 MB/sec       recv_bw  =  5.59 MB/sec        recv_bw  =  6.45 MB/sec
udp_bw:        
    send_bw  =  13.5 MB/sec       send_bw  =  13.3 MB/sec        send_bw  =  13 MB/sec
    recv_bw  =  13.4 MB/sec       recv_bw  =  12.1 MB/sec        recv_bw  =  13 MB/sec
udp_bw:        
    send_bw  =    27 MB/sec       send_bw  =  26.5 MB/sec        send_bw  =  25.9 MB/sec
    recv_bw  =  26.4 MB/sec       recv_bw  =  21.5 MB/sec        recv_bw  =  25.9 MB/sec
udp_bw:        
    send_bw  =  53.8 MB/sec       send_bw  =  52.9 MB/sec        send_bw  =  51.7 MB/sec
    recv_bw  =  49.1 MB/sec       recv_bw  =  47.6 MB/sec        recv_bw  =  51.1 MB/sec
udp_bw:        
    send_bw  =   108 MB/sec       send_bw  =  105 MB/sec         send_bw  =  102 MB/sec
    recv_bw  =  91.1 MB/sec       recv_bw  =  101 MB/sec         recv_bw  =  100 MB/sec
udp_bw:        
    send_bw  =  212 MB/sec        send_bw  =  208 MB/sec         send_bw  =  203 MB/sec
    recv_bw  =  204 MB/sec        recv_bw  =  204 MB/sec         recv_bw  =  169 MB/sec
udp_bw:        
    send_bw  =  414 MB/sec        send_bw  =  407 MB/sec         send_bw  =  398 MB/sec
    recv_bw  =  403 MB/sec        recv_bw  =  312 MB/sec         recv_bw  =  343 MB/sec
udp_bw:        
    send_bw  =  555 MB/sec        send_bw  =  561 MB/sec         send_bw  =  557 MB/sec
    recv_bw  =  354 MB/sec        recv_bw  =  368 MB/sec         recv_bw  =  360 MB/sec
udp_bw:        
    send_bw  =  877 MB/sec        send_bw  =  880 MB/sec         send_bw  =  868 MB/sec
    recv_bw  =  551 MB/sec        recv_bw  =  542 MB/sec         recv_bw  =  562 MB/sec
udp_bw:        
    send_bw  =  1.1 GB/sec        send_bw  =  1.08 GB/sec        send_bw  =  1.09 GB/sec
    recv_bw  =  805 MB/sec        recv_bw  =   785 MB/sec        recv_bw  =   766 MB/sec
udp_bw:        
    send_bw  =  1.21 GB/sec       send_bw  =  1.19 GB/sec        send_bw  =  1.22 GB/sec
    recv_bw  =   899 MB/sec       recv_bw  =   715 MB/sec        recv_bw  =   700 MB/sec
udp_bw:        
    send_bw  =  1.31 GB/sec       send_bw  =  1.31 GB/sec        send_bw  =  1.31 GB/sec
    recv_bw  =   614 MB/sec       recv_bw  =   622 MB/sec        recv_bw  =   661 MB/sec
udp_bw:        
    send_bw  =  0 bytes/sec       send_bw  =  0 bytes/sec        send_bw  =  0 bytes/sec
    recv_bw  =  0 bytes/sec       recv_bw  =  0 bytes/sec        recv_bw  =  0 bytes/sec
udp_lat:        
    latency  =  25.9 us        latency  =  26.5 us        latency  =  26.5 us
udp_lat:        
    latency  =  26.3 us        latency  =  26.4 us        latency  =  26.5 us
udp_lat:        
    latency  =  26 us          latency  =  26.4 us        latency  =  26.6 us
udp_lat:        
    latency  =  26.1 us        latency  =  26.2 us        latency  =  26.4 us
udp_lat:        
    latency  =  26.3 us        latency  =  26.5 us        latency  =  26.7 us
udp_lat:        
    latency  =  26.3 us        latency  =  26.4 us        latency  =  26.5 us
udp_lat:        
    latency  =  26.3 us        latency  =  26.7 us        latency  =  26.9 us
udp_lat:        
    latency  =  27.1 us        latency  =  27.1 us        latency  =  27.2 us
udp_lat:        
    latency  =  27.5 us        latency  =  27.8 us        latency  =  28.1 us
udp_lat:        
    latency  =  28.7 us        latency  =  28.9 us        latency  =  29.1 us
udp_lat:        
    latency  =  30.4 us        latency  =  30.5 us        latency  =  30.9 us
udp_lat:        
    latency  =  41.2 us        latency  =  41.3 us        latency  =  41.1 us
udp_lat:        
    latency  =  41.3 us        latency  =  41.5 us        latency  =  41.5 us
udp_lat:        
    latency  =  64.4 us        latency  =  64.5 us        latency  =  64.2 us
udp_lat:        
    latency  =  71.5 us        latency  =  71.5 us        latency  =  71.7 us
udp_lat:        
    latency  =  120 us         latency  =  120 us         latency  =  120 us
udp_lat:        
    latency  =  0 ns           latency  =  0 ns           latency  =  0 ns

 lib/netdev-dpdk.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 4 deletions(-)

Comments

Ciara Loftus Sept. 5, 2017, 3:11 p.m. UTC | #1
> 
> Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
> So L4 packets's cksum were calculated in VM side but performance is not
> good.
> Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput in
> VM->phy->phy->VM situation. And it makes virtio-net frontend-driver
> support NETIF_F_SG(feature scatter-gather) as well.
> 
> Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> ---
> 
> Here is some performance number:

Hi Zhenyu,

Thanks for the code changes since v3.
I tested a VM to VM case using iperf and observed a performance degradation when the tx cksum was offloaded to the host:

checksum in VM
0.0-30.0 sec  10.9 GBytes  3.12 Gbits/sec
0.0-30.0 sec  10.9 GBytes  3.11 Gbits/sec
0.0-30.0 sec  11.0 GBytes  3.16 Gbits/sec

checksum in ovs dpdk
0.0-30.0 sec  7.52 GBytes  2.15 Gbits/sec
0.0-30.0 sec  7.12 GBytes  2.04 Gbits/sec
0.0-30.0 sec  8.17 GBytes  2.34 Gbits/sec

I think for this feature to be enabled we need performance to be roughly the same or better for all use cases. For now the gap here is too big, I think.

If you wish to reproduce:

1 host, 2 VMs each with 1 vhost port and flows set up to switch packets from each vhost port to the other.

VM1:
ifconfig eth1 1.1.1.1/24 up
ethtool -K eth2 tx on/off
iperf -c 1.1.1.2 -i 1 -t 30

VM2:
ifconfig eth1 1.1.1.2/24 up
ethtool -K eth1 tx on/off
iperf -s -i 1

Thanks,
Ciara

> 
> Setup:
> 
>  qperf client
> +---------+
> |   VM    |
> +---------+
>      |
>      |                          qperf server
> +--------------+              +------------+
> | vswitch+dpdk |              | bare-metal |
> +--------------+              +------------+
>        |                            |
>        |                            |
>       pNic---------PhysicalSwitch----
> 
> do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx on'
> in VM side.
>                       It offload cksum job to ovs-dpdk side.
> 
> do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off' in VM
> side.
>                 VM calculate cksum for tcp/udp packets.
> 
> We can see huge improvment in TCP throughput if we leverage ovs-dpdk
> cksum.
> 
> [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01
> tcp_bw tcp_lat udp_bw udp_lat
>   do cksum in ovs-dpdk          do cksum in VM             without this patch
> tcp_bw:
>     bw  =  1.9 MB/sec         bw  =  1.92 MB/sec        bw  =  1.95 MB/sec
> tcp_bw:
>     bw  =  3.97 MB/sec        bw  =  3.99 MB/sec        bw  =  3.98 MB/sec
> tcp_bw:
>     bw  =  7.75 MB/sec        bw  =  7.79 MB/sec        bw  =  7.89 MB/sec
> tcp_bw:
>     bw  =  14.7 MB/sec        bw  =  14.7 MB/sec        bw  =  14.9 MB/sec
> tcp_bw:
>     bw  =  27.7 MB/sec        bw  =  27.4 MB/sec        bw  =  28 MB/sec
> tcp_bw:
>     bw  =  51.1 MB/sec        bw  =  51.3 MB/sec        bw  =  51.8 MB/sec
> tcp_bw:
>     bw  =  86.2 MB/sec        bw  =  84.4 MB/sec        bw  =  87.6 MB/sec
> tcp_bw:
>     bw  =  141 MB/sec         bw  =  142 MB/sec        bw  =  141 MB/sec
> tcp_bw:
>     bw  =  203 MB/sec         bw  =  201 MB/sec        bw  =  211 MB/sec
> tcp_bw:
>     bw  =  267 MB/sec         bw  =  250 MB/sec        bw  =  260 MB/sec
> tcp_bw:
>     bw  =  324 MB/sec         bw  =  295 MB/sec        bw  =  302 MB/sec
> tcp_bw:
>     bw  =  397 MB/sec         bw  =  363 MB/sec        bw  =  347 MB/sec
> tcp_bw:
>     bw  =  765 MB/sec         bw  =  510 MB/sec        bw  =  383 MB/sec
> tcp_bw:
>     bw  =  850 MB/sec         bw  =  710 MB/sec        bw  =  417 MB/sec
> tcp_bw:
>     bw  =  1.09 GB/sec        bw  =  860 MB/sec        bw  =  444 MB/sec
> tcp_bw:
>     bw  =  1.17 GB/sec        bw  =  979 MB/sec        bw  =  447 MB/sec
> tcp_bw:
>     bw  =  1.17 GB/sec        bw  =  1.07 GB/sec       bw  =  462 MB/sec
> tcp_lat:
>     latency  =  29.1 us       latency  =  28.7 us        latency  =  29.1 us
> tcp_lat:
>     latency  =  29 us         latency  =  28.8 us        latency  =  29 us
> tcp_lat:
>     latency  =  29 us         latency  =  28.8 us        latency  =  29 us
> tcp_lat:
>     latency  =  29 us         latency  =  28.9 us        latency  =  29 us
> tcp_lat:
>     latency  =  29.2 us       latency  =  28.9 us        latency  =  29.1 us
> tcp_lat:
>     latency  =  29.1 us       latency  =  29.1 us        latency  =  29.1 us
> tcp_lat:
>     latency  =  29.5 us       latency  =  29.5 us        latency  =  29.5 us
> tcp_lat:
>     latency  =  29.8 us       latency  =  29.8 us        latency  =  29.9 us
> tcp_lat:
>     latency  =  30.7 us       latency  =  30.7 us        latency  =  30.7 us
> tcp_lat:
>     latency  =  47.1 us       latency  =  46.2 us        latency  =  47.1 us
> tcp_lat:
>     latency  =  52.1 us       latency  =  52.3 us        latency  =  53.3 us
> tcp_lat:
>     latency  =  44 us         latency  =  43.8 us        latency  =  43.2 us
> tcp_lat:
>     latency  =  50 us         latency  =  46.6 us        latency  =  47.8 us
> tcp_lat:
>      latency  =  79.2 us      latency  =  77.9 us        latency  =  78.9 us
> tcp_lat:
>     latency  =  82.3 us       latency  =  81.7 us        latency  =  82.2 us
> tcp_lat:
>     latency  =  96.7 us       latency  =  90.8 us        latency  =  127 us
> tcp_lat:
>     latency  =  215 us        latency  =  177 us        latency  =  225 us
> udp_bw:
>     send_bw  =  422 KB/sec        send_bw  =  415 KB/sec        send_bw  =  405
> KB/sec
>     recv_bw  =  402 KB/sec        recv_bw  =  404 KB/sec        recv_bw  =  403
> KB/sec
> udp_bw:
>     send_bw  =  845 KB/sec        send_bw  =  835 KB/sec        send_bw  =  802
> KB/sec
>     recv_bw  =  831 KB/sec        recv_bw  =  804 KB/sec        recv_bw  =  802
> KB/sec
> udp_bw:
>     send_bw  =  1.69 MB/sec       send_bw  =  1.66 MB/sec        send_bw  =  1.62
> MB/sec
>     recv_bw  =  1.45 MB/sec       recv_bw  =  1.63 MB/sec        recv_bw  =   1.6
> MB/sec
> udp_bw:
>     send_bw  =  3.38 MB/sec       send_bw  =  3.33 MB/sec        send_bw  =  3.24
> MB/sec
>     recv_bw  =  3.32 MB/sec       recv_bw  =  3.25 MB/sec        recv_bw  =  3.24
> MB/sec
> udp_bw:
>     send_bw  =  6.76 MB/sec       send_bw  =  6.63 MB/sec        send_bw  =  6.47
> MB/sec
>     recv_bw  =  6.42 MB/sec       recv_bw  =  5.59 MB/sec        recv_bw  =  6.45
> MB/sec
> udp_bw:
>     send_bw  =  13.5 MB/sec       send_bw  =  13.3 MB/sec        send_bw  =  13
> MB/sec
>     recv_bw  =  13.4 MB/sec       recv_bw  =  12.1 MB/sec        recv_bw  =  13
> MB/sec
> udp_bw:
>     send_bw  =    27 MB/sec       send_bw  =  26.5 MB/sec        send_bw  =  25.9
> MB/sec
>     recv_bw  =  26.4 MB/sec       recv_bw  =  21.5 MB/sec        recv_bw  =  25.9
> MB/sec
> udp_bw:
>     send_bw  =  53.8 MB/sec       send_bw  =  52.9 MB/sec        send_bw  =  51.7
> MB/sec
>     recv_bw  =  49.1 MB/sec       recv_bw  =  47.6 MB/sec        recv_bw  =  51.1
> MB/sec
> udp_bw:
>     send_bw  =   108 MB/sec       send_bw  =  105 MB/sec         send_bw  =  102
> MB/sec
>     recv_bw  =  91.1 MB/sec       recv_bw  =  101 MB/sec         recv_bw  =  100
> MB/sec
> udp_bw:
>     send_bw  =  212 MB/sec        send_bw  =  208 MB/sec         send_bw  =  203
> MB/sec
>     recv_bw  =  204 MB/sec        recv_bw  =  204 MB/sec         recv_bw  =  169
> MB/sec
> udp_bw:
>     send_bw  =  414 MB/sec        send_bw  =  407 MB/sec         send_bw  =  398
> MB/sec
>     recv_bw  =  403 MB/sec        recv_bw  =  312 MB/sec         recv_bw  =  343
> MB/sec
> udp_bw:
>     send_bw  =  555 MB/sec        send_bw  =  561 MB/sec         send_bw  =  557
> MB/sec
>     recv_bw  =  354 MB/sec        recv_bw  =  368 MB/sec         recv_bw  =  360
> MB/sec
> udp_bw:
>     send_bw  =  877 MB/sec        send_bw  =  880 MB/sec         send_bw  =  868
> MB/sec
>     recv_bw  =  551 MB/sec        recv_bw  =  542 MB/sec         recv_bw  =  562
> MB/sec
> udp_bw:
>     send_bw  =  1.1 GB/sec        send_bw  =  1.08 GB/sec        send_bw  =  1.09
> GB/sec
>     recv_bw  =  805 MB/sec        recv_bw  =   785 MB/sec        recv_bw  =   766
> MB/sec
> udp_bw:
>     send_bw  =  1.21 GB/sec       send_bw  =  1.19 GB/sec        send_bw  =  1.22
> GB/sec
>     recv_bw  =   899 MB/sec       recv_bw  =   715 MB/sec        recv_bw  =   700
> MB/sec
> udp_bw:
>     send_bw  =  1.31 GB/sec       send_bw  =  1.31 GB/sec        send_bw  =  1.31
> GB/sec
>     recv_bw  =   614 MB/sec       recv_bw  =   622 MB/sec        recv_bw  =   661
> MB/sec
> udp_bw:
>     send_bw  =  0 bytes/sec       send_bw  =  0 bytes/sec        send_bw  =  0
> bytes/sec
>     recv_bw  =  0 bytes/sec       recv_bw  =  0 bytes/sec        recv_bw  =  0
> bytes/sec
> udp_lat:
>     latency  =  25.9 us        latency  =  26.5 us        latency  =  26.5 us
> udp_lat:
>     latency  =  26.3 us        latency  =  26.4 us        latency  =  26.5 us
> udp_lat:
>     latency  =  26 us          latency  =  26.4 us        latency  =  26.6 us
> udp_lat:
>     latency  =  26.1 us        latency  =  26.2 us        latency  =  26.4 us
> udp_lat:
>     latency  =  26.3 us        latency  =  26.5 us        latency  =  26.7 us
> udp_lat:
>     latency  =  26.3 us        latency  =  26.4 us        latency  =  26.5 us
> udp_lat:
>     latency  =  26.3 us        latency  =  26.7 us        latency  =  26.9 us
> udp_lat:
>     latency  =  27.1 us        latency  =  27.1 us        latency  =  27.2 us
> udp_lat:
>     latency  =  27.5 us        latency  =  27.8 us        latency  =  28.1 us
> udp_lat:
>     latency  =  28.7 us        latency  =  28.9 us        latency  =  29.1 us
> udp_lat:
>     latency  =  30.4 us        latency  =  30.5 us        latency  =  30.9 us
> udp_lat:
>     latency  =  41.2 us        latency  =  41.3 us        latency  =  41.1 us
> udp_lat:
>     latency  =  41.3 us        latency  =  41.5 us        latency  =  41.5 us
> udp_lat:
>     latency  =  64.4 us        latency  =  64.5 us        latency  =  64.2 us
> udp_lat:
>     latency  =  71.5 us        latency  =  71.5 us        latency  =  71.7 us
> udp_lat:
>     latency  =  120 us         latency  =  120 us         latency  =  120 us
> udp_lat:
>     latency  =  0 ns           latency  =  0 ns           latency  =  0 ns
> 
>  lib/netdev-dpdk.c | 79
> ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 75 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index f58e9be..0f91def 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -31,6 +31,7 @@
>  #include <rte_errno.h>
>  #include <rte_eth_ring.h>
>  #include <rte_ethdev.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
> @@ -992,8 +993,7 @@ netdev_dpdk_vhost_construct(struct netdev
> *netdev)
> 
>      err = rte_vhost_driver_disable_features(dev->vhost_id,
>                                  1ULL << VIRTIO_NET_F_HOST_TSO4
> -                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                                | 1ULL << VIRTIO_NET_F_CSUM);
> +                                | 1ULL << VIRTIO_NET_F_HOST_TSO6);
>      if (err) {
>          VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
>                   "port: %s\n", name);
> @@ -1455,6 +1455,76 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> *rxq)
>      rte_free(rx);
>  }
> 
> +static inline void
> +netdev_dpdk_vhost_refill_l4_cksum(const char *data, struct dp_packet
> *pkt,
> +                                  uint8_t l4_proto, bool is_ipv4)
> +{
> +    void *l3hdr = (void *)(data + pkt->mbuf.l2_len);
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
> +                                         pkt->mbuf.l2_len + pkt->mbuf.l3_len);
> +
> +        tcp_hdr->tcp_csum = 0;
> +        if (is_ipv4) {
> +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
> +        } else {
> +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
> +        }
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        struct udp_header *udp_hdr = (struct udp_header *)(data +
> +                                         pkt->mbuf.l2_len + pkt->mbuf.l3_len);
> +        /* do not recalculate udp cksum if it was 0 */
> +        if (udp_hdr->udp_csum != 0) {
> +            udp_hdr->udp_csum = 0;
> +            if (is_ipv4) {
> +                /*do not calculate udp cksum if it was a fragment IP*/
> +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> +                                      fragment_offset)) {
> +                    return;
> +                }
> +
> +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
> +            } else {
> +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
> +            }
> +        }
> +    }
> +
> +    pkt->mbuf.ol_flags &= ~PKT_TX_L4_MASK;
> +}
> +
> +static inline void
> +netdev_dpdk_vhost_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> +{
> +    int i;
> +
> +    for (i = 0; i < pkt_cnt; i++) {
> +        ovs_be16 dl_type;
> +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> +        const char *data = dp_packet_data(pkt);
> +        void *l3hdr = (char *)(data + pkt->mbuf.l2_len);
> +
> +        if (!(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> +            /* DPDK vhost tags PKT_TX_L4_MASK if a L4 packet need cksum. */
> +            continue;
> +        }
> +
> +        if (OVS_UNLIKELY(pkt->mbuf.l2_len == 0 || pkt->mbuf.l3_len == 0)) {
> +            continue;
> +        }
> +
> +        dl_type = *(ovs_be16 *)(data + pkt->mbuf.l2_len - sizeof dl_type);
> +        if (dl_type == htons(ETH_TYPE_IP)) {
> +            uint8_t l4_proto = ((struct ipv4_hdr *)l3hdr)->next_proto_id;
> +            netdev_dpdk_vhost_refill_l4_cksum(data, pkt, l4_proto, true);
> +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> +            uint8_t l4_proto = ((struct ipv6_hdr *)l3hdr)->proto;
> +            netdev_dpdk_vhost_refill_l4_cksum(data, pkt, l4_proto, false);
> +        }
> +    }
> +}
> +
>  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
>   * 'pkts', even in case of failure.
>   *
> @@ -1646,6 +1716,8 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq
> *rxq,
> 
>      dp_packet_batch_init_cutlen(batch);
>      batch->count = (int) nb_rx;
> +    netdev_dpdk_vhost_tx_csum(batch->packets, batch->count);
> +
>      return 0;
>  }
> 
> @@ -3288,8 +3360,7 @@ netdev_dpdk_vhost_client_reconfigure(struct
> netdev *netdev)
> 
>          err = rte_vhost_driver_disable_features(dev->vhost_id,
>                                      1ULL << VIRTIO_NET_F_HOST_TSO4
> -                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                                    | 1ULL << VIRTIO_NET_F_CSUM);
> +                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6);
>          if (err) {
>              VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
>                       "client port: %s\n", dev->up.name);
> --
> 1.8.3.1
Gao Zhenyu Sept. 6, 2017, 11:43 a.m. UTC | #2
Thanks for your testing; I reproduced it on my own machine.

I did the testing:

With "ethtool -K eth0 tx on", I get about 8.5Gb/s throughput 10% of the
time and 3.5Gb/s the other 90% of the time.
With "ethtool -K eth0 tx off", I get about 3.5Gb/s throughput 10% of the
time and 8.5Gb/s the other 90% of the time.

And this weird thing always happens when changing the tx mode, like this:

[root@localhost ~]# ethtool -K eth0 tx off
[root@localhost ~]# ethtool -K eth0 tx on  <---------------------tx cksum
is on now
[root@localhost ~]# iperf3 -c 10.100.85.246 -i 1 -t 30
Connecting to host 10.100.85.246, port 5201
[  4] local 10.100.85.245 port 56004 connected to 10.100.85.246 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   821 MBytes  6.89 Gbits/sec  846    214
KBytes
[  4]   1.00-2.00   sec  1.00 GBytes  8.63 Gbits/sec  676    305
KBytes
[  4]   2.00-3.00   sec  1.00 GBytes  8.62 Gbits/sec  839    402
KBytes
[  4]   3.00-4.00   sec  1.01 GBytes  8.69 Gbits/sec  787    403
KBytes
[  4]   4.00-5.00   sec   815 MBytes  6.84 Gbits/sec  1190    284
KBytes
[  4]   5.00-6.00   sec  1.01 GBytes  8.64 Gbits/sec  1247    547
KBytes
[  4]   6.00-7.00   sec  1.01 GBytes  8.65 Gbits/sec  765    260
KBytes
[  4]   7.00-8.00   sec  1.01 GBytes  8.65 Gbits/sec  1009    325
KBytes
[  4]   8.00-9.00   sec  1.01 GBytes  8.64 Gbits/sec  882    356
KBytes
[  4]   9.00-10.00  sec  1.00 GBytes  8.60 Gbits/sec  1102    327
KBytes
[  4]  10.00-11.00  sec  1022 MBytes  8.57 Gbits/sec  1250    370
KBytes
[  4]  11.00-12.00  sec  1022 MBytes  8.57 Gbits/sec  1128    337
KBytes
[  4]  12.00-13.00  sec  1.00 GBytes  8.60 Gbits/sec  923    407
KBytes
[  4]  13.00-14.00  sec  1.01 GBytes  8.65 Gbits/sec  678    334
KBytes
[  4]  14.00-15.00  sec  1.01 GBytes  8.64 Gbits/sec  1102    291
KBytes
[  4]  15.00-16.00  sec  1.01 GBytes  8.64 Gbits/sec  761    385
KBytes
[  4]  16.00-17.00  sec  1019 MBytes  8.55 Gbits/sec  1160   8.48
KBytes
[  4]  17.00-18.00  sec  1.01 GBytes  8.65 Gbits/sec  1264    516
KBytes
[  4]  18.00-19.00  sec  1.00 GBytes  8.60 Gbits/sec  1010    387
KBytes
[  4]  19.00-20.00  sec  1.01 GBytes  8.64 Gbits/sec  1047    445
KBytes
[  4]  20.00-21.00  sec  1.00 GBytes  8.61 Gbits/sec  986    321
KBytes
[  4]  21.00-22.00  sec  1.01 GBytes  8.64 Gbits/sec  1107    385
KBytes
[  4]  22.00-23.00  sec  1.01 GBytes  8.64 Gbits/sec  1036    530
KBytes
[  4]  23.00-24.00  sec  1.00 GBytes  8.63 Gbits/sec  1471    426
KBytes
[  4]  24.00-25.00  sec  1.01 GBytes  8.64 Gbits/sec  1392    386
KBytes
[  4]  25.00-26.00  sec  1.00 GBytes  8.61 Gbits/sec  1029    225
KBytes
[  4]  26.00-27.00  sec  1.01 GBytes  8.64 Gbits/sec  1246    420
KBytes
[  4]  27.00-28.00  sec  1024 MBytes  8.59 Gbits/sec  986    392
KBytes
[  4]  28.00-29.00  sec   821 MBytes  6.89 Gbits/sec  1124    325
KBytes
[  4]  29.00-30.00  sec  1.00 GBytes  8.60 Gbits/sec  1005    290
KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-30.00  sec  29.5 GBytes  8.45 Gbits/sec  31048
sender
[  4]   0.00-30.00  sec  29.5 GBytes  8.45 Gbits/sec
receiver

iperf Done.

Then I tested it again immediately, but performance became bad again:

[root@localhost ~]# iperf3 -c 10.100.85.246 -i 1 -t 30
Connecting to host 10.100.85.246, port 5201
[  4] local 10.100.85.245 port 56008 connected to 10.100.85.246 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   393 MBytes  3.29 Gbits/sec  607    335
KBytes
[  4]   1.00-2.00   sec  80.8 MBytes   678 Mbits/sec  817    372
KBytes
[  4]   2.00-3.00   sec   259 MBytes  2.18 Gbits/sec  582    544
KBytes
[  4]   3.00-4.00   sec   195 MBytes  1.63 Gbits/sec  403    370
KBytes
[  4]   4.00-5.00   sec   294 MBytes  2.46 Gbits/sec  587    346
KBytes
[  4]   5.00-6.00   sec   409 MBytes  3.43 Gbits/sec  719    409
KBytes
[  4]   6.00-7.00   sec   301 MBytes  2.52 Gbits/sec  762    411
KBytes
[  4]   7.00-8.00   sec   515 MBytes  4.32 Gbits/sec  654    205
KBytes
[  4]   8.00-9.00   sec   611 MBytes  5.12 Gbits/sec  756    431
KBytes
[  4]   9.00-10.00  sec   418 MBytes  3.51 Gbits/sec  646    436
KBytes
[  4]  10.00-11.00  sec   357 MBytes  2.99 Gbits/sec  651    337
KBytes
[  4]  11.00-12.00  sec   440 MBytes  3.69 Gbits/sec  575    404
KBytes
[  4]  12.00-13.00  sec   239 MBytes  2.00 Gbits/sec  480    399
KBytes
[  4]  13.00-14.00  sec   408 MBytes  3.42 Gbits/sec  634    400
KBytes
[  4]  14.00-15.00  sec   678 MBytes  5.69 Gbits/sec  869    462
KBytes
[  4]  15.00-16.00  sec   707 MBytes  5.93 Gbits/sec  987    335
KBytes
[  4]  16.00-17.00  sec   496 MBytes  4.16 Gbits/sec  742    332
KBytes
[  4]  17.00-18.00  sec   549 MBytes  4.60 Gbits/sec  468    385
KBytes
[  4]  18.00-19.00  sec   511 MBytes  4.28 Gbits/sec  721    291
KBytes
[  4]  19.00-20.00  sec   515 MBytes  4.32 Gbits/sec  957    386
KBytes
[  4]  20.00-21.00  sec   479 MBytes  4.02 Gbits/sec  595    373
KBytes
[  4]  21.00-22.00  sec   132 MBytes  1.11 Gbits/sec  442    455
KBytes
[  4]  22.00-23.00  sec   146 MBytes  1.22 Gbits/sec  575    345
KBytes
[  4]  23.00-24.00  sec   250 MBytes  2.10 Gbits/sec  822    365
KBytes
[  4]  24.00-25.00  sec   412 MBytes  3.46 Gbits/sec  448    399
KBytes
[  4]  25.00-26.00  sec   704 MBytes  5.91 Gbits/sec  674    346
KBytes
[  4]  26.00-27.00  sec   946 MBytes  7.93 Gbits/sec  741    281
KBytes
[  4]  27.00-28.00  sec   563 MBytes  4.72 Gbits/sec  732    311
KBytes
[  4]  28.00-29.00  sec   426 MBytes  3.57 Gbits/sec  936    527
KBytes
[  4]  29.00-30.00  sec  70.0 MBytes   587 Mbits/sec  366    246
KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-30.00  sec  12.2 GBytes  3.50 Gbits/sec  19948
sender
[  4]   0.00-30.00  sec  12.2 GBytes  3.49 Gbits/sec
receiver


I pinned the VM's CPUs and disabled the irqbalance service in the above testing.

So I suspect the root cause may be in the vhost implementation (maybe
changing the tx mode clears some cache or ...?) or in TCP congestion control?
I will do more testing on it.


Thanks
Zhenyu Gao



2017-09-05 23:11 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> >
> > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
> > So L4 packets's cksum were calculated in VM side but performance is not
> > good.
> > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput in
> > VM->phy->phy->VM situation. And it makes virtio-net frontend-driver
> > support NETIF_F_SG(feature scatter-gather) as well.
> >
> > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> > ---
> >
> > Here is some performance number:
>
> Hi Zhenyu,
>
> Thanks for the code changes since v3.
> I tested a VM to VM case using iperf and observed a performance
> degradation when the tx cksum was offloaded to the host:
>
> checksum in VM
> 0.0-30.0 sec  10.9 GBytes  3.12 Gbits/sec
> 0.0-30.0 sec  10.9 GBytes  3.11 Gbits/sec
> 0.0-30.0 sec  11.0 GBytes  3.16 Gbits/sec
>
> checksum in ovs dpdk
> 0.0-30.0 sec  7.52 GBytes  2.15 Gbits/sec
> 0.0-30.0 sec  7.12 GBytes  2.04 Gbits/sec
> 0.0-30.0 sec  8.17 GBytes  2.34 Gbits/sec
>
> I think for this feature to be enabled we need performance to be roughly the
> same or better for all use cases. For now I think the gap here is too big.
>
> If you wish to reproduce:
>
> 1 host, 2 VMs each with 1 vhost port and flows set up to switch packets
> from each vhost port to the other.
>
> VM1:
> ifconfig eth1 1.1.1.1/24 up
> ethtool -K eth2 tx on/off
> iperf -c 1.1.1.2 -i 1 -t 30
>
> VM2:
> ifconfig eth1 1.1.1.2/24 up
> ethtool -K eth1 tx on/off
> iperf -s -i 1
>
> Thanks,
> Ciara
>
> >
> > Setup:
> >
> >  qperf client
> > +---------+
> > |   VM    |
> > +---------+
> >      |
> >      |                          qperf server
> > +--------------+              +------------+
> > | vswitch+dpdk |              | bare-metal |
> > +--------------+              +------------+
> >        |                            |
> >        |                            |
> >       pNic---------PhysicalSwitch----
> >
> > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx
> on'
> > in VM side.
> >                       It offload cksum job to ovs-dpdk side.
> >
> > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off'
> in VM
> > side.
> >                 VM calculate cksum for tcp/udp packets.
> >
> > We can see huge improvment in TCP throughput if we leverage ovs-dpdk
> > cksum.
> >
> > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2
> host-qperf-server01
> > tcp_bw tcp_lat udp_bw udp_lat
> >   do cksum in ovs-dpdk          do cksum in VM             without this
> patch
> > tcp_bw:
> >     bw  =  1.9 MB/sec         bw  =  1.92 MB/sec        bw  =  1.95
> MB/sec
> > tcp_bw:
> >     bw  =  3.97 MB/sec        bw  =  3.99 MB/sec        bw  =  3.98
> MB/sec
> > tcp_bw:
> >     bw  =  7.75 MB/sec        bw  =  7.79 MB/sec        bw  =  7.89
> MB/sec
> > tcp_bw:
> >     bw  =  14.7 MB/sec        bw  =  14.7 MB/sec        bw  =  14.9
> MB/sec
> > tcp_bw:
> >     bw  =  27.7 MB/sec        bw  =  27.4 MB/sec        bw  =  28 MB/sec
> > tcp_bw:
> >     bw  =  51.1 MB/sec        bw  =  51.3 MB/sec        bw  =  51.8
> MB/sec
> > tcp_bw:
> >     bw  =  86.2 MB/sec        bw  =  84.4 MB/sec        bw  =  87.6
> MB/sec
> > tcp_bw:
> >     bw  =  141 MB/sec         bw  =  142 MB/sec        bw  =  141 MB/sec
> > tcp_bw:
> >     bw  =  203 MB/sec         bw  =  201 MB/sec        bw  =  211 MB/sec
> > tcp_bw:
> >     bw  =  267 MB/sec         bw  =  250 MB/sec        bw  =  260 MB/sec
> > tcp_bw:
> >     bw  =  324 MB/sec         bw  =  295 MB/sec        bw  =  302 MB/sec
> > tcp_bw:
> >     bw  =  397 MB/sec         bw  =  363 MB/sec        bw  =  347 MB/sec
> > tcp_bw:
> >     bw  =  765 MB/sec         bw  =  510 MB/sec        bw  =  383 MB/sec
> > tcp_bw:
> >     bw  =  850 MB/sec         bw  =  710 MB/sec        bw  =  417 MB/sec
> > tcp_bw:
> >     bw  =  1.09 GB/sec        bw  =  860 MB/sec        bw  =  444 MB/sec
> > tcp_bw:
> >     bw  =  1.17 GB/sec        bw  =  979 MB/sec        bw  =  447 MB/sec
> > tcp_bw:
> >     bw  =  1.17 GB/sec        bw  =  1.07 GB/sec       bw  =  462 MB/sec
> > tcp_lat:
> >     latency  =  29.1 us       latency  =  28.7 us        latency  =
> 29.1 us
> > tcp_lat:
> >     latency  =  29 us         latency  =  28.8 us        latency  =  29
> us
> > tcp_lat:
> >     latency  =  29 us         latency  =  28.8 us        latency  =  29
> us
> > tcp_lat:
> >     latency  =  29 us         latency  =  28.9 us        latency  =  29
> us
> > tcp_lat:
> >     latency  =  29.2 us       latency  =  28.9 us        latency  =
> 29.1 us
> > tcp_lat:
> >     latency  =  29.1 us       latency  =  29.1 us        latency  =
> 29.1 us
> > tcp_lat:
> >     latency  =  29.5 us       latency  =  29.5 us        latency  =
> 29.5 us
> > tcp_lat:
> >     latency  =  29.8 us       latency  =  29.8 us        latency  =
> 29.9 us
> > tcp_lat:
> >     latency  =  30.7 us       latency  =  30.7 us        latency  =
> 30.7 us
> > tcp_lat:
> >     latency  =  47.1 us       latency  =  46.2 us        latency  =
> 47.1 us
> > tcp_lat:
> >     latency  =  52.1 us       latency  =  52.3 us        latency  =
> 53.3 us
> > tcp_lat:
> >     latency  =  44 us         latency  =  43.8 us        latency  =
> 43.2 us
> > tcp_lat:
> >     latency  =  50 us         latency  =  46.6 us        latency  =
> 47.8 us
> > tcp_lat:
> >      latency  =  79.2 us      latency  =  77.9 us        latency  =
> 78.9 us
> > tcp_lat:
> >     latency  =  82.3 us       latency  =  81.7 us        latency  =
> 82.2 us
> > tcp_lat:
> >     latency  =  96.7 us       latency  =  90.8 us        latency  =  127
> us
> > tcp_lat:
> >     latency  =  215 us        latency  =  177 us        latency  =  225
> us
> > udp_bw:
> >     send_bw  =  422 KB/sec        send_bw  =  415 KB/sec        send_bw
> =  405
> > KB/sec
> >     recv_bw  =  402 KB/sec        recv_bw  =  404 KB/sec        recv_bw
> =  403
> > KB/sec
> > udp_bw:
> >     send_bw  =  845 KB/sec        send_bw  =  835 KB/sec        send_bw
> =  802
> > KB/sec
> >     recv_bw  =  831 KB/sec        recv_bw  =  804 KB/sec        recv_bw
> =  802
> > KB/sec
> > udp_bw:
> >     send_bw  =  1.69 MB/sec       send_bw  =  1.66 MB/sec
> send_bw  =  1.62
> > MB/sec
> >     recv_bw  =  1.45 MB/sec       recv_bw  =  1.63 MB/sec
> recv_bw  =   1.6
> > MB/sec
> > udp_bw:
> >     send_bw  =  3.38 MB/sec       send_bw  =  3.33 MB/sec
> send_bw  =  3.24
> > MB/sec
> >     recv_bw  =  3.32 MB/sec       recv_bw  =  3.25 MB/sec
> recv_bw  =  3.24
> > MB/sec
> > udp_bw:
> >     send_bw  =  6.76 MB/sec       send_bw  =  6.63 MB/sec
> send_bw  =  6.47
> > MB/sec
> >     recv_bw  =  6.42 MB/sec       recv_bw  =  5.59 MB/sec
> recv_bw  =  6.45
> > MB/sec
> > udp_bw:
> >     send_bw  =  13.5 MB/sec       send_bw  =  13.3 MB/sec
> send_bw  =  13
> > MB/sec
> >     recv_bw  =  13.4 MB/sec       recv_bw  =  12.1 MB/sec
> recv_bw  =  13
> > MB/sec
> > udp_bw:
> >     send_bw  =    27 MB/sec       send_bw  =  26.5 MB/sec
> send_bw  =  25.9
> > MB/sec
> >     recv_bw  =  26.4 MB/sec       recv_bw  =  21.5 MB/sec
> recv_bw  =  25.9
> > MB/sec
> > udp_bw:
> >     send_bw  =  53.8 MB/sec       send_bw  =  52.9 MB/sec
> send_bw  =  51.7
> > MB/sec
> >     recv_bw  =  49.1 MB/sec       recv_bw  =  47.6 MB/sec
> recv_bw  =  51.1
> > MB/sec
> > udp_bw:
> >     send_bw  =   108 MB/sec       send_bw  =  105 MB/sec
>  send_bw  =  102
> > MB/sec
> >     recv_bw  =  91.1 MB/sec       recv_bw  =  101 MB/sec
>  recv_bw  =  100
> > MB/sec
> > udp_bw:
> >     send_bw  =  212 MB/sec        send_bw  =  208 MB/sec
>  send_bw  =  203
> > MB/sec
> >     recv_bw  =  204 MB/sec        recv_bw  =  204 MB/sec
>  recv_bw  =  169
> > MB/sec
> > udp_bw:
> >     send_bw  =  414 MB/sec        send_bw  =  407 MB/sec
>  send_bw  =  398
> > MB/sec
> >     recv_bw  =  403 MB/sec        recv_bw  =  312 MB/sec
>  recv_bw  =  343
> > MB/sec
> > udp_bw:
> >     send_bw  =  555 MB/sec        send_bw  =  561 MB/sec
>  send_bw  =  557
> > MB/sec
> >     recv_bw  =  354 MB/sec        recv_bw  =  368 MB/sec
>  recv_bw  =  360
> > MB/sec
> > udp_bw:
> >     send_bw  =  877 MB/sec        send_bw  =  880 MB/sec
>  send_bw  =  868
> > MB/sec
> >     recv_bw  =  551 MB/sec        recv_bw  =  542 MB/sec
>  recv_bw  =  562
> > MB/sec
> > udp_bw:
> >     send_bw  =  1.1 GB/sec        send_bw  =  1.08 GB/sec
> send_bw  =  1.09
> > GB/sec
> >     recv_bw  =  805 MB/sec        recv_bw  =   785 MB/sec
> recv_bw  =   766
> > MB/sec
> > udp_bw:
> >     send_bw  =  1.21 GB/sec       send_bw  =  1.19 GB/sec
> send_bw  =  1.22
> > GB/sec
> >     recv_bw  =   899 MB/sec       recv_bw  =   715 MB/sec
> recv_bw  =   700
> > MB/sec
> > udp_bw:
> >     send_bw  =  1.31 GB/sec       send_bw  =  1.31 GB/sec
> send_bw  =  1.31
> > GB/sec
> >     recv_bw  =   614 MB/sec       recv_bw  =   622 MB/sec
> recv_bw  =   661
> > MB/sec
> > udp_bw:
> >     send_bw  =  0 bytes/sec       send_bw  =  0 bytes/sec
> send_bw  =  0
> > bytes/sec
> >     recv_bw  =  0 bytes/sec       recv_bw  =  0 bytes/sec
> recv_bw  =  0
> > bytes/sec
> > udp_lat:
> >     latency  =  25.9 us        latency  =  26.5 us        latency  =
> 26.5 us
> > udp_lat:
> >     latency  =  26.3 us        latency  =  26.4 us        latency  =
> 26.5 us
> > udp_lat:
> >     latency  =  26 us          latency  =  26.4 us        latency  =
> 26.6 us
> > udp_lat:
> >     latency  =  26.1 us        latency  =  26.2 us        latency  =
> 26.4 us
> > udp_lat:
> >     latency  =  26.3 us        latency  =  26.5 us        latency  =
> 26.7 us
> > udp_lat:
> >     latency  =  26.3 us        latency  =  26.4 us        latency  =
> 26.5 us
> > udp_lat:
> >     latency  =  26.3 us        latency  =  26.7 us        latency  =
> 26.9 us
> > udp_lat:
> >     latency  =  27.1 us        latency  =  27.1 us        latency  =
> 27.2 us
> > udp_lat:
> >     latency  =  27.5 us        latency  =  27.8 us        latency  =
> 28.1 us
> > udp_lat:
> >     latency  =  28.7 us        latency  =  28.9 us        latency  =
> 29.1 us
> > udp_lat:
> >     latency  =  30.4 us        latency  =  30.5 us        latency  =
> 30.9 us
> > udp_lat:
> >     latency  =  41.2 us        latency  =  41.3 us        latency  =
> 41.1 us
> > udp_lat:
> >     latency  =  41.3 us        latency  =  41.5 us        latency  =
> 41.5 us
> > udp_lat:
> >     latency  =  64.4 us        latency  =  64.5 us        latency  =
> 64.2 us
> > udp_lat:
> >     latency  =  71.5 us        latency  =  71.5 us        latency  =
> 71.7 us
> > udp_lat:
> >     latency  =  120 us         latency  =  120 us         latency  =
> 120 us
> > udp_lat:
> >     latency  =  0 ns           latency  =  0 ns           latency  =  0
> ns
> >
> >  lib/netdev-dpdk.c | 79
> > ++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 75 insertions(+), 4 deletions(-)
> >
> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > index f58e9be..0f91def 100644
> > --- a/lib/netdev-dpdk.c
> > +++ b/lib/netdev-dpdk.c
> > @@ -31,6 +31,7 @@
> >  #include <rte_errno.h>
> >  #include <rte_eth_ring.h>
> >  #include <rte_ethdev.h>
> > +#include <rte_ip.h>
> >  #include <rte_malloc.h>
> >  #include <rte_mbuf.h>
> >  #include <rte_meter.h>
> > @@ -992,8 +993,7 @@ netdev_dpdk_vhost_construct(struct netdev
> > *netdev)
> >
> >      err = rte_vhost_driver_disable_features(dev->vhost_id,
> >                                  1ULL << VIRTIO_NET_F_HOST_TSO4
> > -                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
> > -                                | 1ULL << VIRTIO_NET_F_CSUM);
> > +                                | 1ULL << VIRTIO_NET_F_HOST_TSO6);
> >      if (err) {
> >          VLOG_ERR("rte_vhost_driver_disable_features failed for vhost
> user "
> >                   "port: %s\n", name);
> > @@ -1455,6 +1455,76 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> > *rxq)
> >      rte_free(rx);
> >  }
> >
> > +static inline void
> > +netdev_dpdk_vhost_refill_l4_cksum(const char *data, struct dp_packet
> > *pkt,
> > +                                  uint8_t l4_proto, bool is_ipv4)
> > +{
> > +    void *l3hdr = (void *)(data + pkt->mbuf.l2_len);
> > +
> > +    if (l4_proto == IPPROTO_TCP) {
> > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
> > +                                         pkt->mbuf.l2_len +
> pkt->mbuf.l3_len);
> > +
> > +        tcp_hdr->tcp_csum = 0;
> > +        if (is_ipv4) {
> > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
> > +        } else {
> > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
> > +        }
> > +    } else if (l4_proto == IPPROTO_UDP) {
> > +        struct udp_header *udp_hdr = (struct udp_header *)(data +
> > +                                         pkt->mbuf.l2_len +
> pkt->mbuf.l3_len);
> > +        /* do not recalculate udp cksum if it was 0 */
> > +        if (udp_hdr->udp_csum != 0) {
> > +            udp_hdr->udp_csum = 0;
> > +            if (is_ipv4) {
> > +                /*do not calculate udp cksum if it was a fragment IP*/
> > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> > +                                      fragment_offset)) {
> > +                    return;
> > +                }
> > +
> > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr,
> udp_hdr);
> > +            } else {
> > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr,
> udp_hdr);
> > +            }
> > +        }
> > +    }
> > +
> > +    pkt->mbuf.ol_flags &= ~PKT_TX_L4_MASK;
> > +}
> > +
> > +static inline void
> > +netdev_dpdk_vhost_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> > +{
> > +    int i;
> > +
> > +    for (i = 0; i < pkt_cnt; i++) {
> > +        ovs_be16 dl_type;
> > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> > +        const char *data = dp_packet_data(pkt);
> > +        void *l3hdr = (char *)(data + pkt->mbuf.l2_len);
> > +
> > +        if (!(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> > +            /* DPDK vhost tags PKT_TX_L4_MASK if a L4 packet need
> cksum. */
> > +            continue;
> > +        }
> > +
> > +        if (OVS_UNLIKELY(pkt->mbuf.l2_len == 0 || pkt->mbuf.l3_len ==
> 0)) {
> > +            continue;
> > +        }
> > +
> > +        dl_type = *(ovs_be16 *)(data + pkt->mbuf.l2_len - sizeof
> dl_type);
> > +        if (dl_type == htons(ETH_TYPE_IP)) {
> > +            uint8_t l4_proto = ((struct ipv4_hdr
> *)l3hdr)->next_proto_id;
> > +            netdev_dpdk_vhost_refill_l4_cksum(data, pkt, l4_proto,
> true);
> > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> > +            uint8_t l4_proto = ((struct ipv6_hdr *)l3hdr)->proto;
> > +            netdev_dpdk_vhost_refill_l4_cksum(data, pkt, l4_proto,
> false);
> > +        }
> > +    }
> > +}
> > +
> >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
> ownership of
> >   * 'pkts', even in case of failure.
> >   *
> > @@ -1646,6 +1716,8 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq
> > *rxq,
> >
> >      dp_packet_batch_init_cutlen(batch);
> >      batch->count = (int) nb_rx;
> > +    netdev_dpdk_vhost_tx_csum(batch->packets, batch->count);
> > +
> >      return 0;
> >  }
> >
> > @@ -3288,8 +3360,7 @@ netdev_dpdk_vhost_client_reconfigure(struct
> > netdev *netdev)
> >
> >          err = rte_vhost_driver_disable_features(dev->vhost_id,
> >                                      1ULL << VIRTIO_NET_F_HOST_TSO4
> > -                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
> > -                                    | 1ULL << VIRTIO_NET_F_CSUM);
> > +                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6);
> >          if (err) {
> >              VLOG_ERR("rte_vhost_driver_disable_features failed for
> vhost user "
> >                       "client port: %s\n", dev->up.name);
> > --
> > 1.8.3.1
>
>
diff mbox series

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index f58e9be..0f91def 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -31,6 +31,7 @@ 
 #include <rte_errno.h>
 #include <rte_eth_ring.h>
 #include <rte_ethdev.h>
+#include <rte_ip.h>
 #include <rte_malloc.h>
 #include <rte_mbuf.h>
 #include <rte_meter.h>
@@ -992,8 +993,7 @@  netdev_dpdk_vhost_construct(struct netdev *netdev)
 
     err = rte_vhost_driver_disable_features(dev->vhost_id,
                                 1ULL << VIRTIO_NET_F_HOST_TSO4
-                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                | 1ULL << VIRTIO_NET_F_CSUM);
+                                | 1ULL << VIRTIO_NET_F_HOST_TSO6);
     if (err) {
         VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                  "port: %s\n", name);
@@ -1455,6 +1455,76 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+static inline void
+netdev_dpdk_vhost_refill_l4_cksum(const char *data, struct dp_packet *pkt,
+                                  uint8_t l4_proto, bool is_ipv4)
+{
+    void *l3hdr = (void *)(data + pkt->mbuf.l2_len);
+
+    if (l4_proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
+                                         pkt->mbuf.l2_len + pkt->mbuf.l3_len);
+
+        tcp_hdr->tcp_csum = 0;
+        if (is_ipv4) {
+            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
+        } else {
+            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
+        }
+    } else if (l4_proto == IPPROTO_UDP) {
+        struct udp_header *udp_hdr = (struct udp_header *)(data +
+                                         pkt->mbuf.l2_len + pkt->mbuf.l3_len);
+        /* do not recalculate udp cksum if it was 0 */
+        if (udp_hdr->udp_csum != 0) {
+            udp_hdr->udp_csum = 0;
+            if (is_ipv4) {
+                /*do not calculate udp cksum if it was a fragment IP*/
+                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
+                                      fragment_offset)) {
+                    return;
+                }
+
+                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
+            } else {
+                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
+            }
+        }
+    }
+
+    pkt->mbuf.ol_flags &= ~PKT_TX_L4_MASK;
+}
+
+static inline void
+netdev_dpdk_vhost_tx_csum(struct dp_packet **pkts, int pkt_cnt)
+{
+    int i;
+
+    for (i = 0; i < pkt_cnt; i++) {
+        ovs_be16 dl_type;
+        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
+        const char *data = dp_packet_data(pkt);
+        void *l3hdr = (char *)(data + pkt->mbuf.l2_len);
+
+        if (!(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
+            /* DPDK vhost tags PKT_TX_L4_MASK if a L4 packet need cksum. */
+            continue;
+        }
+
+        if (OVS_UNLIKELY(pkt->mbuf.l2_len == 0 || pkt->mbuf.l3_len == 0)) {
+            continue;
+        }
+
+        dl_type = *(ovs_be16 *)(data + pkt->mbuf.l2_len - sizeof dl_type);
+        if (dl_type == htons(ETH_TYPE_IP)) {
+            uint8_t l4_proto = ((struct ipv4_hdr *)l3hdr)->next_proto_id;
+            netdev_dpdk_vhost_refill_l4_cksum(data, pkt, l4_proto, true);
+        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+            uint8_t l4_proto = ((struct ipv6_hdr *)l3hdr)->proto;
+            netdev_dpdk_vhost_refill_l4_cksum(data, pkt, l4_proto, false);
+        }
+    }
+}
+
 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
  * 'pkts', even in case of failure.
  *
@@ -1646,6 +1716,8 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
 
     dp_packet_batch_init_cutlen(batch);
     batch->count = (int) nb_rx;
+    netdev_dpdk_vhost_tx_csum(batch->packets, batch->count);
+
     return 0;
 }
 
@@ -3288,8 +3360,7 @@  netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
 
         err = rte_vhost_driver_disable_features(dev->vhost_id,
                                     1ULL << VIRTIO_NET_F_HOST_TSO4
-                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                    | 1ULL << VIRTIO_NET_F_CSUM);
+                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6);
         if (err) {
             VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                      "client port: %s\n", dev->up.name);