[ovs-dev,v1] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side

Submitted by Gao Zhenyu on Aug. 1, 2017, 2:26 p.m.

Details

Message ID 20170801142631.10652-1-sysugaozhenyu@gmail.com
State New

Commit Message

Gao Zhenyu Aug. 1, 2017, 2:26 p.m.
Currently, the dpdk-vhost side in OVS doesn't support TCP/UDP TX cksum,
so L4 packets' cksums are calculated on the VM side, which hurts
performance.
Implementing TCP/UDP TX cksum on the ovs-dpdk side improves throughput
and lets the virtio-net frontend driver enable NETIF_F_SG as well.

Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
---

Here are some performance numbers:

Setup:

 qperf client
+---------+
|   VM    |
+---------+
     |
     |                          qperf server
+--------------+              +------------+
| vswitch+dpdk |              | bare-metal |
+--------------+              +------------+
       |                            |
       |                            |
      pNic---------PhysicalSwitch----

do cksum in ovs-dpdk: Apply this patch and run 'ethtool -K eth0 tx on' in the VM.
                      This offloads the cksum job to the ovs-dpdk side.

do cksum in VM: Apply this patch and run 'ethtool -K eth0 tx off' in the VM.
                The VM calculates the cksum for tcp/udp packets.
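
As a sanity check (not part of the patch), the guest's offload state can be
confirmed from inside the VM; 'eth0' matches the interface used in the
commands above:

    # inside the VM: confirm the toggle took effect
    ethtool -k eth0 | grep tx-checksumming
    tx-checksumming: on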

We can see a huge improvement in TCP throughput if we leverage the ovs-dpdk cksum.

[root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01 tcp_bw tcp_lat udp_bw udp_lat
  do cksum in ovs-dpdk          do cksum in VM             without this patch
tcp_bw:
    bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95 MB/sec
tcp_bw:
    bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98 MB/sec
tcp_bw:
    bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19 MB/sec
tcp_bw:
    bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7 MB/sec
tcp_bw:
    bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7 MB/sec
tcp_bw:
    bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9 MB/sec
tcp_bw:
    bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1 MB/sec
tcp_bw: 
    bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec
tcp_bw:
    bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec
tcp_bw:
    bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec
tcp_bw:
    bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec
tcp_bw:
    bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec
tcp_bw:
    bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec
tcp_bw: 
    bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec
tcp_bw:
    bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec
tcp_bw:
    bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec
tcp_bw:
    bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec
tcp_lat:
    latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
tcp_lat:
    latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5 us
tcp_lat:
    latency  =  29 us         latency  =  29.3 us       latency  =  29.6 us
tcp_lat:
    latency  =  29 us         latency  =  29.4 us       latency  =  29.5 us
tcp_lat:
    latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
tcp_lat:
    latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7 us
tcp_lat:
    latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us
tcp_lat:
    latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2 us
tcp_lat:
    latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us
tcp_lat:
    latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2 us
tcp_lat:
    latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5 us
tcp_lat:
    latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6 us
tcp_lat:
     latency  =  47.6 us      latency  =  48 us         latency  =  48.1 us
tcp_lat:
    latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8 us
tcp_lat:
    latency  =  82.8 us       latency  =  82.3 us       latency  =  116 us
tcp_lat:
    latency  =  94.8 us       latency  =  94.2 us       latency  =  134 us
tcp_lat:
    latency  =  167 us        latency  =  197 us        latency  =  172 us
udp_bw:
    send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403 KB/sec
    recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400 KB/sec
udp_bw:
    send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810 KB/sec
    recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807 KB/sec
udp_bw:
    send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63 MB/sec
    recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63 MB/sec
udp_bw:
    send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26 MB/sec
    recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82 MB/sec
udp_bw:
    send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45 MB/sec
    recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45 MB/sec
udp_bw:
    send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13 MB/sec
    recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13 MB/sec
udp_bw:
    send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9 MB/sec
    recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7 MB/sec
udp_bw:
    send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52 MB/sec
    recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2 MB/sec
udp_bw:
    send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103 MB/sec
    recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100 MB/sec
udp_bw:
    send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205 MB/sec
    recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202 MB/sec
udp_bw:
    send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401 MB/sec
    recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358 MB/sec
udp_bw:
    send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557 MB/sec
    recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362 MB/sec
udp_bw:
    send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863 MB/sec
    recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584 MB/sec
udp_bw:
    send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08 GB/sec
    recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793 MB/sec
udp_bw: 
    send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19 GB/sec
    recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673 MB/sec
udp_bw:
    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec
    recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762 MB/sec
udp_bw:
    send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec
    recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec
udp_lat:
    latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4 us
udp_lat:
    latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7 us
udp_lat:
    latency  =  27 us         latency  =  26.7 us       latency  =  26.6 us
udp_lat:
    latency  =  27 us         latency  =  26.9 us       latency  =  26.7 us
udp_lat:
    latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3 us
udp_lat:
    latency  =  28.1 us       latency  =  28 us         latency  =  28 us
udp_lat:
     latency  =  29.4 us      latency  =  29.2 us       latency  =  29.2 us
udp_lat:
    latency  =  31 us         latency  =  31 us         latency  =  30.8 us
udp_lat:
    latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3 us
udp_lat:
    latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5 us
udp_lat:
    latency  =  64.9 us       latency  =  65 us         latency  =  65 us
udp_lat:
    latency  =  72.3 us       latency  =  72 us         latency  =  72 us
udp_lat:
    latency  =  121 us        latency  =  122 us        latency  =  122 us
udp_lat: 
    latency  =  0 ns          latency  =  0 ns          latency  =  0 ns


 lib/netdev-dpdk.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 2 deletions(-)
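
For readers unfamiliar with the offload, the work being moved into ovs-dpdk is
the standard Internet checksum over the IPv4/IPv6 pseudo-header plus the L4
header and payload; the patch delegates it to DPDK's rte_ipv4_udptcp_cksum()
and rte_ipv6_udptcp_cksum().  The following standalone sketch of the IPv4 case
is illustrative only and is not code from the patch:

    #include <stddef.h>
    #include <stdint.h>

    /* Sum a byte buffer as big-endian 16-bit words (RFC 1071). */
    static uint32_t
    csum_add(uint32_t sum, const uint8_t *data, size_t len)
    {
        while (len > 1) {
            sum += ((uint32_t) data[0] << 8) | data[1];
            data += 2;
            len -= 2;
        }
        if (len) {
            sum += (uint32_t) data[0] << 8;  /* odd trailing byte, zero padded */
        }
        return sum;
    }

    /* TCP/UDP checksum over the IPv4 pseudo-header plus the L4 header and
     * payload.  The caller zeroes the checksum field first (as the patch
     * does) and stores the result back in network byte order. */
    static uint16_t
    ipv4_l4_cksum(const uint8_t src[4], const uint8_t dst[4],
                  uint8_t proto, const uint8_t *l4, size_t l4_len)
    {
        uint8_t pseudo_tail[4] = { 0, proto,
                                   (uint8_t) (l4_len >> 8), (uint8_t) l4_len };
        uint32_t sum = 0;

        sum = csum_add(sum, src, 4);
        sum = csum_add(sum, dst, 4);
        sum = csum_add(sum, pseudo_tail, 4);
        sum = csum_add(sum, l4, l4_len);
        while (sum >> 16) {
            sum = (sum & 0xffff) + (sum >> 16);  /* fold carries */
        }
        return (uint16_t) ~sum;  /* a zero result is sent as 0xffff for UDP */
    }

A UDP checksum of 0 on IPv4 means "no checksum", which is why the patch skips
recalculation when udp_csum is already 0.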

Comments

Ciara Loftus Aug. 4, 2017, 2:52 p.m.
> 
> Currently, the dpdk-vhost side in OVS doesn't support TCP/UDP TX cksum,
> so L4 packets' cksums are calculated on the VM side, which hurts
> performance.
> Implementing TCP/UDP TX cksum on the ovs-dpdk side improves throughput
> and lets the virtio-net frontend driver enable NETIF_F_SG as well.
> 
> Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> ---
> 
> Here is some performance number:
> 
> Setup:
> 
>  qperf client
> +---------+
> |   VM    |
> +---------+
>      |
>      |                          qperf server
> +--------------+              +------------+
> | vswitch+dpdk |              | bare-metal |
> +--------------+              +------------+
>        |                            |
>        |                            |
>       pNic---------PhysicalSwitch----
> 
> do cksum in ovs-dpdk: Apply this patch and run 'ethtool -K eth0 tx on'
> in the VM.
>                       This offloads the cksum job to the ovs-dpdk side.
> 
> do cksum in VM: Apply this patch and run 'ethtool -K eth0 tx off' in
> the VM.
>                 The VM calculates the cksum for tcp/udp packets.
> 
> We can see a huge improvement in TCP throughput if we leverage the
> ovs-dpdk cksum.

Hi Zhenyu,

Thanks for the patch. I tested some alternative use cases and unfortunately I see a degradation for phy-phy and phy-vm-phy topologies.
Here are my results:

phy-vm-phy:
without patch: 0.871Mpps
with patch (offload=on): 0.877Mpps
with patch (offload=off): 0.891Mpps

phy-phy:
without patch: 13.581Mpps
with patch: 13.055Mpps

The half a million pps drop in the second test case is concerning to me, but
not surprising, since we're adding extra complexity to netdev_dpdk_send().
Could this be avoided? Would it make sense to put this functionality somewhere
else, e.g. vhost receive?
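
As a rough illustration of that alternative (placement and variable names are
assumptions, not a tested change), the fix-up could run where guest packets
enter OVS, e.g. right after the burst is dequeued in
netdev_dpdk_vhost_rxq_recv(), so batches that never touched a vhost port would
skip the loop entirely:

    /* Hypothetical placement; vid/qid/mp stand in for the device's vhost
     * id, queue id and mbuf mempool. */
    nb_rx = rte_vhost_dequeue_burst(vid, qid, mp,
                                    (struct rte_mbuf **) batch->packets,
                                    NETDEV_MAX_BURST);
    batch->count = nb_rx;
    /* Complete L4 checksums while the packets are known to come from a
     * virtio-net frontend that requested offload. */
    netdev_prepare_tx_csum(batch->packets, batch->count);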

On another note I have a general concern. I understand similar functionality is present in the DPDK vhost sample app. I wonder if it would be feasible for this to be implemented in the DPDK vhost library and leveraged here, rather than having two implementations in two separate code bases.

I have some other comments inline.

Thanks,
Ciara

> 
> [qperf results and diffstat identical to those above; snipped]
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index ea17b97..d27d615 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -28,6 +28,7 @@
>  #include <rte_errno.h>
>  #include <rte_eth_ring.h>
>  #include <rte_ethdev.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
> @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> *rxq)
>      rte_free(rx);
>  }
> 
> +static inline void
> +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
> +                       uint8_t l4_proto, bool is_ipv4)
> +{
> +    void *l3hdr = (void *)(data + pkt->l3_ofs);
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt->l4_ofs);
> +
> +        pkt->mbuf.l2_len = pkt->l3_ofs;
> +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> +        tcp_hdr->tcp_csum = 0;
> +        if (is_ipv4) {
> +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
> +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
> +        } else {
> +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
> +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
> +        }
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt-
> >l4_ofs);
> +        /* do not recalculate udp cksum if it was 0 */
> +        if (udp_hdr->udp_csum != 0) {
> +            pkt->mbuf.l2_len = pkt->l3_ofs;
> +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> +            udp_hdr->udp_csum = 0;
> +            if (is_ipv4) {
> +                /*do not calculate udp cksum if it was a fragment IP*/
> +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> +                                      fragment_offset)) {
> +                    return;
> +                }
> +
> +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
> +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
> +            } else {
> +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
> +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
> +            }
> +        }
> +    }
> +}
> +
> +static inline void
> +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> +{
> +    int i;
> +
> +    for (i = 0; i < pkt_cnt; i++) {
> +        ovs_be16 dl_type;
> +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> +        const char *data = dp_packet_data(pkt);
> +        void *l3hdr = (char *)(data + pkt->l3_ofs);
> +
> +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
> +            continue;
> +        }
> +        /* This take a assumption that it should be a vhost packet if this
> +         * packet was allocated by DPDK pool and try sending to pNic. */
> +        if (pkt->source == DPBUF_DPDK &&
> +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need cksum
> +            continue;
> +        }

The comments here could be formatted better. Suggest combining both into one comment before the 'if'.
Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.
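
One possible combined wording, purely as a sketch (not text from the patch or
the review):

    /* Assume the packet came from a vhost port if its mbuf was allocated
     * from a DPDK pool.  DPDK vhost-user sets a PKT_TX_L4_MASK flag when
     * the guest requested L4 checksum offload, so skip DPDK-sourced
     * packets that do not carry it. */
    if (pkt->source == DPBUF_DPDK &&
        !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
        continue;
    }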

> +
> +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
> +        if (dl_type == htons(ETH_TYPE_IP)) {
> +            netdev_refill_l4_cksum(data, pkt,
> +                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,
> +                                   true);
> +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> +            netdev_refill_l4_cksum(data, pkt,
> +                                   ((struct ipv6_hdr *)l3hdr)->proto,
> +                                   false);
> +        }
> +    }
> +}
> +
>  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
>   * 'pkts', even in case of failure.
>   *
> @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,
> int qid,
>          return;
>      }
> 
> +    netdev_prepare_tx_csum(batch->packets, batch->count);

Putting this here assumes we only prepare the csum for vhost -> dpdk or vhost -> ring cases. What about vhost -> vhost?

> +
>      if (OVS_UNLIKELY(concurrent_txq)) {
>          qid = qid % dev->up.n_txq;
>          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
>      if (ovsthread_once_start(&once)) {
>          rte_vhost_driver_callback_register(&virtio_net_device_ops);
>          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                                  | 1ULL << VIRTIO_NET_F_CSUM);
> +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
>          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> 
>          ovsthread_once_done(&once);
> --
> 1.8.3.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Gao Zhenyu Aug. 4, 2017, 11:57 p.m.
Hi Loftus,

Thanks for testing and the comments!
Can you show more details about your phy-vm-phy and phy-phy setups and
testing steps? Then I can reproduce them to see if I can solve this pps problem.

BTW, how about throughput? Did you see an improvement?

I would like to implement the vhost->vhost part.

Thanks
Zhenyu Gao

2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> [quoted review and patch snipped]
Ciara Loftus Aug. 8, 2017, 9:53 a.m.
> Hi Loftus,
>
> Thanks for testing and the comments!
> Can you show more details about your phy-vm-phy and phy-phy setups and
> testing steps? Then I can reproduce them to see if I can solve this pps
> problem.

You're welcome. I forgot to mention my tests were with 64B packets.

For phy-phy the setup is a single host with 2 dpdk physical ports and 1 flow rule port1 -> port2.
See figure 3 here: https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-opnfv-04#section-4

For the phy-vm-phy the setup is a single host with 2 dpdk physical ports and 2 vhostuser ports with flow rules:
Dpdk1 -> vhost 1 & vhost2 -> dpdk2
IP rules are set up in the VM to route packets from vhost1 to vhost 2.
See figure 4 in the link above.
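
For reference, with hypothetical bridge and port names (br0, dpdk1/dpdk2,
vhostuser1/vhostuser2), the flow rules for the two topologies might look like:

    # phy-phy: one rule, port 1 -> port 2
    ovs-ofctl add-flow br0 in_port=dpdk1,actions=output:dpdk2

    # phy-vm-phy: dpdk1 -> vhost1 and vhost2 -> dpdk2
    ovs-ofctl add-flow br0 in_port=dpdk1,actions=output:vhostuser1
    ovs-ofctl add-flow br0 in_port=vhostuser2,actions=output:dpdk2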

> BTW, how about throughput? Did you see an improvement?

By throughput if you mean 0% packet loss, I did not test this.

Thanks,
Ciara

> I would like to implement the vhost->vhost part.
>
> Thanks
> Zhenyu Gao

> 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
> [earlier messages quoted in full; snipped]
Gao Zhenyu Aug. 10, 2017, 4:35 a.m.
I see; for flows in a phy-phy setup, the cksum should not be calculated.
I will revise my patch to do the cksum for vhost ports only, and I will send
a new patch next week.
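
A minimal sketch of that idea, assuming the vhost receive path in
lib/netdev-dpdk.c has roughly the shape below (only netdev_prepare_tx_csum()
and the batch fields come from the patch; the surrounding function is an
assumption, not the posted v2 code):

    static int
    netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
                               struct dp_packet_batch *batch)
    {
        /* ... rte_vhost_dequeue_burst() has filled batch->packets ... */

        /* Fill L4 checksums once, as the packets leave the guest, so that
         * vhost -> dpdk, vhost -> ring and vhost -> vhost all see valid
         * checksums while the phy-phy transmit path is left untouched. */
        netdev_prepare_tx_csum(batch->packets, batch->count);

        /* ... update stats and hand the batch to the datapath ... */
        return 0;
    }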

Thanks
Zhenyu Gao

2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> >
> > Hi Loftus,
> >
> > Thanks for testing and the comments!
> > Can you show more details about your phy-vm-phy, phy-phy setup and
> > testing steps? Then I can reproduce it to see if I can solve this pps
> > problem.
>
> You're welcome. I forgot to mention my tests were with 64B packets.
>
> For phy-phy the setup is a single host with 2 dpdk physical ports and 1
> flow rule port1 -> port2.
> See figure 3 here: https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-opnfv-04#section-4
>
> For the phy-vm-phy the setup is a single host with 2 dpdk physical ports
> and 2 vhostuser ports with flow rules:
> Dpdk1 -> vhost 1 & vhost2 -> dpdk2
> IP rules are set up in the VM to route packets from vhost1 to vhost 2.
> See figure 4 in the link above.
>
> >
> > BTW, how about throughput, did you see an improvement?
>
> By throughput if you mean 0% packet loss, I did not test this.
>
> Thanks,
> Ciara
>
> >
> > I would like to implement vhost->vhost part.
> >
> > Thanks
> > Zhenyu Gao
> >
> > 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
> > >
> > > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
> > > So L4 packets's cksum were calculated in VM side but performance is not
> > > good.
> > > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
> > > makes virtio-net frontend-driver support NETIF_F_SG as well
> > >
> > > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> > > ---
> > >
> > > Here is some performance number:
> > >
> > > Setup:
> > >
> > >  qperf client
> > > +---------+
> > > |   VM    |
> > > +---------+
> > >      |
> > >      |                          qperf server
> > > +--------------+              +------------+
> > > | vswitch+dpdk |              | bare-metal |
> > > +--------------+              +------------+
> > >        |                            |
> > >        |                            |
> > >       pNic---------PhysicalSwitch----
> > >
> > > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0
> tx
> > on'
> > > in VM side.
> > >                       It offload cksum job to ovs-dpdk side.
> > >
> > > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx
> off' in
> > VM
> > > side.
> > >                 VM calculate cksum for tcp/udp packets.
> > >
> > > We can see huge improvment in TCP throughput if we leverage ovs-dpdk
> > > cksum.
> > Hi Zhenyu,
> >
> > Thanks for the patch. I tested some alternative use cases and
> unfortunately I
> > see a degradation for phy-phy and phy-vm-phy topologies.
> > Here are my results:
> >
> > phy-vm-phy:
> > without patch: 0.871Mpps
> > with patch (offload=on): 0.877Mpps
> > with patch (offload=off): 0.891Mpps
> >
> > phy-phy:
> > without patch: 13.581Mpps
> > with patch: 13.055Mpps
> >
> > The half a million pps drop for the second test case is concerning to me
> but
> > not surprising since we're adding extra complexity to netdev_dpdk_send()
> > Could this be avoided? Would it make sense to put this functionality
> > somewhere else eg. vhost receive?
> >
> > On another note I have a general concern. I understand similar
> functionality
> > is present in the DPDK vhost sample app. I wonder if it would be
> feasible for
> > this to be implemented in the DPDK vhost library and leveraged here,
> rather
> > than having two implementations in two separate code bases.
> >
> > I have some other comments inline.
> >
> > Thanks,
> > Ciara
> >
> > >
> > > [qperf command line and full tcp/udp bandwidth and latency tables snipped; they repeat the figures from the original patch mail verbatim.]
> > >
> > >
> > >  lib/netdev-dpdk.c | 84
> > > +++++++++++++++++++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 82 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > > index ea17b97..d27d615 100644
> > > --- a/lib/netdev-dpdk.c
> > > +++ b/lib/netdev-dpdk.c
> > > @@ -28,6 +28,7 @@
> > >  #include <rte_errno.h>
> > >  #include <rte_eth_ring.h>
> > >  #include <rte_ethdev.h>
> > > +#include <rte_ip.h>
> > >  #include <rte_malloc.h>
> > >  #include <rte_mbuf.h>
> > >  #include <rte_meter.h>
> > > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> > > *rxq)
> > >      rte_free(rx);
> > >  }
> > >
> > > +static inline void
> > > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
> > > +                       uint8_t l4_proto, bool is_ipv4)
> > > +{
> > > +    void *l3hdr = (void *)(data + pkt->l3_ofs);
> > > +
> > > +    if (l4_proto == IPPROTO_TCP) {
> > > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt-
> > >l4_ofs);
> > > +
> > > +        pkt->mbuf.l2_len = pkt->l3_ofs;
> > > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > > +        tcp_hdr->tcp_csum = 0;
> > > +        if (is_ipv4) {
> > > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
> > > +        } else {
> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
> > > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
> > > +        }
> > > +    } else if (l4_proto == IPPROTO_UDP) {
> > > +        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt-
> > > >l4_ofs);
> > > +        /* do not recalculate udp cksum if it was 0 */
> > > +        if (udp_hdr->udp_csum != 0) {
> > > +            pkt->mbuf.l2_len = pkt->l3_ofs;
> > > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > > +            udp_hdr->udp_csum = 0;
> > > +            if (is_ipv4) {
> > > +                /*do not calculate udp cksum if it was a fragment IP*/
> > > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> > > +                                      fragment_offset)) {
> > > +                    return;
> > > +                }
> > > +
> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
> > > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr,
> udp_hdr);
> > > +            } else {
> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
> > > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr,
> udp_hdr);
> > > +            }
> > > +        }
> > > +    }
> > > +}
> > > +
> > > +static inline void
> > > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> > > +{
> > > +    int i;
> > > +
> > > +    for (i = 0; i < pkt_cnt; i++) {
> > > +        ovs_be16 dl_type;
> > > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> > > +        const char *data = dp_packet_data(pkt);
> > > +        void *l3hdr = (char *)(data + pkt->l3_ofs);
> > > +
> > > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
> > > +            continue;
> > > +        }
> > > +        /* This take a assumption that it should be a vhost packet if
> this
> > > +         * packet was allocated by DPDK pool and try sending to pNic.
> */
> > > +        if (pkt->source == DPBUF_DPDK &&
> > > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> > > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need
> > cksum
> > > +            continue;
> > > +        }
> > The comments here could be formatted better. Suggest combining both into
> > one comment before the 'if'.
> > Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.
> >
> > > +
> > > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
> > > +        if (dl_type == htons(ETH_TYPE_IP)) {
> > > +            netdev_refill_l4_cksum(data, pkt,
> > > +                                   ((struct ipv4_hdr
> *)l3hdr)->next_proto_id,
> > > +                                   true);
> > > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> > > +            netdev_refill_l4_cksum(data, pkt,
> > > +                                   ((struct ipv6_hdr *)l3hdr)->proto,
> > > +                                   false);
> > > +        }
> > > +    }
> > > +}
> > > +
> > >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
> ownership of
> > >   * 'pkts', even in case of failure.
> > >   *
> > > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,
> > > int qid,
> > >          return;
> > >      }
> > >
> > > +    netdev_prepare_tx_csum(batch->packets, batch->count);
> >
> > Putting this here assumes we only prepare the csum for vhost -> dpdk or
> > vhost -> ring cases. What about vhost -> vhost?
> >
> > > +
> > >      if (OVS_UNLIKELY(concurrent_txq)) {
> > >          qid = qid % dev->up.n_txq;
> > >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> > > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
> > >      if (ovsthread_once_start(&once)) {
> > >          rte_vhost_driver_callback_register(&virtio_net_device_ops);
> > >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> > > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
> > > -                                  | 1ULL << VIRTIO_NET_F_CSUM);
> > > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
> > >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> > >
> > >          ovsthread_once_done(&once);
> > > --
> > > 1.8.3.1
> > >
> > > _______________________________________________
> > > dev mailing list
> > > dev@openvswitch.org
> > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Gao Zhenyu Aug. 16, 2017, 1:38 p.m.
Hi Loftus,

   I have submitted a new version; please see
https://patchwork.ozlabs.org/patch/802070/
   It moves the cksum to the vhost receive side.

Thanks
Zhenyu Gao

2017-08-10 12:35 GMT+08:00 Gao Zhenyu <sysugaozhenyu@gmail.com>:

> I see; for flows in a phy-phy setup, the cksum should not be calculated.
> I will revise my patch to do the cksum for vhost ports only, and I will send
> a new patch next week.
>
> Thanks
> Zhenyu Gao
>
> 2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
>
>> >
>> > Hi Loftus,
>> >
>> > Thanks for testing and the comments!
>> > Can you show more details about your phy-vm-phy, phy-phy setup and
>> > testing steps? Then I can reproduce it to see if I can solve this pps
>> > problem.
>>
>> You're welcome. I forgot to mention my tests were with 64B packets.
>>
>> For phy-phy the setup is a single host with 2 dpdk physical ports and 1
>> flow rule port1 -> port2.
>> See figure 3 here: https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-opnfv-04#section-4
>>
>> For the phy-vm-phy the setup is a single host with 2 dpdk physical ports
>> and 2 vhostuser ports with flow rules:
>> Dpdk1 -> vhost 1 & vhost2 -> dpdk2
>> IP rules are set up in the VM to route packets from vhost1 to vhost 2.
>> See figure 4 in the link above.
>>
>> >
>> > BTW, how about throughput, did you see an improvement?
>>
>> By throughput if you mean 0% packet loss, I did not test this.
>>
>> Thanks,
>> Ciara
>>
>> >
>> > I would like to implement vhost->vhost part.
>> >
>> > Thanks
>> > Zhenyu Gao
>> >
>> > 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
>> > >
>> > > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx
>> cksum.
>> > > So L4 packets's cksum were calculated in VM side but performance is
>> not
>> > > good.
>> > > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
>> > > makes virtio-net frontend-driver support NETIF_F_SG as well
>> > >
>> > > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
>> > > ---
>> > >
>> > > Here is some performance number:
>> > >
>> > > Setup:
>> > >
>> > >  qperf client
>> > > +---------+
>> > > |   VM    |
>> > > +---------+
>> > >      |
>> > >      |                          qperf server
>> > > +--------------+              +------------+
>> > > | vswitch+dpdk |              | bare-metal |
>> > > +--------------+              +------------+
>> > >        |                            |
>> > >        |                            |
>> > >       pNic---------PhysicalSwitch----
>> > >
>> > > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0
>> tx
>> > on'
>> > > in VM side.
>> > >                       It offload cksum job to ovs-dpdk side.
>> > >
>> > > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx
>> off' in
>> > VM
>> > > side.
>> > >                 VM calculate cksum for tcp/udp packets.
>> > >
>> > > We can see huge improvment in TCP throughput if we leverage ovs-dpdk
>> > > cksum.
>> > Hi Zhenyu,
>> >
>> > Thanks for the patch. I tested some alternative use cases and
>> unfortunately I
>> > see a degradation for phy-phy and phy-vm-phy topologies.
>> > Here are my results:
>> >
>> > phy-vm-phy:
>> > without patch: 0.871Mpps
>> > with patch (offload=on): 0.877Mpps
>> > with patch (offload=off): 0.891Mpps
>> >
>> > phy-phy:
>> > without patch: 13.581Mpps
>> > with patch: 13.055Mpps
>> >
>> > The half a million pps drop for the second test case is concerning to
>> me but
>> > not surprising since we're adding extra complexity to netdev_dpdk_send()
>> > Could this be avoided? Would it make sense to put this functionality
>> > somewhere else eg. vhost receive?
>> >
>> > On another note I have a general concern. I understand similar
>> functionality
>> > is present in the DPDK vhost sample app. I wonder if it would be
>> feasible for
>> > this to be implemented in the DPDK vhost library and leveraged here,
>> rather
>> > than having two implementations in two separate code bases.
>> >
>> > I have some other comments inline.
>> >
>> > Thanks,
>> > Ciara
>> >
>> > >
>> > > [qperf command line and full tcp/udp bandwidth and latency tables snipped; they repeat the figures from the original patch mail verbatim.]
>> > >
>> > >
>> > >  lib/netdev-dpdk.c | 84
>> > > +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>> > >  1 file changed, 82 insertions(+), 2 deletions(-)
>> > >
>> > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> > > index ea17b97..d27d615 100644
>> > > --- a/lib/netdev-dpdk.c
>> > > +++ b/lib/netdev-dpdk.c
>> > > @@ -28,6 +28,7 @@
>> > >  #include <rte_errno.h>
>> > >  #include <rte_eth_ring.h>
>> > >  #include <rte_ethdev.h>
>> > > +#include <rte_ip.h>
>> > >  #include <rte_malloc.h>
>> > >  #include <rte_mbuf.h>
>> > >  #include <rte_meter.h>
>> > > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
>> > > *rxq)
>> > >      rte_free(rx);
>> > >  }
>> > >
>> > > +static inline void
>> > > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
>> > > +                       uint8_t l4_proto, bool is_ipv4)
>> > > +{
>> > > +    void *l3hdr = (void *)(data + pkt->l3_ofs);
>> > > +
>> > > +    if (l4_proto == IPPROTO_TCP) {
>> > > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
>> pkt-
>> > >l4_ofs);
>> > > +
>> > > +        pkt->mbuf.l2_len = pkt->l3_ofs;
>> > > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
>> > > +        tcp_hdr->tcp_csum = 0;
>> > > +        if (is_ipv4) {
>> > > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr,
>> tcp_hdr);
>> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
>> > > +        } else {
>> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
>> > > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr,
>> tcp_hdr);
>> > > +        }
>> > > +    } else if (l4_proto == IPPROTO_UDP) {
>> > > +        struct udp_header *udp_hdr = (struct udp_header *)(data +
>> pkt-
>> > > >l4_ofs);
>> > > +        /* do not recalculate udp cksum if it was 0 */
>> > > +        if (udp_hdr->udp_csum != 0) {
>> > > +            pkt->mbuf.l2_len = pkt->l3_ofs;
>> > > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
>> > > +            udp_hdr->udp_csum = 0;
>> > > +            if (is_ipv4) {
>> > > +                /*do not calculate udp cksum if it was a fragment
>> IP*/
>> > > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
>> > > +                                      fragment_offset)) {
>> > > +                    return;
>> > > +                }
>> > > +
>> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
>> > > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr,
>> udp_hdr);
>> > > +            } else {
>> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
>> > > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr,
>> udp_hdr);
>> > > +            }
>> > > +        }
>> > > +    }
>> > > +}
>> > > +
>> > > +static inline void
>> > > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
>> > > +{
>> > > +    int i;
>> > > +
>> > > +    for (i = 0; i < pkt_cnt; i++) {
>> > > +        ovs_be16 dl_type;
>> > > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
>> > > +        const char *data = dp_packet_data(pkt);
>> > > +        void *l3hdr = (char *)(data + pkt->l3_ofs);
>> > > +
>> > > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
>> > > +            continue;
>> > > +        }
>> > > +        /* This take a assumption that it should be a vhost packet
>> if this
>> > > +         * packet was allocated by DPDK pool and try sending to
>> pNic. */
>> > > +        if (pkt->source == DPBUF_DPDK &&
>> > > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
>> > > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet
>> need
>> > cksum
>> > > +            continue;
>> > > +        }
>> > The comments here could be formatted better. Suggest combining both into
>> > one comment before the 'if'.
>> > Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.
>> >
>> > > +
>> > > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
>> > > +        if (dl_type == htons(ETH_TYPE_IP)) {
>> > > +            netdev_refill_l4_cksum(data, pkt,
>> > > +                                   ((struct ipv4_hdr
>> *)l3hdr)->next_proto_id,
>> > > +                                   true);
>> > > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>> > > +            netdev_refill_l4_cksum(data, pkt,
>> > > +                                   ((struct ipv6_hdr *)l3hdr)->proto,
>> > > +                                   false);
>> > > +        }
>> > > +    }
>> > > +}
>> > > +
>> > >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
>> ownership of
>> > >   * 'pkts', even in case of failure.
>> > >   *
>> > > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,
>> > > int qid,
>> > >          return;
>> > >      }
>> > >
>> > > +    netdev_prepare_tx_csum(batch->packets, batch->count);
>> >
>> > Putting this here assumes we only prepare the csum for vhost -> dpdk or
>> > vhost -> ring cases. What about vhost -> vhost?
>> >
>> > > +
>> > >      if (OVS_UNLIKELY(concurrent_txq)) {
>> > >          qid = qid % dev->up.n_txq;
>> > >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>> > > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
>> > >      if (ovsthread_once_start(&once)) {
>> > >          rte_vhost_driver_callback_register(&virtio_net_device_ops);
>> > >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
>> > > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
>> > > -                                  | 1ULL << VIRTIO_NET_F_CSUM);
>> > > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
>> > >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
>> > >
>> > >          ovsthread_once_done(&once);
>> > > --
>> > > 1.8.3.1
>> > >
>> > > _______________________________________________
>> > > dev mailing list
>> > > dev@openvswitch.org
>> > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>>
>

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ea17b97..d27d615 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -28,6 +28,7 @@ 
 #include <rte_errno.h>
 #include <rte_eth_ring.h>
 #include <rte_ethdev.h>
+#include <rte_ip.h>
 #include <rte_malloc.h>
 #include <rte_mbuf.h>
 #include <rte_meter.h>
@@ -1392,6 +1393,84 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+static inline void
+netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
+                       uint8_t l4_proto, bool is_ipv4)
+{
+    void *l3hdr = (void *)(data + pkt->l3_ofs);
+
+    if (l4_proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt->l4_ofs);
+
+        pkt->mbuf.l2_len = pkt->l3_ofs;
+        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
+        tcp_hdr->tcp_csum = 0;
+        if (is_ipv4) {
+            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
+            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
+        } else {
+            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
+            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
+        }
+    } else if (l4_proto == IPPROTO_UDP) {
+        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt->l4_ofs);
+        /* do not recalculate udp cksum if it was 0 */
+        if (udp_hdr->udp_csum != 0) {
+            pkt->mbuf.l2_len = pkt->l3_ofs;
+            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
+            udp_hdr->udp_csum = 0;
+            if (is_ipv4) {
+                /*do not calculate udp cksum if it was a fragment IP*/
+                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
+                                      fragment_offset)) {
+                    return;
+                }
+
+                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
+                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
+            } else {
+                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
+                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
+            }
+        }
+    }
+}
+
+static inline void
+netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
+{
+    int i;
+
+    for (i = 0; i < pkt_cnt; i++) {
+        ovs_be16 dl_type;
+        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
+        const char *data = dp_packet_data(pkt);
+        void *l3hdr = (char *)(data + pkt->l3_ofs);
+
+        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
+            continue;
+        }
+        /* This take a assumption that it should be a vhost packet if this
+         * packet was allocated by DPDK pool and try sending to pNic. */
+        if (pkt->source == DPBUF_DPDK &&
+            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
+            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need cksum
+            continue;
+        }
+
+        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
+        if (dl_type == htons(ETH_TYPE_IP)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,
+                                   true);
+        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv6_hdr *)l3hdr)->proto,
+                                   false);
+        }
+    }
+}
+
 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
  * 'pkts', even in case of failure.
  *
@@ -1833,6 +1912,8 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
         return;
     }
 
+    netdev_prepare_tx_csum(batch->packets, batch->count);
+
     if (OVS_UNLIKELY(concurrent_txq)) {
         qid = qid % dev->up.n_txq;
         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
@@ -2741,8 +2822,7 @@  netdev_dpdk_vhost_class_init(void)
     if (ovsthread_once_start(&once)) {
         rte_vhost_driver_callback_register(&virtio_net_device_ops);
         rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
-                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                  | 1ULL << VIRTIO_NET_F_CSUM);
+                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
         ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
 
         ovsthread_once_done(&once);
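
For reference, the value that rte_ipv4_udptcp_cksum()/rte_ipv6_udptcp_cksum()
write into tcp_csum/udp_csum above is the standard Internet checksum (RFC
1071): a 16-bit one's-complement sum taken over an L3 pseudo-header plus the
L4 header and payload.  A self-contained illustration of that arithmetic
(illustrative only, not the DPDK implementation):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* RFC 1071 one's-complement checksum over a buffer.  For TCP/UDP the
     * pseudo-header fields (addresses, protocol, L4 length) are summed in
     * the same way before the result is complemented. */
    static uint16_t
    csum_rfc1071(const void *buf, size_t len)
    {
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len > 1) {
            sum += (uint32_t) p[0] << 8 | p[1];   /* 16-bit big-endian words */
            p += 2;
            len -= 2;
        }
        if (len) {
            sum += (uint32_t) p[0] << 8;          /* odd trailing byte */
        }
        while (sum >> 16) {
            sum = (sum & 0xffff) + (sum >> 16);   /* fold the carries */
        }
        return (uint16_t) ~sum;
    }

    int
    main(void)
    {
        const uint8_t words[] = { 0x45, 0x00, 0x00, 0x1c };
        printf("0x%04x\n", csum_rfc1071(words, sizeof words));   /* 0xbae3 */
        return 0;
    }

Because the pseudo-header ties the L4 checksum to the IP addresses and
protocol, the helpers need the l3hdr pointer as well as the L4 header.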