[ovs-dev,v1] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side

Message ID 20170801142631.10652-1-sysugaozhenyu@gmail.com
State New
Headers show

Commit Message

Gao Zhenyu Aug. 1, 2017, 2:26 p.m.
Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
So L4 packets's cksum were calculated in VM side but performance is not
good.
Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
makes virtio-net frontend-driver support NETIF_F_SG as well

Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
---

Here is some performance number:

Setup:

 qperf client
+---------+
|   VM    |
+---------+
     |
     |                          qperf server
+--------------+              +------------+
| vswitch+dpdk |              | bare-metal |
+--------------+              +------------+
       |                            |
       |                            |
      pNic---------PhysicalSwitch----

do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx on' in VM side.
                      It offload cksum job to ovs-dpdk side.

do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off' in VM side.
                VM calculate cksum for tcp/udp packets.

We can see huge improvment in TCP throughput if we leverage ovs-dpdk cksum.

[root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01 tcp_bw tcp_lat udp_bw udp_lat
  do cksum in ovs-dpdk          do cksum in VM             without this patch
tcp_bw:
    bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95 MB/sec
tcp_bw:
    bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98 MB/sec
tcp_bw:
    bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19 MB/sec
tcp_bw:
    bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7 MB/sec
tcp_bw:
    bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7 MB/sec
tcp_bw:
    bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9 MB/sec
tcp_bw:
    bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1 MB/sec
tcp_bw: 
    bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec
tcp_bw:
    bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec
tcp_bw:
    bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec
tcp_bw:
    bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec
tcp_bw:
    bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec
tcp_bw:
    bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec
tcp_bw: 
    bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec
tcp_bw:
    bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec
tcp_bw:
    bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec
tcp_bw:
    bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec
tcp_lat:
    latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
tcp_lat:
    latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5 us
tcp_lat:
    latency  =  29 us         latency  =  29.3 us       latency  =  29.6 us
tcp_lat:
    latency  =  29 us         latency  =  29.4 us       latency  =  29.5 us
tcp_lat:
    latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
tcp_lat:
    latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7 us
tcp_lat:
    latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us
tcp_lat:
    latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2 us
tcp_lat:
    latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us
tcp_lat:
    latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2 us
tcp_lat:
    latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5 us
tcp_lat:
    latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6 us
tcp_lat:
     latency  =  47.6 us      latency  =  48 us         latency  =  48.1 us
tcp_lat:
    latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8 us
tcp_lat:
    latency  =  82.8 us       latency  =  82.3 us       latency  =  116 us
tcp_lat:
    latency  =  94.8 us       latency  =  94.2 us       latency  =  134 us
tcp_lat:
    latency  =  167 us        latency  =  197 us        latency  =  172 us
udp_bw:
    send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403 KB/sec
    recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400 KB/sec
udp_bw:
    send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810 KB/sec
    recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807 KB/sec
udp_bw:
    send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63 MB/sec
    recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63 MB/sec
udp_bw:
    send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26 MB/sec
    recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82 MB/sec
udp_bw:
    send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45 MB/sec
    recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45 MB/sec
udp_bw:
    send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13 MB/sec
    recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13 MB/sec
udp_bw:
    send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9 MB/sec
    recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7 MB/sec
udp_bw:
    send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52 MB/sec
    recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2 MB/sec
udp_bw:
    send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103 MB/sec
    recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100 MB/sec
udp_bw:
    send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205 MB/sec
    recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202 MB/sec
udp_bw:
    send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401 MB/sec
    recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358 MB/sec
udp_bw:
    send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557 MB/sec
    recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362 MB/sec
udp_bw:
    send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863 MB/sec
    recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584 MB/sec
udp_bw:
    send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08 GB/sec
    recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793 MB/sec
udp_bw: 
    send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19 GB/sec
    recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673 MB/sec
udp_bw:
    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec
    recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762 MB/sec
udp_bw:
    send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec
    recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec
udp_lat:
    latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4 us
udp_lat:
    latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7 us
udp_lat:
    latency  =  27 us         latency  =  26.7 us       latency  =  26.6 us
udp_lat:
    latency  =  27 us         latency  =  26.9 us       latency  =  26.7 us
udp_lat:
    latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3 us
udp_lat:
    latency  =  28.1 us       latency  =  28 us         latency  =  28 us
udp_lat:
     latency  =  29.4 us      latency  =  29.2 us       latency  =  29.2 us
udp_lat:
    latency  =  31 us         latency  =  31 us         latency  =  30.8 us
udp_lat:
    latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3 us
udp_lat:
    latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5 us
udp_lat:
    latency  =  64.9 us       latency  =  65 us         latency  =  65 us
udp_lat:
    latency  =  72.3 us       latency  =  72 us         latency  =  72 us
udp_lat:
    latency  =  121 us        latency  =  122 us        latency  =  122 us
udp_lat: 
    latency  =  0 ns          latency  =  0 ns          latency  =  0 ns


 lib/netdev-dpdk.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 2 deletions(-)

Comments

Ciara Loftus Aug. 4, 2017, 2:52 p.m. | #1
> 
> Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
> So L4 packets's cksum were calculated in VM side but performance is not
> good.
> Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
> makes virtio-net frontend-driver support NETIF_F_SG as well
> 
> Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> ---
> 
> Here is some performance number:
> 
> Setup:
> 
>  qperf client
> +---------+
> |   VM    |
> +---------+
>      |
>      |                          qperf server
> +--------------+              +------------+
> | vswitch+dpdk |              | bare-metal |
> +--------------+              +------------+
>        |                            |
>        |                            |
>       pNic---------PhysicalSwitch----
> 
> do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx on'
> in VM side.
>                       It offload cksum job to ovs-dpdk side.
> 
> do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off' in VM
> side.
>                 VM calculate cksum for tcp/udp packets.
> 
> We can see huge improvment in TCP throughput if we leverage ovs-dpdk
> cksum.

Hi Zhenyu,

Thanks for the patch. I tested some alternative use cases and unfortunately I see a degradation for phy-phy and phy-vm-phy topologies.
Here are my results:

phy-vm-phy:
without patch: 0.871Mpps
with patch (offload=on): 0.877Mpps
with patch (offload=off): 0.891Mpps

phy-phy:
without patch: 13.581Mpps
with patch: 13.055Mpps

The half a million pps drop for the second test case is concerning to me but not surprising since we're adding extra complexity to netdev_dpdk_send()
Could this be avoided? Would it make sense to put this functionality somewhere else eg. vhost receive?

On another note I have a general concern. I understand similar functionality is present in the DPDK vhost sample app. I wonder if it would be feasible for this to be implemented in the DPDK vhost library and leveraged here, rather than having two implementations in two separate code bases.

I have some other comments inline.

Thanks,
Ciara

> 
> [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01
> tcp_bw tcp_lat udp_bw udp_lat
>   do cksum in ovs-dpdk          do cksum in VM             without this patch
> tcp_bw:
>     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95 MB/sec
> tcp_bw:
>     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98 MB/sec
> tcp_bw:
>     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19 MB/sec
> tcp_bw:
>     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7 MB/sec
> tcp_bw:
>     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7 MB/sec
> tcp_bw:
>     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9 MB/sec
> tcp_bw:
>     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1 MB/sec
> tcp_bw:
>     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec
> tcp_bw:
>     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec
> tcp_bw:
>     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec
> tcp_bw:
>     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec
> tcp_bw:
>     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec
> tcp_bw:
>     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec
> tcp_bw:
>     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec
> tcp_bw:
>     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec
> tcp_bw:
>     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec
> tcp_bw:
>     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec
> tcp_lat:
>     latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
> tcp_lat:
>     latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5 us
> tcp_lat:
>     latency  =  29 us         latency  =  29.3 us       latency  =  29.6 us
> tcp_lat:
>     latency  =  29 us         latency  =  29.4 us       latency  =  29.5 us
> tcp_lat:
>     latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
> tcp_lat:
>     latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7 us
> tcp_lat:
>     latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us
> tcp_lat:
>     latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2 us
> tcp_lat:
>     latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us
> tcp_lat:
>     latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2 us
> tcp_lat:
>     latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5 us
> tcp_lat:
>     latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6 us
> tcp_lat:
>      latency  =  47.6 us      latency  =  48 us         latency  =  48.1 us
> tcp_lat:
>     latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8 us
> tcp_lat:
>     latency  =  82.8 us       latency  =  82.3 us       latency  =  116 us
> tcp_lat:
>     latency  =  94.8 us       latency  =  94.2 us       latency  =  134 us
> tcp_lat:
>     latency  =  167 us        latency  =  197 us        latency  =  172 us
> udp_bw:
>     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403 KB/sec
>     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400 KB/sec
> udp_bw:
>     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810 KB/sec
>     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807 KB/sec
> udp_bw:
>     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63
> MB/sec
>     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63
> MB/sec
> udp_bw:
>     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26
> MB/sec
>     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82
> MB/sec
> udp_bw:
>     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45
> MB/sec
>     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45
> MB/sec
> udp_bw:
>     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13
> MB/sec
>     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13 MB/sec
> udp_bw:
>     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9
> MB/sec
>     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7
> MB/sec
> udp_bw:
>     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52
> MB/sec
>     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2
> MB/sec
> udp_bw:
>     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103
> MB/sec
>     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100 MB/sec
> udp_bw:
>     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205
> MB/sec
>     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202 MB/sec
> udp_bw:
>     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401
> MB/sec
>     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358 MB/sec
> udp_bw:
>     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557
> MB/sec
>     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362 MB/sec
> udp_bw:
>     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863
> MB/sec
>     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584 MB/sec
> udp_bw:
>     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08
> GB/sec
>     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793
> MB/sec
> udp_bw:
>     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19
> GB/sec
>     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673
> MB/sec
> udp_bw:
>     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec
>     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762 MB/sec
> udp_bw:
>     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0
> bytes/sec
>     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec
> udp_lat:
>     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4 us
> udp_lat:
>     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3 us
> udp_lat:
>     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3 us
> udp_lat:
>     latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3 us
> udp_lat:
>     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7 us
> udp_lat:
>     latency  =  27 us         latency  =  26.7 us       latency  =  26.6 us
> udp_lat:
>     latency  =  27 us         latency  =  26.9 us       latency  =  26.7 us
> udp_lat:
>     latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3 us
> udp_lat:
>     latency  =  28.1 us       latency  =  28 us         latency  =  28 us
> udp_lat:
>      latency  =  29.4 us      latency  =  29.2 us       latency  =  29.2 us
> udp_lat:
>     latency  =  31 us         latency  =  31 us         latency  =  30.8 us
> udp_lat:
>     latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3 us
> udp_lat:
>     latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5 us
> udp_lat:
>     latency  =  64.9 us       latency  =  65 us         latency  =  65 us
> udp_lat:
>     latency  =  72.3 us       latency  =  72 us         latency  =  72 us
> udp_lat:
>     latency  =  121 us        latency  =  122 us        latency  =  122 us
> udp_lat:
>     latency  =  0 ns          latency  =  0 ns          latency  =  0 ns
> 
> 
>  lib/netdev-dpdk.c | 84
> +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 82 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index ea17b97..d27d615 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -28,6 +28,7 @@
>  #include <rte_errno.h>
>  #include <rte_eth_ring.h>
>  #include <rte_ethdev.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
> @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> *rxq)
>      rte_free(rx);
>  }
> 
> +static inline void
> +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
> +                       uint8_t l4_proto, bool is_ipv4)
> +{
> +    void *l3hdr = (void *)(data + pkt->l3_ofs);
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt->l4_ofs);
> +
> +        pkt->mbuf.l2_len = pkt->l3_ofs;
> +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> +        tcp_hdr->tcp_csum = 0;
> +        if (is_ipv4) {
> +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
> +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
> +        } else {
> +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
> +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
> +        }
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt-
> >l4_ofs);
> +        /* do not recalculate udp cksum if it was 0 */
> +        if (udp_hdr->udp_csum != 0) {
> +            pkt->mbuf.l2_len = pkt->l3_ofs;
> +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> +            udp_hdr->udp_csum = 0;
> +            if (is_ipv4) {
> +                /*do not calculate udp cksum if it was a fragment IP*/
> +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> +                                      fragment_offset)) {
> +                    return;
> +                }
> +
> +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
> +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
> +            } else {
> +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
> +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
> +            }
> +        }
> +    }
> +}
> +
> +static inline void
> +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> +{
> +    int i;
> +
> +    for (i = 0; i < pkt_cnt; i++) {
> +        ovs_be16 dl_type;
> +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> +        const char *data = dp_packet_data(pkt);
> +        void *l3hdr = (char *)(data + pkt->l3_ofs);
> +
> +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
> +            continue;
> +        }
> +        /* This take a assumption that it should be a vhost packet if this
> +         * packet was allocated by DPDK pool and try sending to pNic. */
> +        if (pkt->source == DPBUF_DPDK &&
> +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need cksum
> +            continue;
> +        }

The comments here could be formatted better. Suggest combining both into one comment before the 'if'.
Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.

> +
> +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
> +        if (dl_type == htons(ETH_TYPE_IP)) {
> +            netdev_refill_l4_cksum(data, pkt,
> +                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,
> +                                   true);
> +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> +            netdev_refill_l4_cksum(data, pkt,
> +                                   ((struct ipv6_hdr *)l3hdr)->proto,
> +                                   false);
> +        }
> +    }
> +}
> +
>  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
>   * 'pkts', even in case of failure.
>   *
> @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,
> int qid,
>          return;
>      }
> 
> +    netdev_prepare_tx_csum(batch->packets, batch->count);

Putting this here assumes we only prepare the csum for vhost -> dpdk or vhost -> ring cases. What about vhost -> vhost?

> +
>      if (OVS_UNLIKELY(concurrent_txq)) {
>          qid = qid % dev->up.n_txq;
>          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
>      if (ovsthread_once_start(&once)) {
>          rte_vhost_driver_callback_register(&virtio_net_device_ops);
>          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                                  | 1ULL << VIRTIO_NET_F_CSUM);
> +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
>          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> 
>          ovsthread_once_done(&once);
> --
> 1.8.3.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Gao Zhenyu Aug. 4, 2017, 11:57 p.m. | #2
Hi Loftus,

Thanks for testing and the comments!
Can you show more details about your phy-vm-phy,phy-phy setup and testing
steps? Then I can reproduce it to see if I can solve this pps problem.

BTW, how about throughput, did you saw improvment?

I would like to implement vhost->vhost part.

Thanks
Zhenyu Gao

2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> >
> > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
> > So L4 packets's cksum were calculated in VM side but performance is not
> > good.
> > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
> > makes virtio-net frontend-driver support NETIF_F_SG as well
> >
> > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> > ---
> >
> > Here is some performance number:
> >
> > Setup:
> >
> >  qperf client
> > +---------+
> > |   VM    |
> > +---------+
> >      |
> >      |                          qperf server
> > +--------------+              +------------+
> > | vswitch+dpdk |              | bare-metal |
> > +--------------+              +------------+
> >        |                            |
> >        |                            |
> >       pNic---------PhysicalSwitch----
> >
> > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx
> on'
> > in VM side.
> >                       It offload cksum job to ovs-dpdk side.
> >
> > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off'
> in VM
> > side.
> >                 VM calculate cksum for tcp/udp packets.
> >
> > We can see huge improvment in TCP throughput if we leverage ovs-dpdk
> > cksum.
>
> Hi Zhenyu,
>
> Thanks for the patch. I tested some alternative use cases and
> unfortunately I see a degradation for phy-phy and phy-vm-phy topologies.
> Here are my results:
>
> phy-vm-phy:
> without patch: 0.871Mpps
> with patch (offload=on): 0.877Mpps
> with patch (offload=off): 0.891Mpps
>
> phy-phy:
> without patch: 13.581Mpps
> with patch: 13.055Mpps
>
> The half a million pps drop for the second test case is concerning to me
> but not surprising since we're adding extra complexity to netdev_dpdk_send()
> Could this be avoided? Would it make sense to put this functionality
> somewhere else eg. vhost receive?
>
> On another note I have a general concern. I understand similar
> functionality is present in the DPDK vhost sample app. I wonder if it would
> be feasible for this to be implemented in the DPDK vhost library and
> leveraged here, rather than having two implementations in two separate code
> bases.
>
> I have some other comments inline.
>
> Thanks,
> Ciara
>
> >
> > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2
> host-qperf-server01
> > tcp_bw tcp_lat udp_bw udp_lat
> >   do cksum in ovs-dpdk          do cksum in VM             without this
> patch
> > tcp_bw:
> >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95
> MB/sec
> > tcp_bw:
> >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98
> MB/sec
> > tcp_bw:
> >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19
> MB/sec
> > tcp_bw:
> >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7
> MB/sec
> > tcp_bw:
> >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7
> MB/sec
> > tcp_bw:
> >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9
> MB/sec
> > tcp_bw:
> >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1
> MB/sec
> > tcp_bw:
> >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec
> > tcp_bw:
> >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec
> > tcp_bw:
> >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec
> > tcp_bw:
> >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec
> > tcp_bw:
> >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec
> > tcp_bw:
> >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec
> > tcp_bw:
> >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec
> > tcp_bw:
> >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec
> > tcp_bw:
> >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec
> > tcp_bw:
> >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec
> > tcp_lat:
> >     latency  =  29 us         latency  =  29.2 us       latency  =  29.6
> us
> > tcp_lat:
> >     latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5
> us
> > tcp_lat:
> >     latency  =  29 us         latency  =  29.3 us       latency  =  29.6
> us
> > tcp_lat:
> >     latency  =  29 us         latency  =  29.4 us       latency  =  29.5
> us
> > tcp_lat:
> >     latency  =  29 us         latency  =  29.2 us       latency  =  29.6
> us
> > tcp_lat:
> >     latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7
> us
> > tcp_lat:
> >     latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us
> > tcp_lat:
> >     latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2
> us
> > tcp_lat:
> >     latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us
> > tcp_lat:
> >     latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2
> us
> > tcp_lat:
> >     latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5
> us
> > tcp_lat:
> >     latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6
> us
> > tcp_lat:
> >      latency  =  47.6 us      latency  =  48 us         latency  =  48.1
> us
> > tcp_lat:
> >     latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8
> us
> > tcp_lat:
> >     latency  =  82.8 us       latency  =  82.3 us       latency  =  116
> us
> > tcp_lat:
> >     latency  =  94.8 us       latency  =  94.2 us       latency  =  134
> us
> > tcp_lat:
> >     latency  =  167 us        latency  =  197 us        latency  =  172
> us
> > udp_bw:
> >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403
> KB/sec
> >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400
> KB/sec
> > udp_bw:
> >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810
> KB/sec
> >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807
> KB/sec
> > udp_bw:
> >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63
> > MB/sec
> >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63
> > MB/sec
> > udp_bw:
> >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26
> > MB/sec
> >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82
> > MB/sec
> > udp_bw:
> >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45
> > MB/sec
> >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45
> > MB/sec
> > udp_bw:
> >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13
> > MB/sec
> >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13
> MB/sec
> > udp_bw:
> >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9
> > MB/sec
> >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7
> > MB/sec
> > udp_bw:
> >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52
> > MB/sec
> >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2
> > MB/sec
> > udp_bw:
> >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103
> > MB/sec
> >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100
> MB/sec
> > udp_bw:
> >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205
> > MB/sec
> >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202
> MB/sec
> > udp_bw:
> >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401
> > MB/sec
> >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358
> MB/sec
> > udp_bw:
> >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557
> > MB/sec
> >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362
> MB/sec
> > udp_bw:
> >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863
> > MB/sec
> >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584
> MB/sec
> > udp_bw:
> >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08
> > GB/sec
> >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793
> > MB/sec
> > udp_bw:
> >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19
> > GB/sec
> >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673
> > MB/sec
> > udp_bw:
> >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3
> GB/sec
> >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762
> MB/sec
> > udp_bw:
> >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0
> > bytes/sec
> >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0
> bytes/sec
> > udp_lat:
> >     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4
> us
> > udp_lat:
> >     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3
> us
> > udp_lat:
> >     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3
> us
> > udp_lat:
> >     latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3
> us
> > udp_lat:
> >     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7
> us
> > udp_lat:
> >     latency  =  27 us         latency  =  26.7 us       latency  =  26.6
> us
> > udp_lat:
> >     latency  =  27 us         latency  =  26.9 us       latency  =  26.7
> us
> > udp_lat:
> >     latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3
> us
> > udp_lat:
> >     latency  =  28.1 us       latency  =  28 us         latency  =  28 us
> > udp_lat:
> >      latency  =  29.4 us      latency  =  29.2 us       latency  =  29.2
> us
> > udp_lat:
> >     latency  =  31 us         latency  =  31 us         latency  =  30.8
> us
> > udp_lat:
> >     latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3
> us
> > udp_lat:
> >     latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5
> us
> > udp_lat:
> >     latency  =  64.9 us       latency  =  65 us         latency  =  65 us
> > udp_lat:
> >     latency  =  72.3 us       latency  =  72 us         latency  =  72 us
> > udp_lat:
> >     latency  =  121 us        latency  =  122 us        latency  =  122
> us
> > udp_lat:
> >     latency  =  0 ns          latency  =  0 ns          latency  =  0 ns
> >
> >
> >  lib/netdev-dpdk.c | 84
> > +++++++++++++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 82 insertions(+), 2 deletions(-)
> >
> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > index ea17b97..d27d615 100644
> > --- a/lib/netdev-dpdk.c
> > +++ b/lib/netdev-dpdk.c
> > @@ -28,6 +28,7 @@
> >  #include <rte_errno.h>
> >  #include <rte_eth_ring.h>
> >  #include <rte_ethdev.h>
> > +#include <rte_ip.h>
> >  #include <rte_malloc.h>
> >  #include <rte_mbuf.h>
> >  #include <rte_meter.h>
> > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> > *rxq)
> >      rte_free(rx);
> >  }
> >
> > +static inline void
> > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
> > +                       uint8_t l4_proto, bool is_ipv4)
> > +{
> > +    void *l3hdr = (void *)(data + pkt->l3_ofs);
> > +
> > +    if (l4_proto == IPPROTO_TCP) {
> > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
> pkt->l4_ofs);
> > +
> > +        pkt->mbuf.l2_len = pkt->l3_ofs;
> > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > +        tcp_hdr->tcp_csum = 0;
> > +        if (is_ipv4) {
> > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
> > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
> > +        } else {
> > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
> > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
> > +        }
> > +    } else if (l4_proto == IPPROTO_UDP) {
> > +        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt-
> > >l4_ofs);
> > +        /* do not recalculate udp cksum if it was 0 */
> > +        if (udp_hdr->udp_csum != 0) {
> > +            pkt->mbuf.l2_len = pkt->l3_ofs;
> > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > +            udp_hdr->udp_csum = 0;
> > +            if (is_ipv4) {
> > +                /*do not calculate udp cksum if it was a fragment IP*/
> > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> > +                                      fragment_offset)) {
> > +                    return;
> > +                }
> > +
> > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
> > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr,
> udp_hdr);
> > +            } else {
> > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
> > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr,
> udp_hdr);
> > +            }
> > +        }
> > +    }
> > +}
> > +
> > +static inline void
> > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> > +{
> > +    int i;
> > +
> > +    for (i = 0; i < pkt_cnt; i++) {
> > +        ovs_be16 dl_type;
> > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> > +        const char *data = dp_packet_data(pkt);
> > +        void *l3hdr = (char *)(data + pkt->l3_ofs);
> > +
> > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
> > +            continue;
> > +        }
> > +        /* This take a assumption that it should be a vhost packet if
> this
> > +         * packet was allocated by DPDK pool and try sending to pNic. */
> > +        if (pkt->source == DPBUF_DPDK &&
> > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need
> cksum
> > +            continue;
> > +        }
>
> The comments here could be formatted better. Suggest combining both into
> one comment before the 'if'.
> Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.
>
> > +
> > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
> > +        if (dl_type == htons(ETH_TYPE_IP)) {
> > +            netdev_refill_l4_cksum(data, pkt,
> > +                                   ((struct ipv4_hdr
> *)l3hdr)->next_proto_id,
> > +                                   true);
> > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> > +            netdev_refill_l4_cksum(data, pkt,
> > +                                   ((struct ipv6_hdr *)l3hdr)->proto,
> > +                                   false);
> > +        }
> > +    }
> > +}
> > +
> >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
> ownership of
> >   * 'pkts', even in case of failure.
> >   *
> > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,
> > int qid,
> >          return;
> >      }
> >
> > +    netdev_prepare_tx_csum(batch->packets, batch->count);
>
> Putting this here assumes we only prepare the csum for vhost -> dpdk or
> vhost -> ring cases. What about vhost -> vhost?
>
> > +
> >      if (OVS_UNLIKELY(concurrent_txq)) {
> >          qid = qid % dev->up.n_txq;
> >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
> >      if (ovsthread_once_start(&once)) {
> >          rte_vhost_driver_callback_register(&virtio_net_device_ops);
> >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
> > -                                  | 1ULL << VIRTIO_NET_F_CSUM);
> > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
> >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> >
> >          ovsthread_once_done(&once);
> > --
> > 1.8.3.1
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Ciara Loftus Aug. 8, 2017, 9:53 a.m. | #3
> 

> Hi Loftus,

> 

> Thanks for testing and the comments!

> Can you show more details about your phy-vm-phy,phy-phy setup and

> testing steps? Then I can reproduce it to see if I can solve this pps problem.


You're welcome. I forgot to mention my tests were with 64B packets.

For phy-phy the setup is a single host with 2 dpdk physical ports and 1 flow rule port1 -> port2.
See figure 3 here: https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-opnfv-04#section-4

For the phy-vm-phy the setup is a single host with 2 dpdk physical ports and 2 vhostuser ports with flow rules:
Dpdk1 -> vhost 1 & vhost2 -> dpdk2
IP rules are set up in the VM to route packets from vhost1 to vhost 2.
See figure 4 in the link above.

> 

> BTW, how about throughput, did you saw improvment?


By throughput if you mean 0% packet loss, I did not test this.

Thanks,
Ciara

> 

> I would like to implement vhost->vhost part.

> 

> Thanks

> Zhenyu Gao

> 

> 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> >

> > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.

> > So L4 packets's cksum were calculated in VM side but performance is not

> > good.

> > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and

> > makes virtio-net frontend-driver support NETIF_F_SG as well

> >

> > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>

> > ---

> >

> > Here is some performance number:

> >

> > Setup:

> >

> >  qperf client

> > +---------+

> > |   VM    |

> > +---------+

> >      |

> >      |                          qperf server

> > +--------------+              +------------+

> > | vswitch+dpdk |              | bare-metal |

> > +--------------+              +------------+

> >        |                            |

> >        |                            |

> >       pNic---------PhysicalSwitch----

> >

> > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx

> on'

> > in VM side.

> >                       It offload cksum job to ovs-dpdk side.

> >

> > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off' in

> VM

> > side.

> >                 VM calculate cksum for tcp/udp packets.

> >

> > We can see huge improvment in TCP throughput if we leverage ovs-dpdk

> > cksum.

> Hi Zhenyu,

> 

> Thanks for the patch. I tested some alternative use cases and unfortunately I

> see a degradation for phy-phy and phy-vm-phy topologies.

> Here are my results:

> 

> phy-vm-phy:

> without patch: 0.871Mpps

> with patch (offload=on): 0.877Mpps

> with patch (offload=off): 0.891Mpps

> 

> phy-phy:

> without patch: 13.581Mpps

> with patch: 13.055Mpps

> 

> The half a million pps drop for the second test case is concerning to me but

> not surprising since we're adding extra complexity to netdev_dpdk_send()

> Could this be avoided? Would it make sense to put this functionality

> somewhere else eg. vhost receive?

> 

> On another note I have a general concern. I understand similar functionality

> is present in the DPDK vhost sample app. I wonder if it would be feasible for

> this to be implemented in the DPDK vhost library and leveraged here, rather

> than having two implementations in two separate code bases.

> 

> I have some other comments inline.

> 

> Thanks,

> Ciara

> 

> >

> > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01

> > tcp_bw tcp_lat udp_bw udp_lat

> >   do cksum in ovs-dpdk          do cksum in VM             without this patch

> > tcp_bw:

> >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95 MB/sec

> > tcp_bw:

> >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98 MB/sec

> > tcp_bw:

> >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19 MB/sec

> > tcp_bw:

> >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7 MB/sec

> > tcp_bw:

> >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7 MB/sec

> > tcp_bw:

> >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9 MB/sec

> > tcp_bw:

> >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1 MB/sec

> > tcp_bw:

> >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec

> > tcp_bw:

> >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec

> > tcp_bw:

> >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec

> > tcp_bw:

> >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec

> > tcp_bw:

> >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec

> > tcp_bw:

> >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec

> > tcp_bw:

> >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec

> > tcp_bw:

> >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec

> > tcp_bw:

> >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec

> > tcp_bw:

> >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec

> > tcp_lat:

> >     latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us

> > tcp_lat:

> >     latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5 us

> > tcp_lat:

> >     latency  =  29 us         latency  =  29.3 us       latency  =  29.6 us

> > tcp_lat:

> >     latency  =  29 us         latency  =  29.4 us       latency  =  29.5 us

> > tcp_lat:

> >     latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us

> > tcp_lat:

> >     latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7 us

> > tcp_lat:

> >     latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us

> > tcp_lat:

> >     latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2 us

> > tcp_lat:

> >     latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us

> > tcp_lat:

> >     latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2 us

> > tcp_lat:

> >     latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5 us

> > tcp_lat:

> >     latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6 us

> > tcp_lat:

> >      latency  =  47.6 us      latency  =  48 us         latency  =  48.1 us

> > tcp_lat:

> >     latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8 us

> > tcp_lat:

> >     latency  =  82.8 us       latency  =  82.3 us       latency  =  116 us

> > tcp_lat:

> >     latency  =  94.8 us       latency  =  94.2 us       latency  =  134 us

> > tcp_lat:

> >     latency  =  167 us        latency  =  197 us        latency  =  172 us

> > udp_bw:

> >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403

> KB/sec

> >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400 KB/sec

> > udp_bw:

> >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810

> KB/sec

> >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807 KB/sec

> > udp_bw:

> >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63

> > MB/sec

> >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63

> > MB/sec

> > udp_bw:

> >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26

> > MB/sec

> >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82

> > MB/sec

> > udp_bw:

> >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45

> > MB/sec

> >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45

> > MB/sec

> > udp_bw:

> >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13

> > MB/sec

> >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13

> MB/sec

> > udp_bw:

> >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9

> > MB/sec

> >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7

> > MB/sec

> > udp_bw:

> >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52

> > MB/sec

> >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2

> > MB/sec

> > udp_bw:

> >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103

> > MB/sec

> >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100

> MB/sec

> > udp_bw:

> >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205

> > MB/sec

> >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202

> MB/sec

> > udp_bw:

> >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401

> > MB/sec

> >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358

> MB/sec

> > udp_bw:

> >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557

> > MB/sec

> >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362

> MB/sec

> > udp_bw:

> >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863

> > MB/sec

> >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584

> MB/sec

> > udp_bw:

> >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08

> > GB/sec

> >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793

> > MB/sec

> > udp_bw:

> >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19

> > GB/sec

> >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673

> > MB/sec

> > udp_bw:

> >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3

> GB/sec

> >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762

> MB/sec

> > udp_bw:

> >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0

> > bytes/sec

> >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4 us

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3 us

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3 us

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3 us

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7 us

> > udp_lat:

> >     latency  =  27 us         latency  =  26.7 us       latency  =  26.6 us

> > udp_lat:

> >     latency  =  27 us         latency  =  26.9 us       latency  =  26.7 us

> > udp_lat:

> >     latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3 us

> > udp_lat:

> >     latency  =  28.1 us       latency  =  28 us         latency  =  28 us

> > udp_lat:

> >      latency  =  29.4 us      latency  =  29.2 us       latency  =  29.2 us

> > udp_lat:

> >     latency  =  31 us         latency  =  31 us         latency  =  30.8 us

> > udp_lat:

> >     latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3 us

> > udp_lat:

> >     latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5 us

> > udp_lat:

> >     latency  =  64.9 us       latency  =  65 us         latency  =  65 us

> > udp_lat:

> >     latency  =  72.3 us       latency  =  72 us         latency  =  72 us

> > udp_lat:

> >     latency  =  121 us        latency  =  122 us        latency  =  122 us

> > udp_lat:

> >     latency  =  0 ns          latency  =  0 ns          latency  =  0 ns

> >

> >

> >  lib/netdev-dpdk.c | 84

> > +++++++++++++++++++++++++++++++++++++++++++++++++++++--

> >  1 file changed, 82 insertions(+), 2 deletions(-)

> >

> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c

> > index ea17b97..d27d615 100644

> > --- a/lib/netdev-dpdk.c

> > +++ b/lib/netdev-dpdk.c

> > @@ -28,6 +28,7 @@

> >  #include <rte_errno.h>

> >  #include <rte_eth_ring.h>

> >  #include <rte_ethdev.h>

> > +#include <rte_ip.h>

> >  #include <rte_malloc.h>

> >  #include <rte_mbuf.h>

> >  #include <rte_meter.h>

> > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq

> > *rxq)

> >      rte_free(rx);

> >  }

> >

> > +static inline void

> > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,

> > +                       uint8_t l4_proto, bool is_ipv4)

> > +{

> > +    void *l3hdr = (void *)(data + pkt->l3_ofs);

> > +

> > +    if (l4_proto == IPPROTO_TCP) {

> > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt-

> >l4_ofs);

> > +

> > +        pkt->mbuf.l2_len = pkt->l3_ofs;

> > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;

> > +        tcp_hdr->tcp_csum = 0;

> > +        if (is_ipv4) {

> > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);

> > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;

> > +        } else {

> > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;

> > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);

> > +        }

> > +    } else if (l4_proto == IPPROTO_UDP) {

> > +        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt-

> > >l4_ofs);

> > +        /* do not recalculate udp cksum if it was 0 */

> > +        if (udp_hdr->udp_csum != 0) {

> > +            pkt->mbuf.l2_len = pkt->l3_ofs;

> > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;

> > +            udp_hdr->udp_csum = 0;

> > +            if (is_ipv4) {

> > +                /*do not calculate udp cksum if it was a fragment IP*/

> > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->

> > +                                      fragment_offset)) {

> > +                    return;

> > +                }

> > +

> > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;

> > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);

> > +            } else {

> > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;

> > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);

> > +            }

> > +        }

> > +    }

> > +}

> > +

> > +static inline void

> > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)

> > +{

> > +    int i;

> > +

> > +    for (i = 0; i < pkt_cnt; i++) {

> > +        ovs_be16 dl_type;

> > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];

> > +        const char *data = dp_packet_data(pkt);

> > +        void *l3hdr = (char *)(data + pkt->l3_ofs);

> > +

> > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {

> > +            continue;

> > +        }

> > +        /* This take a assumption that it should be a vhost packet if this

> > +         * packet was allocated by DPDK pool and try sending to pNic. */

> > +        if (pkt->source == DPBUF_DPDK &&

> > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {

> > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need

> cksum

> > +            continue;

> > +        }

> The comments here could be formatted better. Suggest combining both into

> one comment before the 'if'.

> Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.

> 

> > +

> > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);

> > +        if (dl_type == htons(ETH_TYPE_IP)) {

> > +            netdev_refill_l4_cksum(data, pkt,

> > +                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,

> > +                                   true);

> > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {

> > +            netdev_refill_l4_cksum(data, pkt,

> > +                                   ((struct ipv6_hdr *)l3hdr)->proto,

> > +                                   false);

> > +        }

> > +    }

> > +}

> > +

> >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of

> >   * 'pkts', even in case of failure.

> >   *

> > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,

> > int qid,

> >          return;

> >      }

> >

> > +    netdev_prepare_tx_csum(batch->packets, batch->count);

> 

> Putting this here assumes we only prepare the csum for vhost -> dpdk or

> vhost -> ring cases. What about vhost -> vhost?

> 

> > +

> >      if (OVS_UNLIKELY(concurrent_txq)) {

> >          qid = qid % dev->up.n_txq;

> >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)

> >      if (ovsthread_once_start(&once)) {

> >          rte_vhost_driver_callback_register(&virtio_net_device_ops);

> >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6

> > -                                  | 1ULL << VIRTIO_NET_F_CSUM);

> > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);

> >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);

> >

> >          ovsthread_once_done(&once);

> > --

> > 1.8.3.1

> >

> > _______________________________________________

> > dev mailing list

> > dev@openvswitch.org

> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Gao Zhenyu Aug. 10, 2017, 4:35 a.m. | #4
I see, for flows in phy-phy setup, they should not be calculate cksum.
I will revise my patch to do the cksum for vhost port only. I will send a
new patch next week.

Thanks
Zhenyu Gao

2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> >
> > Hi Loftus,
> >
> > Thanks for testing and the comments!
> > Can you show more details about your phy-vm-phy,phy-phy setup and
> > testing steps? Then I can reproduce it to see if I can solve this pps
> problem.
>
> You're welcome. I forgot to mention my tests were with 64B packets.
>
> For phy-phy the setup is a single host with 2 dpdk physical ports and 1
> flow rule port1 -> port2.
> See figure 3 here: https://tools.ietf.org/html/
> draft-ietf-bmwg-vswitch-opnfv-04#section-4
>
> For the phy-vm-phy the setup is a single host with 2 dpdk physical ports
> and 2 vhostuser ports with flow rules:
> Dpdk1 -> vhost 1 & vhost2 -> dpdk2
> IP rules are set up in the VM to route packets from vhost1 to vhost 2.
> See figure 4 in the link above.
>
> >
> > BTW, how about throughput, did you saw improvment?
>
> By throughput if you mean 0% packet loss, I did not test this.
>
> Thanks,
> Ciara
>
> >
> > I would like to implement vhost->vhost part.
> >
> > Thanks
> > Zhenyu Gao
> >
> > 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
> > >
> > > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
> > > So L4 packets's cksum were calculated in VM side but performance is not
> > > good.
> > > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
> > > makes virtio-net frontend-driver support NETIF_F_SG as well
> > >
> > > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> > > ---
> > >
> > > Here is some performance number:
> > >
> > > Setup:
> > >
> > >  qperf client
> > > +---------+
> > > |   VM    |
> > > +---------+
> > >      |
> > >      |                          qperf server
> > > +--------------+              +------------+
> > > | vswitch+dpdk |              | bare-metal |
> > > +--------------+              +------------+
> > >        |                            |
> > >        |                            |
> > >       pNic---------PhysicalSwitch----
> > >
> > > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0
> tx
> > on'
> > > in VM side.
> > >                       It offload cksum job to ovs-dpdk side.
> > >
> > > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx
> off' in
> > VM
> > > side.
> > >                 VM calculate cksum for tcp/udp packets.
> > >
> > > We can see huge improvment in TCP throughput if we leverage ovs-dpdk
> > > cksum.
> > Hi Zhenyu,
> >
> > Thanks for the patch. I tested some alternative use cases and
> unfortunately I
> > see a degradation for phy-phy and phy-vm-phy topologies.
> > Here are my results:
> >
> > phy-vm-phy:
> > without patch: 0.871Mpps
> > with patch (offload=on): 0.877Mpps
> > with patch (offload=off): 0.891Mpps
> >
> > phy-phy:
> > without patch: 13.581Mpps
> > with patch: 13.055Mpps
> >
> > The half a million pps drop for the second test case is concerning to me
> but
> > not surprising since we're adding extra complexity to netdev_dpdk_send()
> > Could this be avoided? Would it make sense to put this functionality
> > somewhere else eg. vhost receive?
> >
> > On another note I have a general concern. I understand similar
> functionality
> > is present in the DPDK vhost sample app. I wonder if it would be
> feasible for
> > this to be implemented in the DPDK vhost library and leveraged here,
> rather
> > than having two implementations in two separate code bases.
> >
> > I have some other comments inline.
> >
> > Thanks,
> > Ciara
> >
> > >
> > > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2
> host-qperf-server01
> > > tcp_bw tcp_lat udp_bw udp_lat
> > >   do cksum in ovs-dpdk          do cksum in VM             without
> this patch
> > > tcp_bw:
> > >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95
> MB/sec
> > > tcp_bw:
> > >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98
> MB/sec
> > > tcp_bw:
> > >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19
> MB/sec
> > > tcp_bw:
> > >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7
> MB/sec
> > > tcp_bw:
> > >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7
> MB/sec
> > > tcp_bw:
> > >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9
> MB/sec
> > > tcp_bw:
> > >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1
> MB/sec
> > > tcp_bw:
> > >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149
> MB/sec
> > > tcp_bw:
> > >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216
> MB/sec
> > > tcp_bw:
> > >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275
> MB/sec
> > > tcp_bw:
> > >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321
> MB/sec
> > > tcp_bw:
> > >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361
> MB/sec
> > > tcp_bw:
> > >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419
> MB/sec
> > > tcp_bw:
> > >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457
> MB/sec
> > > tcp_bw:
> > >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480
> MB/sec
> > > tcp_bw:
> > >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488
> MB/sec
> > > tcp_bw:
> > >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483
> MB/sec
> > > tcp_lat:
> > >     latency  =  29 us         latency  =  29.2 us       latency  =
> 29.6 us
> > > tcp_lat:
> > >     latency  =  28.9 us       latency  =  29.3 us       latency  =
> 29.5 us
> > > tcp_lat:
> > >     latency  =  29 us         latency  =  29.3 us       latency  =
> 29.6 us
> > > tcp_lat:
> > >     latency  =  29 us         latency  =  29.4 us       latency  =
> 29.5 us
> > > tcp_lat:
> > >     latency  =  29 us         latency  =  29.2 us       latency  =
> 29.6 us
> > > tcp_lat:
> > >     latency  =  29.1 us       latency  =  29.3 us       latency  =
> 29.7 us
> > > tcp_lat:
> > >     latency  =  29.4 us       latency  =  29.6 us       latency  =  30
> us
> > > tcp_lat:
> > >     latency  =  29.8 us       latency  =  30.1 us       latency  =
> 30.2 us
> > > tcp_lat:
> > >     latency  =  30.9 us       latency  =  30.9 us       latency  =  31
> us
> > > tcp_lat:
> > >     latency  =  46.9 us       latency  =  46.2 us       latency  =
> 32.2 us
> > > tcp_lat:
> > >     latency  =  51.5 us       latency  =  52.6 us       latency  =
> 34.5 us
> > > tcp_lat:
> > >     latency  =  43.9 us       latency  =  43.8 us       latency  =
> 43.6 us
> > > tcp_lat:
> > >      latency  =  47.6 us      latency  =  48 us         latency  =
> 48.1 us
> > > tcp_lat:
> > >     latency  =  77.7 us       latency  =  78.8 us       latency  =
> 78.8 us
> > > tcp_lat:
> > >     latency  =  82.8 us       latency  =  82.3 us       latency  =
> 116 us
> > > tcp_lat:
> > >     latency  =  94.8 us       latency  =  94.2 us       latency  =
> 134 us
> > > tcp_lat:
> > >     latency  =  167 us        latency  =  197 us        latency  =
> 172 us
> > > udp_bw:
> > >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403
> > KB/sec
> > >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =
> 400 KB/sec
> > > udp_bw:
> > >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810
> > KB/sec
> > >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =
> 807 KB/sec
> > > udp_bw:
> > >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =
> 1.63
> > > MB/sec
> > >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =
> 1.63
> > > MB/sec
> > > udp_bw:
> > >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =
> 3.26
> > > MB/sec
> > >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =
> 2.82
> > > MB/sec
> > > udp_bw:
> > >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =
> 6.45
> > > MB/sec
> > >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =
> 6.45
> > > MB/sec
> > > udp_bw:
> > >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13
> > > MB/sec
> > >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13
> > MB/sec
> > > udp_bw:
> > >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =
> 25.9
> > > MB/sec
> > >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =
> 25.7
> > > MB/sec
> > > udp_bw:
> > >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =
> 52
> > > MB/sec
> > >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =
> 51.2
> > > MB/sec
> > > udp_bw:
> > >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103
> > > MB/sec
> > >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100
> > MB/sec
> > > udp_bw:
> > >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205
> > > MB/sec
> > >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202
> > MB/sec
> > > udp_bw:
> > >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401
> > > MB/sec
> > >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358
> > MB/sec
> > > udp_bw:
> > >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557
> > > MB/sec
> > >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362
> > MB/sec
> > > udp_bw:
> > >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863
> > > MB/sec
> > >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584
> > MB/sec
> > > udp_bw:
> > >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =
> 1.08
> > > GB/sec
> > >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =
>  793
> > > MB/sec
> > > udp_bw:
> > >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =
> 1.19
> > > GB/sec
> > >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =
>  673
> > > MB/sec
> > > udp_bw:
> > >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3
> > GB/sec
> > >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762
> > MB/sec
> > > udp_bw:
> > >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0
> > > bytes/sec
> > >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0
> bytes/sec
> > > udp_lat:
> > >     latency  =  26.7 us       latency  =  26.5 us       latency  =
> 26.4 us
> > > udp_lat:
> > >     latency  =  26.7 us       latency  =  26.5 us       latency  =
> 26.3 us
> > > udp_lat:
> > >     latency  =  26.7 us       latency  =  26.7 us       latency  =
> 26.3 us
> > > udp_lat:
> > >     latency  =  26.7 us       latency  =  26.6 us       latency  =
> 26.3 us
> > > udp_lat:
> > >     latency  =  26.7 us       latency  =  26.7 us       latency  =
> 26.7 us
> > > udp_lat:
> > >     latency  =  27 us         latency  =  26.7 us       latency  =
> 26.6 us
> > > udp_lat:
> > >     latency  =  27 us         latency  =  26.9 us       latency  =
> 26.7 us
> > > udp_lat:
> > >     latency  =  27.6 us       latency  =  27.4 us       latency  =
> 27.3 us
> > > udp_lat:
> > >     latency  =  28.1 us       latency  =  28 us         latency  =  28
> us
> > > udp_lat:
> > >      latency  =  29.4 us      latency  =  29.2 us       latency  =
> 29.2 us
> > > udp_lat:
> > >     latency  =  31 us         latency  =  31 us         latency  =
> 30.8 us
> > > udp_lat:
> > >     latency  =  41.4 us       latency  =  41.4 us       latency  =
> 41.3 us
> > > udp_lat:
> > >     latency  =  41.6 us       latency  =  41.5 us       latency  =
> 41.5 us
> > > udp_lat:
> > >     latency  =  64.9 us       latency  =  65 us         latency  =  65
> us
> > > udp_lat:
> > >     latency  =  72.3 us       latency  =  72 us         latency  =  72
> us
> > > udp_lat:
> > >     latency  =  121 us        latency  =  122 us        latency  =
> 122 us
> > > udp_lat:
> > >     latency  =  0 ns          latency  =  0 ns          latency  =  0
> ns
> > >
> > >
> > >  lib/netdev-dpdk.c | 84
> > > +++++++++++++++++++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 82 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > > index ea17b97..d27d615 100644
> > > --- a/lib/netdev-dpdk.c
> > > +++ b/lib/netdev-dpdk.c
> > > @@ -28,6 +28,7 @@
> > >  #include <rte_errno.h>
> > >  #include <rte_eth_ring.h>
> > >  #include <rte_ethdev.h>
> > > +#include <rte_ip.h>
> > >  #include <rte_malloc.h>
> > >  #include <rte_mbuf.h>
> > >  #include <rte_meter.h>
> > > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> > > *rxq)
> > >      rte_free(rx);
> > >  }
> > >
> > > +static inline void
> > > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
> > > +                       uint8_t l4_proto, bool is_ipv4)
> > > +{
> > > +    void *l3hdr = (void *)(data + pkt->l3_ofs);
> > > +
> > > +    if (l4_proto == IPPROTO_TCP) {
> > > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt-
> > >l4_ofs);
> > > +
> > > +        pkt->mbuf.l2_len = pkt->l3_ofs;
> > > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > > +        tcp_hdr->tcp_csum = 0;
> > > +        if (is_ipv4) {
> > > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
> > > +        } else {
> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
> > > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
> > > +        }
> > > +    } else if (l4_proto == IPPROTO_UDP) {
> > > +        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt-
> > > >l4_ofs);
> > > +        /* do not recalculate udp cksum if it was 0 */
> > > +        if (udp_hdr->udp_csum != 0) {
> > > +            pkt->mbuf.l2_len = pkt->l3_ofs;
> > > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > > +            udp_hdr->udp_csum = 0;
> > > +            if (is_ipv4) {
> > > +                /*do not calculate udp cksum if it was a fragment IP*/
> > > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> > > +                                      fragment_offset)) {
> > > +                    return;
> > > +                }
> > > +
> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
> > > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr,
> udp_hdr);
> > > +            } else {
> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
> > > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr,
> udp_hdr);
> > > +            }
> > > +        }
> > > +    }
> > > +}
> > > +
> > > +static inline void
> > > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> > > +{
> > > +    int i;
> > > +
> > > +    for (i = 0; i < pkt_cnt; i++) {
> > > +        ovs_be16 dl_type;
> > > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> > > +        const char *data = dp_packet_data(pkt);
> > > +        void *l3hdr = (char *)(data + pkt->l3_ofs);
> > > +
> > > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
> > > +            continue;
> > > +        }
> > > +        /* This take a assumption that it should be a vhost packet if
> this
> > > +         * packet was allocated by DPDK pool and try sending to pNic.
> */
> > > +        if (pkt->source == DPBUF_DPDK &&
> > > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> > > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need
> > cksum
> > > +            continue;
> > > +        }
> > The comments here could be formatted better. Suggest combining both into
> > one comment before the 'if'.
> > Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.
> >
> > > +
> > > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
> > > +        if (dl_type == htons(ETH_TYPE_IP)) {
> > > +            netdev_refill_l4_cksum(data, pkt,
> > > +                                   ((struct ipv4_hdr
> *)l3hdr)->next_proto_id,
> > > +                                   true);
> > > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> > > +            netdev_refill_l4_cksum(data, pkt,
> > > +                                   ((struct ipv6_hdr *)l3hdr)->proto,
> > > +                                   false);
> > > +        }
> > > +    }
> > > +}
> > > +
> > >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
> ownership of
> > >   * 'pkts', even in case of failure.
> > >   *
> > > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,
> > > int qid,
> > >          return;
> > >      }
> > >
> > > +    netdev_prepare_tx_csum(batch->packets, batch->count);
> >
> > Putting this here assumes we only prepare the csum for vhost -> dpdk or
> > vhost -> ring cases. What about vhost -> vhost?
> >
> > > +
> > >      if (OVS_UNLIKELY(concurrent_txq)) {
> > >          qid = qid % dev->up.n_txq;
> > >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> > > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
> > >      if (ovsthread_once_start(&once)) {
> > >          rte_vhost_driver_callback_register(&virtio_net_device_ops);
> > >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> > > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
> > > -                                  | 1ULL << VIRTIO_NET_F_CSUM);
> > > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
> > >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> > >
> > >          ovsthread_once_done(&once);
> > > --
> > > 1.8.3.1
> > >
> > > _______________________________________________
> > > dev mailing list
> > > dev@openvswitch.org
> > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Gao Zhenyu Aug. 16, 2017, 1:38 p.m. | #5
Hi Loftus,

   I had submitted a new version, please see
https://patchwork.ozlabs.org/patch/802070/
   It move the cksum to vhost receive side.

Thanks
Zhenyu Gao

2017-08-10 12:35 GMT+08:00 Gao Zhenyu <sysugaozhenyu@gmail.com>:

> I see, for flows in phy-phy setup, they should not be calculate cksum.
> I will revise my patch to do the cksum for vhost port only. I will send a
> new patch next week.
>
> Thanks
> Zhenyu Gao
>
> 2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
>
>> >
>> > Hi Loftus,
>> >
>> > Thanks for testing and the comments!
>> > Can you show more details about your phy-vm-phy,phy-phy setup and
>> > testing steps? Then I can reproduce it to see if I can solve this pps
>> problem.
>>
>> You're welcome. I forgot to mention my tests were with 64B packets.
>>
>> For phy-phy the setup is a single host with 2 dpdk physical ports and 1
>> flow rule port1 -> port2.
>> See figure 3 here: https://tools.ietf.org/html/dr
>> aft-ietf-bmwg-vswitch-opnfv-04#section-4
>>
>> For the phy-vm-phy the setup is a single host with 2 dpdk physical ports
>> and 2 vhostuser ports with flow rules:
>> Dpdk1 -> vhost 1 & vhost2 -> dpdk2
>> IP rules are set up in the VM to route packets from vhost1 to vhost 2.
>> See figure 4 in the link above.
>>
>> >
>> > BTW, how about throughput, did you saw improvment?
>>
>> By throughput if you mean 0% packet loss, I did not test this.
>>
>> Thanks,
>> Ciara
>>
>> >
>> > I would like to implement vhost->vhost part.
>> >
>> > Thanks
>> > Zhenyu Gao
>> >
>> > 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
>> > >
>> > > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx
>> cksum.
>> > > So L4 packets's cksum were calculated in VM side but performance is
>> not
>> > > good.
>> > > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
>> > > makes virtio-net frontend-driver support NETIF_F_SG as well
>> > >
>> > > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
>> > > ---
>> > >
>> > > Here is some performance number:
>> > >
>> > > Setup:
>> > >
>> > >  qperf client
>> > > +---------+
>> > > |   VM    |
>> > > +---------+
>> > >      |
>> > >      |                          qperf server
>> > > +--------------+              +------------+
>> > > | vswitch+dpdk |              | bare-metal |
>> > > +--------------+              +------------+
>> > >        |                            |
>> > >        |                            |
>> > >       pNic---------PhysicalSwitch----
>> > >
>> > > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0
>> tx
>> > on'
>> > > in VM side.
>> > >                       It offload cksum job to ovs-dpdk side.
>> > >
>> > > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx
>> off' in
>> > VM
>> > > side.
>> > >                 VM calculate cksum for tcp/udp packets.
>> > >
>> > > We can see huge improvment in TCP throughput if we leverage ovs-dpdk
>> > > cksum.
>> > Hi Zhenyu,
>> >
>> > Thanks for the patch. I tested some alternative use cases and
>> unfortunately I
>> > see a degradation for phy-phy and phy-vm-phy topologies.
>> > Here are my results:
>> >
>> > phy-vm-phy:
>> > without patch: 0.871Mpps
>> > with patch (offload=on): 0.877Mpps
>> > with patch (offload=off): 0.891Mpps
>> >
>> > phy-phy:
>> > without patch: 13.581Mpps
>> > with patch: 13.055Mpps
>> >
>> > The half a million pps drop for the second test case is concerning to
>> me but
>> > not surprising since we're adding extra complexity to netdev_dpdk_send()
>> > Could this be avoided? Would it make sense to put this functionality
>> > somewhere else eg. vhost receive?
>> >
>> > On another note I have a general concern. I understand similar
>> functionality
>> > is present in the DPDK vhost sample app. I wonder if it would be
>> feasible for
>> > this to be implemented in the DPDK vhost library and leveraged here,
>> rather
>> > than having two implementations in two separate code bases.
>> >
>> > I have some other comments inline.
>> >
>> > Thanks,
>> > Ciara
>> >
>> > >
>> > > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2
>> host-qperf-server01
>> > > tcp_bw tcp_lat udp_bw udp_lat
>> > >   do cksum in ovs-dpdk          do cksum in VM             without
>> this patch
>> > > tcp_bw:
>> > >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488
>> MB/sec
>> > > tcp_bw:
>> > >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483
>> MB/sec
>> > > tcp_lat:
>> > >     latency  =  29 us         latency  =  29.2 us       latency  =
>> 29.6 us
>> > > tcp_lat:
>> > >     latency  =  28.9 us       latency  =  29.3 us       latency  =
>> 29.5 us
>> > > tcp_lat:
>> > >     latency  =  29 us         latency  =  29.3 us       latency  =
>> 29.6 us
>> > > tcp_lat:
>> > >     latency  =  29 us         latency  =  29.4 us       latency  =
>> 29.5 us
>> > > tcp_lat:
>> > >     latency  =  29 us         latency  =  29.2 us       latency  =
>> 29.6 us
>> > > tcp_lat:
>> > >     latency  =  29.1 us       latency  =  29.3 us       latency  =
>> 29.7 us
>> > > tcp_lat:
>> > >     latency  =  29.4 us       latency  =  29.6 us       latency  =
>> 30 us
>> > > tcp_lat:
>> > >     latency  =  29.8 us       latency  =  30.1 us       latency  =
>> 30.2 us
>> > > tcp_lat:
>> > >     latency  =  30.9 us       latency  =  30.9 us       latency  =
>> 31 us
>> > > tcp_lat:
>> > >     latency  =  46.9 us       latency  =  46.2 us       latency  =
>> 32.2 us
>> > > tcp_lat:
>> > >     latency  =  51.5 us       latency  =  52.6 us       latency  =
>> 34.5 us
>> > > tcp_lat:
>> > >     latency  =  43.9 us       latency  =  43.8 us       latency  =
>> 43.6 us
>> > > tcp_lat:
>> > >      latency  =  47.6 us      latency  =  48 us         latency  =
>> 48.1 us
>> > > tcp_lat:
>> > >     latency  =  77.7 us       latency  =  78.8 us       latency  =
>> 78.8 us
>> > > tcp_lat:
>> > >     latency  =  82.8 us       latency  =  82.3 us       latency  =
>> 116 us
>> > > tcp_lat:
>> > >     latency  =  94.8 us       latency  =  94.2 us       latency  =
>> 134 us
>> > > tcp_lat:
>> > >     latency  =  167 us        latency  =  197 us        latency  =
>> 172 us
>> > > udp_bw:
>> > >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =
>> 403
>> > KB/sec
>> > >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =
>> 400 KB/sec
>> > > udp_bw:
>> > >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =
>> 810
>> > KB/sec
>> > >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =
>> 807 KB/sec
>> > > udp_bw:
>> > >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =
>> 1.63
>> > > MB/sec
>> > >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =
>> 1.63
>> > > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =
>> 3.26
>> > > MB/sec
>> > >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =
>> 2.82
>> > > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =
>> 6.45
>> > > MB/sec
>> > >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =
>> 6.45
>> > > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13
>> > > MB/sec
>> > >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13
>> > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =
>> 25.9
>> > > MB/sec
>> > >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =
>> 25.7
>> > > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =
>> 52
>> > > MB/sec
>> > >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =
>> 51.2
>> > > MB/sec
>> > > udp_bw:
>> > >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =
>> 103
>> > > MB/sec
>> > >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =
>> 100
>> > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =
>> 205
>> > > MB/sec
>> > >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =
>> 202
>> > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =
>> 401
>> > > MB/sec
>> > >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =
>> 358
>> > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =
>> 557
>> > > MB/sec
>> > >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =
>> 362
>> > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =
>> 863
>> > > MB/sec
>> > >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =
>> 584
>> > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =
>> 1.08
>> > > GB/sec
>> > >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =
>>  793
>> > > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =
>> 1.19
>> > > GB/sec
>> > >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =
>>  673
>> > > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =
>> 1.3
>> > GB/sec
>> > >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =
>> 762
>> > MB/sec
>> > > udp_bw:
>> > >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0
>> > > bytes/sec
>> > >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0
>> bytes/sec
>> > > udp_lat:
>> > >     latency  =  26.7 us       latency  =  26.5 us       latency  =
>> 26.4 us
>> > > udp_lat:
>> > >     latency  =  26.7 us       latency  =  26.5 us       latency  =
>> 26.3 us
>> > > udp_lat:
>> > >     latency  =  26.7 us       latency  =  26.7 us       latency  =
>> 26.3 us
>> > > udp_lat:
>> > >     latency  =  26.7 us       latency  =  26.6 us       latency  =
>> 26.3 us
>> > > udp_lat:
>> > >     latency  =  26.7 us       latency  =  26.7 us       latency  =
>> 26.7 us
>> > > udp_lat:
>> > >     latency  =  27 us         latency  =  26.7 us       latency  =
>> 26.6 us
>> > > udp_lat:
>> > >     latency  =  27 us         latency  =  26.9 us       latency  =
>> 26.7 us
>> > > udp_lat:
>> > >     latency  =  27.6 us       latency  =  27.4 us       latency  =
>> 27.3 us
>> > > udp_lat:
>> > >     latency  =  28.1 us       latency  =  28 us         latency  =
>> 28 us
>> > > udp_lat:
>> > >      latency  =  29.4 us      latency  =  29.2 us       latency  =
>> 29.2 us
>> > > udp_lat:
>> > >     latency  =  31 us         latency  =  31 us         latency  =
>> 30.8 us
>> > > udp_lat:
>> > >     latency  =  41.4 us       latency  =  41.4 us       latency  =
>> 41.3 us
>> > > udp_lat:
>> > >     latency  =  41.6 us       latency  =  41.5 us       latency  =
>> 41.5 us
>> > > udp_lat:
>> > >     latency  =  64.9 us       latency  =  65 us         latency  =
>> 65 us
>> > > udp_lat:
>> > >     latency  =  72.3 us       latency  =  72 us         latency  =
>> 72 us
>> > > udp_lat:
>> > >     latency  =  121 us        latency  =  122 us        latency  =
>> 122 us
>> > > udp_lat:
>> > >     latency  =  0 ns          latency  =  0 ns          latency  =  0
>> ns
>> > >
>> > >
>> > >  lib/netdev-dpdk.c | 84
>> > > +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>> > >  1 file changed, 82 insertions(+), 2 deletions(-)
>> > >
>> > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> > > index ea17b97..d27d615 100644
>> > > --- a/lib/netdev-dpdk.c
>> > > +++ b/lib/netdev-dpdk.c
>> > > @@ -28,6 +28,7 @@
>> > >  #include <rte_errno.h>
>> > >  #include <rte_eth_ring.h>
>> > >  #include <rte_ethdev.h>
>> > > +#include <rte_ip.h>
>> > >  #include <rte_malloc.h>
>> > >  #include <rte_mbuf.h>
>> > >  #include <rte_meter.h>
>> > > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
>> > > *rxq)
>> > >      rte_free(rx);
>> > >  }
>> > >
>> > > +static inline void
>> > > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
>> > > +                       uint8_t l4_proto, bool is_ipv4)
>> > > +{
>> > > +    void *l3hdr = (void *)(data + pkt->l3_ofs);
>> > > +
>> > > +    if (l4_proto == IPPROTO_TCP) {
>> > > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
>> pkt-
>> > >l4_ofs);
>> > > +
>> > > +        pkt->mbuf.l2_len = pkt->l3_ofs;
>> > > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
>> > > +        tcp_hdr->tcp_csum = 0;
>> > > +        if (is_ipv4) {
>> > > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr,
>> tcp_hdr);
>> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
>> > > +        } else {
>> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
>> > > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr,
>> tcp_hdr);
>> > > +        }
>> > > +    } else if (l4_proto == IPPROTO_UDP) {
>> > > +        struct udp_header *udp_hdr = (struct udp_header *)(data +
>> pkt-
>> > > >l4_ofs);
>> > > +        /* do not recalculate udp cksum if it was 0 */
>> > > +        if (udp_hdr->udp_csum != 0) {
>> > > +            pkt->mbuf.l2_len = pkt->l3_ofs;
>> > > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
>> > > +            udp_hdr->udp_csum = 0;
>> > > +            if (is_ipv4) {
>> > > +                /*do not calculate udp cksum if it was a fragment
>> IP*/
>> > > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
>> > > +                                      fragment_offset)) {
>> > > +                    return;
>> > > +                }
>> > > +
>> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
>> > > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr,
>> udp_hdr);
>> > > +            } else {
>> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
>> > > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr,
>> udp_hdr);
>> > > +            }
>> > > +        }
>> > > +    }
>> > > +}
>> > > +
>> > > +static inline void
>> > > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
>> > > +{
>> > > +    int i;
>> > > +
>> > > +    for (i = 0; i < pkt_cnt; i++) {
>> > > +        ovs_be16 dl_type;
>> > > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
>> > > +        const char *data = dp_packet_data(pkt);
>> > > +        void *l3hdr = (char *)(data + pkt->l3_ofs);
>> > > +
>> > > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
>> > > +            continue;
>> > > +        }
>> > > +        /* This take a assumption that it should be a vhost packet
>> if this
>> > > +         * packet was allocated by DPDK pool and try sending to
>> pNic. */
>> > > +        if (pkt->source == DPBUF_DPDK &&
>> > > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
>> > > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet
>> need
>> > cksum
>> > > +            continue;
>> > > +        }
>> > The comments here could be formatted better. Suggest combining both into
>> > one comment before the 'if'.
>> > Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.
>> >
>> > > +
>> > > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
>> > > +        if (dl_type == htons(ETH_TYPE_IP)) {
>> > > +            netdev_refill_l4_cksum(data, pkt,
>> > > +                                   ((struct ipv4_hdr
>> *)l3hdr)->next_proto_id,
>> > > +                                   true);
>> > > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>> > > +            netdev_refill_l4_cksum(data, pkt,
>> > > +                                   ((struct ipv6_hdr *)l3hdr)->proto,
>> > > +                                   false);
>> > > +        }
>> > > +    }
>> > > +}
>> > > +
>> > >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
>> ownership of
>> > >   * 'pkts', even in case of failure.
>> > >   *
>> > > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,
>> > > int qid,
>> > >          return;
>> > >      }
>> > >
>> > > +    netdev_prepare_tx_csum(batch->packets, batch->count);
>> >
>> > Putting this here assumes we only prepare the csum for vhost -> dpdk or
>> > vhost -> ring cases. What about vhost -> vhost?
>> >
>> > > +
>> > >      if (OVS_UNLIKELY(concurrent_txq)) {
>> > >          qid = qid % dev->up.n_txq;
>> > >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>> > > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
>> > >      if (ovsthread_once_start(&once)) {
>> > >          rte_vhost_driver_callback_register(&virtio_net_device_ops);
>> > >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
>> > > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
>> > > -                                  | 1ULL << VIRTIO_NET_F_CSUM);
>> > > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
>> > >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
>> > >
>> > >          ovsthread_once_done(&once);
>> > > --
>> > > 1.8.3.1
>> > >
>> > > _______________________________________________
>> > > dev mailing list
>> > > dev@openvswitch.org
>> > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>>
>
Darrell Ball Aug. 16, 2017, 5:48 p.m. | #6
Hi Ciara

You had a general concern below; can we conclude on that before going further ?

Thanks Darrell

“
> On another note I have a general concern. I understand similar functionality

> is present in the DPDK vhost sample app. I wonder if it would be feasible for

> this to be implemented in the DPDK vhost library and leveraged here, rather

> than having two implementations in two separate code bases.

>

> I have some other comments inline.

>

> Thanks,

> Ciara



From: Gao Zhenyu <sysugaozhenyu@gmail.com>

Date: Wednesday, August 16, 2017 at 6:38 AM
To: "Loftus, Ciara" <ciara.loftus@intel.com>
Cc: "blp@ovn.org" <blp@ovn.org>, "Chandran, Sugesh" <sugesh.chandran@intel.com>, "ktraynor@redhat.com" <ktraynor@redhat.com>, Darrell Ball <dball@vmware.com>, "dev@openvswitch.org" <dev@openvswitch.org>
Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side

Hi Loftus,
   I had submitted a new version, please see https://patchwork.ozlabs.org/patch/802070/<https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_patch_802070_&d=DwMFaQ&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=CE8K7Pnx6aVmgYFTMrsCLCL8dLA6RjD_jGh5KNtWRvA&s=iBg71oKi5oXmrpna96jYdQhts7WkTJPTLFYuBkI2j1c&e=>
   It move the cksum to vhost receive side.
Thanks
Zhenyu Gao

2017-08-10 12:35 GMT+08:00 Gao Zhenyu <sysugaozhenyu@gmail.com<mailto:sysugaozhenyu@gmail.com>>:
I see, for flows in phy-phy setup, they should not be calculate cksum.
I will revise my patch to do the cksum for vhost port only. I will send a new patch next week.

Thanks
Zhenyu Gao

2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com<mailto:ciara.loftus@intel.com>>:
>

> Hi Loftus,

>

> Thanks for testing and the comments!

> Can you show more details about your phy-vm-phy,phy-phy setup and

> testing steps? Then I can reproduce it to see if I can solve this pps problem.


You're welcome. I forgot to mention my tests were with 64B packets.

For phy-phy the setup is a single host with 2 dpdk physical ports and 1 flow rule port1 -> port2.
See figure 3 here: https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-opnfv-04#section-4<https://urldefense.proofpoint.com/v2/url?u=https-3A__tools.ietf.org_html_draft-2Dietf-2Dbmwg-2Dvswitch-2Dopnfv-2D04-23section-2D4&d=DwMFaQ&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=CE8K7Pnx6aVmgYFTMrsCLCL8dLA6RjD_jGh5KNtWRvA&s=I_yHZtRxUTnwJK7DOezdioeIoAn6dlev6BSCxDtKzwc&e=>

For the phy-vm-phy the setup is a single host with 2 dpdk physical ports and 2 vhostuser ports with flow rules:
Dpdk1 -> vhost 1 & vhost2 -> dpdk2
IP rules are set up in the VM to route packets from vhost1 to vhost 2.
See figure 4 in the link above.

>

> BTW, how about throughput, did you saw improvment?


By throughput if you mean 0% packet loss, I did not test this.

Thanks,
Ciara

>

> I would like to implement vhost->vhost part.

>

> Thanks

> Zhenyu Gao

>

> 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com<mailto:ciara.loftus@intel.com>>:

> >

> > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.

> > So L4 packets's cksum were calculated in VM side but performance is not

> > good.

> > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and

> > makes virtio-net frontend-driver support NETIF_F_SG as well

> >

> > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com<mailto:sysugaozhenyu@gmail.com>>

> > ---

> >

> > Here is some performance number:

> >

> > Setup:

> >

> >  qperf client

> > +---------+

> > |   VM    |

> > +---------+

> >      |

> >      |                          qperf server

> > +--------------+              +------------+

> > | vswitch+dpdk |              | bare-metal |

> > +--------------+              +------------+

> >        |                            |

> >        |                            |

> >       pNic---------PhysicalSwitch----

> >

> > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx

> on'

> > in VM side.

> >                       It offload cksum job to ovs-dpdk side.

> >

> > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off' in

> VM

> > side.

> >                 VM calculate cksum for tcp/udp packets.

> >

> > We can see huge improvment in TCP throughput if we leverage ovs-dpdk

> > cksum.

> Hi Zhenyu,

>

> Thanks for the patch. I tested some alternative use cases and unfortunately I

> see a degradation for phy-phy and phy-vm-phy topologies.

> Here are my results:

>

> phy-vm-phy:

> without patch: 0.871Mpps

> with patch (offload=on): 0.877Mpps

> with patch (offload=off): 0.891Mpps

>

> phy-phy:

> without patch: 13.581Mpps

> with patch: 13.055Mpps

>

> The half a million pps drop for the second test case is concerning to me but

> not surprising since we're adding extra complexity to netdev_dpdk_send()

> Could this be avoided? Would it make sense to put this functionality

> somewhere else eg. vhost receive?

>

> On another note I have a general concern. I understand similar functionality

> is present in the DPDK vhost sample app. I wonder if it would be feasible for

> this to be implemented in the DPDK vhost library and leveraged here, rather

> than having two implementations in two separate code bases.

>

> I have some other comments inline.

>

> Thanks,

> Ciara

>

> >

> > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01

> > tcp_bw tcp_lat udp_bw udp_lat

> >   do cksum in ovs-dpdk          do cksum in VM             without this patch

> > tcp_bw:

> >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95 MB/sec

> > tcp_bw:

> >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98 MB/sec

> > tcp_bw:

> >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19 MB/sec

> > tcp_bw:

> >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7 MB/sec

> > tcp_bw:

> >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7 MB/sec

> > tcp_bw:

> >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9 MB/sec

> > tcp_bw:

> >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1 MB/sec

> > tcp_bw:

> >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec

> > tcp_bw:

> >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec

> > tcp_bw:

> >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec

> > tcp_bw:

> >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec

> > tcp_bw:

> >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec

> > tcp_bw:

> >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec

> > tcp_bw:

> >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec

> > tcp_bw:

> >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec

> > tcp_bw:

> >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec

> > tcp_bw:

> >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec

> > tcp_lat:

> >     latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us

> > tcp_lat:

> >     latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5 us

> > tcp_lat:

> >     latency  =  29 us         latency  =  29.3 us       latency  =  29.6 us

> > tcp_lat:

> >     latency  =  29 us         latency  =  29.4 us       latency  =  29.5 us

> > tcp_lat:

> >     latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us

> > tcp_lat:

> >     latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7 us

> > tcp_lat:

> >     latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us

> > tcp_lat:

> >     latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2 us

> > tcp_lat:

> >     latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us

> > tcp_lat:

> >     latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2 us

> > tcp_lat:

> >     latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5 us

> > tcp_lat:

> >     latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6 us

> > tcp_lat:

> >      latency  =  47.6 us      latency  =  48 us         latency  =  48.1 us

> > tcp_lat:

> >     latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8 us

> > tcp_lat:

> >     latency  =  82.8 us       latency  =  82.3 us       latency  =  116 us

> > tcp_lat:

> >     latency  =  94.8 us       latency  =  94.2 us       latency  =  134 us

> > tcp_lat:

> >     latency  =  167 us        latency  =  197 us        latency  =  172 us

> > udp_bw:

> >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403

> KB/sec

> >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400 KB/sec

> > udp_bw:

> >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810

> KB/sec

> >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807 KB/sec

> > udp_bw:

> >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63

> > MB/sec

> >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63

> > MB/sec

> > udp_bw:

> >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26

> > MB/sec

> >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82

> > MB/sec

> > udp_bw:

> >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45

> > MB/sec

> >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45

> > MB/sec

> > udp_bw:

> >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13

> > MB/sec

> >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13

> MB/sec

> > udp_bw:

> >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9

> > MB/sec

> >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7

> > MB/sec

> > udp_bw:

> >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52

> > MB/sec

> >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2

> > MB/sec

> > udp_bw:

> >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103

> > MB/sec

> >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100

> MB/sec

> > udp_bw:

> >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205

> > MB/sec

> >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202

> MB/sec

> > udp_bw:

> >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401

> > MB/sec

> >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358

> MB/sec

> > udp_bw:

> >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557

> > MB/sec

> >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362

> MB/sec

> > udp_bw:

> >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863

> > MB/sec

> >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584

> MB/sec

> > udp_bw:

> >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08

> > GB/sec

> >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793

> > MB/sec

> > udp_bw:

> >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19

> > GB/sec

> >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673

> > MB/sec

> > udp_bw:

> >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3

> GB/sec

> >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762

> MB/sec

> > udp_bw:

> >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0

> > bytes/sec

> >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4 us

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3 us

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3 us

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3 us

> > udp_lat:

> >     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7 us

> > udp_lat:

> >     latency  =  27 us         latency  =  26.7 us       latency  =  26.6 us

> > udp_lat:

> >     latency  =  27 us         latency  =  26.9 us       latency  =  26.7 us

> > udp_lat:

> >     latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3 us

> > udp_lat:

> >     latency  =  28.1 us       latency  =  28 us         latency  =  28 us

> > udp_lat:

> >      latency  =  29.4 us      latency  =  29.2 us       latency  =  29.2 us

> > udp_lat:

> >     latency  =  31 us         latency  =  31 us         latency  =  30.8 us

> > udp_lat:

> >     latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3 us

> > udp_lat:

> >     latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5 us

> > udp_lat:

> >     latency  =  64.9 us       latency  =  65 us         latency  =  65 us

> > udp_lat:

> >     latency  =  72.3 us       latency  =  72 us         latency  =  72 us

> > udp_lat:

> >     latency  =  121 us        latency  =  122 us        latency  =  122 us

> > udp_lat:

> >     latency  =  0 ns          latency  =  0 ns          latency  =  0 ns

> >

> >

> >  lib/netdev-dpdk.c | 84

> > +++++++++++++++++++++++++++++++++++++++++++++++++++++--

> >  1 file changed, 82 insertions(+), 2 deletions(-)

> >

> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c

> > index ea17b97..d27d615 100644

> > --- a/lib/netdev-dpdk.c

> > +++ b/lib/netdev-dpdk.c

> > @@ -28,6 +28,7 @@

> >  #include <rte_errno.h>

> >  #include <rte_eth_ring.h>

> >  #include <rte_ethdev.h>

> > +#include <rte_ip.h>

> >  #include <rte_malloc.h>

> >  #include <rte_mbuf.h>

> >  #include <rte_meter.h>

> > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq

> > *rxq)

> >      rte_free(rx);

> >  }

> >

> > +static inline void

> > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,

> > +                       uint8_t l4_proto, bool is_ipv4)

> > +{

> > +    void *l3hdr = (void *)(data + pkt->l3_ofs);

> > +

> > +    if (l4_proto == IPPROTO_TCP) {

> > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt-

> >l4_ofs);

> > +

> > +        pkt->mbuf.l2_len = pkt->l3_ofs;

> > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;

> > +        tcp_hdr->tcp_csum = 0;

> > +        if (is_ipv4) {

> > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);

> > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;

> > +        } else {

> > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;

> > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);

> > +        }

> > +    } else if (l4_proto == IPPROTO_UDP) {

> > +        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt-

> > >l4_ofs);

> > +        /* do not recalculate udp cksum if it was 0 */

> > +        if (udp_hdr->udp_csum != 0) {

> > +            pkt->mbuf.l2_len = pkt->l3_ofs;

> > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;

> > +            udp_hdr->udp_csum = 0;

> > +            if (is_ipv4) {

> > +                /*do not calculate udp cksum if it was a fragment IP*/

> > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->

> > +                                      fragment_offset)) {

> > +                    return;

> > +                }

> > +

> > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;

> > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);

> > +            } else {

> > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;

> > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);

> > +            }

> > +        }

> > +    }

> > +}

> > +

> > +static inline void

> > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)

> > +{

> > +    int i;

> > +

> > +    for (i = 0; i < pkt_cnt; i++) {

> > +        ovs_be16 dl_type;

> > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];

> > +        const char *data = dp_packet_data(pkt);

> > +        void *l3hdr = (char *)(data + pkt->l3_ofs);

> > +

> > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {

> > +            continue;

> > +        }

> > +        /* This take a assumption that it should be a vhost packet if this

> > +         * packet was allocated by DPDK pool and try sending to pNic. */

> > +        if (pkt->source == DPBUF_DPDK &&

> > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {

> > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need

> cksum

> > +            continue;

> > +        }

> The comments here could be formatted better. Suggest combining both into

> one comment before the 'if'.

> Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.

>

> > +

> > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);

> > +        if (dl_type == htons(ETH_TYPE_IP)) {

> > +            netdev_refill_l4_cksum(data, pkt,

> > +                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,

> > +                                   true);

> > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {

> > +            netdev_refill_l4_cksum(data, pkt,

> > +                                   ((struct ipv6_hdr *)l3hdr)->proto,

> > +                                   false);

> > +        }

> > +    }

> > +}

> > +

> >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of

> >   * 'pkts', even in case of failure.

> >   *

> > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,

> > int qid,

> >          return;

> >      }

> >

> > +    netdev_prepare_tx_csum(batch->packets, batch->count);

>

> Putting this here assumes we only prepare the csum for vhost -> dpdk or

> vhost -> ring cases. What about vhost -> vhost?

>

> > +

> >      if (OVS_UNLIKELY(concurrent_txq)) {

> >          qid = qid % dev->up.n_txq;

> >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)

> >      if (ovsthread_once_start(&once)) {

> >          rte_vhost_driver_callback_register(&virtio_net_device_ops);

> >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6

> > -                                  | 1ULL << VIRTIO_NET_F_CSUM);

> > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);

> >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);

> >

> >          ovsthread_once_done(&once);

> > --

> > 1.8.3.1

> >

> > _______________________________________________

> > dev mailing list

> > dev@openvswitch.org<mailto:dev@openvswitch.org>

> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev<https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwMFaQ&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=CE8K7Pnx6aVmgYFTMrsCLCL8dLA6RjD_jGh5KNtWRvA&s=ugbDu5YHH06r-nzgT1CXCqNv2lFMHh9wG9Q6okXK5uQ&e=>
Ciara Loftus Aug. 23, 2017, 1:59 p.m. | #7
> 

> Hi Ciara

> 

> You had a general concern below; can we conclude on that before going

> further ?

> 

> Thanks Darrell

> 

> “

> > On another note I have a general concern. I understand similar functionality

> > is present in the DPDK vhost sample app. I wonder if it would be feasible

> for

> > this to be implemented in the DPDK vhost library and leveraged here,

> rather

> > than having two implementations in two separate code bases.


This is something I'd like to see, although I wouldn't block on this patch waiting for it.
Maybe we can have the initial implementation as it is (if it proves beneficial), then move to a common DPDK API if/when it becomes available.

I've cc'ed DPDK users list hoping for some input. To summarise:
From my understanding, the DPDK vhost sample application calculates TX checksum for packets received from vHost ports with invalid/0 checksums:
http://dpdk.org/browse/dpdk/tree/examples/vhost/main.c#n910
The patch being discussed in this thread (also here: https://patchwork.ozlabs.org/patch/802070/) it seems does something very similar.
Wondering on the feasibility of putting this functionality in a rte_vhost library call such that we don't have two separate implementations?

Thanks,
Ciara

> >

> > I have some other comments inline.

> >

> > Thanks,

> > Ciara

> “

> 

> 

> 

> From: Gao Zhenyu <sysugaozhenyu@gmail.com>

> Date: Wednesday, August 16, 2017 at 6:38 AM

> To: "Loftus, Ciara" <ciara.loftus@intel.com>

> Cc: "blp@ovn.org" <blp@ovn.org>, "Chandran, Sugesh"

> <sugesh.chandran@intel.com>, "ktraynor@redhat.com"

> <ktraynor@redhat.com>, Darrell Ball <dball@vmware.com>,

> "dev@openvswitch.org" <dev@openvswitch.org>

> Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX

> cksum in ovs-dpdk side

> 

> Hi Loftus,

>    I had submitted a new version, please see

> https://patchwork.ozlabs.org/patch/802070/

>    It move the cksum to vhost receive side.

> Thanks

> Zhenyu Gao

> 

> 2017-08-10 12:35 GMT+08:00 Gao Zhenyu <sysugaozhenyu@gmail.com>:

> I see, for flows in phy-phy setup, they should not be calculate cksum.

> I will revise my patch to do the cksum for vhost port only. I will send a new

> patch next week.

> 

> Thanks

> Zhenyu Gao

> 

> 2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> >

> > Hi Loftus,

> >

> > Thanks for testing and the comments!

> > Can you show more details about your phy-vm-phy,phy-phy setup and

> > testing steps? Then I can reproduce it to see if I can solve this pps problem.

> 

> You're welcome. I forgot to mention my tests were with 64B packets.

> 

> For phy-phy the setup is a single host with 2 dpdk physical ports and 1 flow

> rule port1 -> port2.

> See figure 3 here: https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-

> opnfv-04#section-4

> 

> For the phy-vm-phy the setup is a single host with 2 dpdk physical ports and 2

> vhostuser ports with flow rules:

> Dpdk1 -> vhost 1 & vhost2 -> dpdk2

> IP rules are set up in the VM to route packets from vhost1 to vhost 2.

> See figure 4 in the link above.

> 

> >

> > BTW, how about throughput, did you saw improvment?

> 

> By throughput if you mean 0% packet loss, I did not test this.

> 

> Thanks,

> Ciara

> 

> >

> > I would like to implement vhost->vhost part.

> >

> > Thanks

> > Zhenyu Gao

> >

> > 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> > >

> > > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.

> > > So L4 packets's cksum were calculated in VM side but performance is not

> > > good.

> > > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput

> and

> > > makes virtio-net frontend-driver support NETIF_F_SG as well

> > >

> > > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>

> > > ---

> > >

> > > Here is some performance number:

> > >

> > > Setup:

> > >

> > >  qperf client

> > > +---------+

> > > |   VM    |

> > > +---------+

> > >      |

> > >      |                          qperf server

> > > +--------------+              +------------+

> > > | vswitch+dpdk |              | bare-metal |

> > > +--------------+              +------------+

> > >        |                            |

> > >        |                            |

> > >       pNic---------PhysicalSwitch----

> > >

> > > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx

> > on'

> > > in VM side.

> > >                       It offload cksum job to ovs-dpdk side.

> > >

> > > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off' in

> > VM

> > > side.

> > >                 VM calculate cksum for tcp/udp packets.

> > >

> > > We can see huge improvment in TCP throughput if we leverage ovs-dpdk

> > > cksum.

> > Hi Zhenyu,

> >

> > Thanks for the patch. I tested some alternative use cases and

> unfortunately I

> > see a degradation for phy-phy and phy-vm-phy topologies.

> > Here are my results:

> >

> > phy-vm-phy:

> > without patch: 0.871Mpps

> > with patch (offload=on): 0.877Mpps

> > with patch (offload=off): 0.891Mpps

> >

> > phy-phy:

> > without patch: 13.581Mpps

> > with patch: 13.055Mpps

> >

> > The half a million pps drop for the second test case is concerning to me but

> > not surprising since we're adding extra complexity to netdev_dpdk_send()

> > Could this be avoided? Would it make sense to put this functionality

> > somewhere else eg. vhost receive?

> >

> > On another note I have a general concern. I understand similar functionality

> > is present in the DPDK vhost sample app. I wonder if it would be feasible

> for

> > this to be implemented in the DPDK vhost library and leveraged here,

> rather

> > than having two implementations in two separate code bases.

> >

> > I have some other comments inline.

> >

> > Thanks,

> > Ciara

> >

> > >

> > > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-

> server01

> > > tcp_bw tcp_lat udp_bw udp_lat

> > >   do cksum in ovs-dpdk          do cksum in VM             without this patch

> > > tcp_bw:

> > >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95 MB/sec

> > > tcp_bw:

> > >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98 MB/sec

> > > tcp_bw:

> > >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19 MB/sec

> > > tcp_bw:

> > >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7 MB/sec

> > > tcp_bw:

> > >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7 MB/sec

> > > tcp_bw:

> > >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9 MB/sec

> > > tcp_bw:

> > >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1 MB/sec

> > > tcp_bw:

> > >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec

> > > tcp_bw:

> > >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec

> > > tcp_bw:

> > >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec

> > > tcp_bw:

> > >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec

> > > tcp_bw:

> > >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec

> > > tcp_bw:

> > >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec

> > > tcp_bw:

> > >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec

> > > tcp_bw:

> > >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec

> > > tcp_bw:

> > >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec

> > > tcp_bw:

> > >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec

> > > tcp_lat:

> > >     latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us

> > > tcp_lat:

> > >     latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5 us

> > > tcp_lat:

> > >     latency  =  29 us         latency  =  29.3 us       latency  =  29.6 us

> > > tcp_lat:

> > >     latency  =  29 us         latency  =  29.4 us       latency  =  29.5 us

> > > tcp_lat:

> > >     latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us

> > > tcp_lat:

> > >     latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7 us

> > > tcp_lat:

> > >     latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us

> > > tcp_lat:

> > >     latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2 us

> > > tcp_lat:

> > >     latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us

> > > tcp_lat:

> > >     latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2 us

> > > tcp_lat:

> > >     latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5 us

> > > tcp_lat:

> > >     latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6 us

> > > tcp_lat:

> > >      latency  =  47.6 us      latency  =  48 us         latency  =  48.1 us

> > > tcp_lat:

> > >     latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8 us

> > > tcp_lat:

> > >     latency  =  82.8 us       latency  =  82.3 us       latency  =  116 us

> > > tcp_lat:

> > >     latency  =  94.8 us       latency  =  94.2 us       latency  =  134 us

> > > tcp_lat:

> > >     latency  =  167 us        latency  =  197 us        latency  =  172 us

> > > udp_bw:

> > >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403

> > KB/sec

> > >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400

> KB/sec

> > > udp_bw:

> > >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810

> > KB/sec

> > >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807

> KB/sec

> > > udp_bw:

> > >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63

> > > MB/sec

> > >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63

> > > MB/sec

> > > udp_bw:

> > >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26

> > > MB/sec

> > >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82

> > > MB/sec

> > > udp_bw:

> > >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45

> > > MB/sec

> > >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45

> > > MB/sec

> > > udp_bw:

> > >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13

> > > MB/sec

> > >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13

> > MB/sec

> > > udp_bw:

> > >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9

> > > MB/sec

> > >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7

> > > MB/sec

> > > udp_bw:

> > >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52

> > > MB/sec

> > >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2

> > > MB/sec

> > > udp_bw:

> > >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103

> > > MB/sec

> > >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100

> > MB/sec

> > > udp_bw:

> > >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205

> > > MB/sec

> > >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202

> > MB/sec

> > > udp_bw:

> > >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401

> > > MB/sec

> > >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358

> > MB/sec

> > > udp_bw:

> > >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557

> > > MB/sec

> > >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362

> > MB/sec

> > > udp_bw:

> > >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863

> > > MB/sec

> > >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584

> > MB/sec

> > > udp_bw:

> > >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08

> > > GB/sec

> > >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793

> > > MB/sec

> > > udp_bw:

> > >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19

> > > GB/sec

> > >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673

> > > MB/sec

> > > udp_bw:

> > >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3

> > GB/sec

> > >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762

> > MB/sec

> > > udp_bw:

> > >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0

> > > bytes/sec

> > >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0

> bytes/sec

> > > udp_lat:

> > >     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4 us

> > > udp_lat:

> > >     latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3 us

> > > udp_lat:

> > >     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3 us

> > > udp_lat:

> > >     latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3 us

> > > udp_lat:

> > >     latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7 us

> > > udp_lat:

> > >     latency  =  27 us         latency  =  26.7 us       latency  =  26.6 us

> > > udp_lat:

> > >     latency  =  27 us         latency  =  26.9 us       latency  =  26.7 us

> > > udp_lat:

> > >     latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3 us

> > > udp_lat:

> > >     latency  =  28.1 us       latency  =  28 us         latency  =  28 us

> > > udp_lat:

> > >      latency  =  29.4 us      latency  =  29.2 us       latency  =  29.2 us

> > > udp_lat:

> > >     latency  =  31 us         latency  =  31 us         latency  =  30.8 us

> > > udp_lat:

> > >     latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3 us

> > > udp_lat:

> > >     latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5 us

> > > udp_lat:

> > >     latency  =  64.9 us       latency  =  65 us         latency  =  65 us

> > > udp_lat:

> > >     latency  =  72.3 us       latency  =  72 us         latency  =  72 us

> > > udp_lat:

> > >     latency  =  121 us        latency  =  122 us        latency  =  122 us

> > > udp_lat:

> > >     latency  =  0 ns          latency  =  0 ns          latency  =  0 ns

> > >

> > >

> > >  lib/netdev-dpdk.c | 84

> > > +++++++++++++++++++++++++++++++++++++++++++++++++++++--

> > >  1 file changed, 82 insertions(+), 2 deletions(-)

> > >

> > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c

> > > index ea17b97..d27d615 100644

> > > --- a/lib/netdev-dpdk.c

> > > +++ b/lib/netdev-dpdk.c

> > > @@ -28,6 +28,7 @@

> > >  #include <rte_errno.h>

> > >  #include <rte_eth_ring.h>

> > >  #include <rte_ethdev.h>

> > > +#include <rte_ip.h>

> > >  #include <rte_malloc.h>

> > >  #include <rte_mbuf.h>

> > >  #include <rte_meter.h>

> > > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq

> > > *rxq)

> > >      rte_free(rx);

> > >  }

> > >

> > > +static inline void

> > > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,

> > > +                       uint8_t l4_proto, bool is_ipv4)

> > > +{

> > > +    void *l3hdr = (void *)(data + pkt->l3_ofs);

> > > +

> > > +    if (l4_proto == IPPROTO_TCP) {

> > > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt-

> > >l4_ofs);

> > > +

> > > +        pkt->mbuf.l2_len = pkt->l3_ofs;

> > > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;

> > > +        tcp_hdr->tcp_csum = 0;

> > > +        if (is_ipv4) {

> > > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);

> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;

> > > +        } else {

> > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;

> > > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);

> > > +        }

> > > +    } else if (l4_proto == IPPROTO_UDP) {

> > > +        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt-

> > > >l4_ofs);

> > > +        /* do not recalculate udp cksum if it was 0 */

> > > +        if (udp_hdr->udp_csum != 0) {

> > > +            pkt->mbuf.l2_len = pkt->l3_ofs;

> > > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;

> > > +            udp_hdr->udp_csum = 0;

> > > +            if (is_ipv4) {

> > > +                /*do not calculate udp cksum if it was a fragment IP*/

> > > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->

> > > +                                      fragment_offset)) {

> > > +                    return;

> > > +                }

> > > +

> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;

> > > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr,

> udp_hdr);

> > > +            } else {

> > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;

> > > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr,

> udp_hdr);

> > > +            }

> > > +        }

> > > +    }

> > > +}

> > > +

> > > +static inline void

> > > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)

> > > +{

> > > +    int i;

> > > +

> > > +    for (i = 0; i < pkt_cnt; i++) {

> > > +        ovs_be16 dl_type;

> > > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];

> > > +        const char *data = dp_packet_data(pkt);

> > > +        void *l3hdr = (char *)(data + pkt->l3_ofs);

> > > +

> > > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {

> > > +            continue;

> > > +        }

> > > +        /* This take a assumption that it should be a vhost packet if this

> > > +         * packet was allocated by DPDK pool and try sending to pNic. */

> > > +        if (pkt->source == DPBUF_DPDK &&

> > > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {

> > > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need

> > cksum

> > > +            continue;

> > > +        }

> > The comments here could be formatted better. Suggest combining both

> into

> > one comment before the 'if'.

> > Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.

> >

> > > +

> > > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);

> > > +        if (dl_type == htons(ETH_TYPE_IP)) {

> > > +            netdev_refill_l4_cksum(data, pkt,

> > > +                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,

> > > +                                   true);

> > > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {

> > > +            netdev_refill_l4_cksum(data, pkt,

> > > +                                   ((struct ipv6_hdr *)l3hdr)->proto,

> > > +                                   false);

> > > +        }

> > > +    }

> > > +}

> > > +

> > >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of

> > >   * 'pkts', even in case of failure.

> > >   *

> > > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk

> *dev,

> > > int qid,

> > >          return;

> > >      }

> > >

> > > +    netdev_prepare_tx_csum(batch->packets, batch->count);

> >

> > Putting this here assumes we only prepare the csum for vhost -> dpdk or

> > vhost -> ring cases. What about vhost -> vhost?

> >

> > > +

> > >      if (OVS_UNLIKELY(concurrent_txq)) {

> > >          qid = qid % dev->up.n_txq;

> > >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> > > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)

> > >      if (ovsthread_once_start(&once)) {

> > >          rte_vhost_driver_callback_register(&virtio_net_device_ops);

> > >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> > > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6

> > > -                                  | 1ULL << VIRTIO_NET_F_CSUM);

> > > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);

> > >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);

> > >

> > >          ovsthread_once_done(&once);

> > > --

> > > 1.8.3.1

> > >

> > > _______________________________________________

> > > dev mailing list

> > > dev@openvswitch.org

> > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev

>
Gao Zhenyu Aug. 23, 2017, 3:12 p.m. | #8
Yes, maintaining only one implementation is resonable.
However making ovs-dpdk to support vhost tx-cksum first is doable as well.
We can have it in ovs, and replace it with new DPDK API once ovs update its
dpdk version which contains the tx-cksum implementation.


Thanks
Zhenyu Gao

2017-08-23 21:59 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> >
> > Hi Ciara
> >
> > You had a general concern below; can we conclude on that before going
> > further ?
> >
> > Thanks Darrell
> >
> > “
> > > On another note I have a general concern. I understand similar
> functionality
> > > is present in the DPDK vhost sample app. I wonder if it would be
> feasible
> > for
> > > this to be implemented in the DPDK vhost library and leveraged here,
> > rather
> > > than having two implementations in two separate code bases.
>
> This is something I'd like to see, although I wouldn't block on this patch
> waiting for it.
> Maybe we can have the initial implementation as it is (if it proves
> beneficial), then move to a common DPDK API if/when it becomes available.
>
> I've cc'ed DPDK users list hoping for some input. To summarise:
> From my understanding, the DPDK vhost sample application calculates TX
> checksum for packets received from vHost ports with invalid/0 checksums:
> http://dpdk.org/browse/dpdk/tree/examples/vhost/main.c#n910
> The patch being discussed in this thread (also here:
> https://patchwork.ozlabs.org/patch/802070/) it seems does something very
> similar.
> Wondering on the feasibility of putting this functionality in a rte_vhost
> library call such that we don't have two separate implementations?
>
> Thanks,
> Ciara
>
> > >
> > > I have some other comments inline.
> > >
> > > Thanks,
> > > Ciara
> > “
> >
> >
> >
> > From: Gao Zhenyu <sysugaozhenyu@gmail.com>
> > Date: Wednesday, August 16, 2017 at 6:38 AM
> > To: "Loftus, Ciara" <ciara.loftus@intel.com>
> > Cc: "blp@ovn.org" <blp@ovn.org>, "Chandran, Sugesh"
> > <sugesh.chandran@intel.com>, "ktraynor@redhat.com"
> > <ktraynor@redhat.com>, Darrell Ball <dball@vmware.com>,
> > "dev@openvswitch.org" <dev@openvswitch.org>
> > Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX
> > cksum in ovs-dpdk side
> >
> > Hi Loftus,
> >    I had submitted a new version, please see
> > https://patchwork.ozlabs.org/patch/802070/
> >    It move the cksum to vhost receive side.
> > Thanks
> > Zhenyu Gao
> >
> > 2017-08-10 12:35 GMT+08:00 Gao Zhenyu <sysugaozhenyu@gmail.com>:
> > I see, for flows in phy-phy setup, they should not be calculate cksum.
> > I will revise my patch to do the cksum for vhost port only. I will send
> a new
> > patch next week.
> >
> > Thanks
> > Zhenyu Gao
> >
> > 2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
> > >
> > > Hi Loftus,
> > >
> > > Thanks for testing and the comments!
> > > Can you show more details about your phy-vm-phy,phy-phy setup and
> > > testing steps? Then I can reproduce it to see if I can solve this pps
> problem.
> >
> > You're welcome. I forgot to mention my tests were with 64B packets.
> >
> > For phy-phy the setup is a single host with 2 dpdk physical ports and 1
> flow
> > rule port1 -> port2.
> > See figure 3 here: https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-
> > opnfv-04#section-4
> >
> > For the phy-vm-phy the setup is a single host with 2 dpdk physical ports
> and 2
> > vhostuser ports with flow rules:
> > Dpdk1 -> vhost 1 & vhost2 -> dpdk2
> > IP rules are set up in the VM to route packets from vhost1 to vhost 2.
> > See figure 4 in the link above.
> >
> > >
> > > BTW, how about throughput, did you saw improvment?
> >
> > By throughput if you mean 0% packet loss, I did not test this.
> >
> > Thanks,
> > Ciara
> >
> > >
> > > I would like to implement vhost->vhost part.
> > >
> > > Thanks
> > > Zhenyu Gao
> > >
> > > 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
> > > >
> > > > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx
> cksum.
> > > > So L4 packets's cksum were calculated in VM side but performance is
> not
> > > > good.
> > > > Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput
> > and
> > > > makes virtio-net frontend-driver support NETIF_F_SG as well
> > > >
> > > > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> > > > ---
> > > >
> > > > Here is some performance number:
> > > >
> > > > Setup:
> > > >
> > > >  qperf client
> > > > +---------+
> > > > |   VM    |
> > > > +---------+
> > > >      |
> > > >      |                          qperf server
> > > > +--------------+              +------------+
> > > > | vswitch+dpdk |              | bare-metal |
> > > > +--------------+              +------------+
> > > >        |                            |
> > > >        |                            |
> > > >       pNic---------PhysicalSwitch----
> > > >
> > > > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K
> eth0 tx
> > > on'
> > > > in VM side.
> > > >                       It offload cksum job to ovs-dpdk side.
> > > >
> > > > do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx
> off' in
> > > VM
> > > > side.
> > > >                 VM calculate cksum for tcp/udp packets.
> > > >
> > > > We can see huge improvment in TCP throughput if we leverage ovs-dpdk
> > > > cksum.
> > > Hi Zhenyu,
> > >
> > > Thanks for the patch. I tested some alternative use cases and
> > unfortunately I
> > > see a degradation for phy-phy and phy-vm-phy topologies.
> > > Here are my results:
> > >
> > > phy-vm-phy:
> > > without patch: 0.871Mpps
> > > with patch (offload=on): 0.877Mpps
> > > with patch (offload=off): 0.891Mpps
> > >
> > > phy-phy:
> > > without patch: 13.581Mpps
> > > with patch: 13.055Mpps
> > >
> > > The half a million pps drop for the second test case is concerning to
> me but
> > > not surprising since we're adding extra complexity to
> netdev_dpdk_send()
> > > Could this be avoided? Would it make sense to put this functionality
> > > somewhere else eg. vhost receive?
> > >
> > > On another note I have a general concern. I understand similar
> functionality
> > > is present in the DPDK vhost sample app. I wonder if it would be
> feasible
> > for
> > > this to be implemented in the DPDK vhost library and leveraged here,
> > rather
> > > than having two implementations in two separate code bases.
> > >
> > > I have some other comments inline.
> > >
> > > Thanks,
> > > Ciara
> > >
> > > >
> > > > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-
> > server01
> > > > tcp_bw tcp_lat udp_bw udp_lat
> > > >   do cksum in ovs-dpdk          do cksum in VM             without
> this patch
> > > > tcp_bw:
> > > >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488
> MB/sec
> > > > tcp_bw:
> > > >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483
> MB/sec
> > > > tcp_lat:
> > > >     latency  =  29 us         latency  =  29.2 us       latency  =
> 29.6 us
> > > > tcp_lat:
> > > >     latency  =  28.9 us       latency  =  29.3 us       latency  =
> 29.5 us
> > > > tcp_lat:
> > > >     latency  =  29 us         latency  =  29.3 us       latency  =
> 29.6 us
> > > > tcp_lat:
> > > >     latency  =  29 us         latency  =  29.4 us       latency  =
> 29.5 us
> > > > tcp_lat:
> > > >     latency  =  29 us         latency  =  29.2 us       latency  =
> 29.6 us
> > > > tcp_lat:
> > > >     latency  =  29.1 us       latency  =  29.3 us       latency  =
> 29.7 us
> > > > tcp_lat:
> > > >     latency  =  29.4 us       latency  =  29.6 us       latency  =
> 30 us
> > > > tcp_lat:
> > > >     latency  =  29.8 us       latency  =  30.1 us       latency  =
> 30.2 us
> > > > tcp_lat:
> > > >     latency  =  30.9 us       latency  =  30.9 us       latency  =
> 31 us
> > > > tcp_lat:
> > > >     latency  =  46.9 us       latency  =  46.2 us       latency  =
> 32.2 us
> > > > tcp_lat:
> > > >     latency  =  51.5 us       latency  =  52.6 us       latency  =
> 34.5 us
> > > > tcp_lat:
> > > >     latency  =  43.9 us       latency  =  43.8 us       latency  =
> 43.6 us
> > > > tcp_lat:
> > > >      latency  =  47.6 us      latency  =  48 us         latency  =
> 48.1 us
> > > > tcp_lat:
> > > >     latency  =  77.7 us       latency  =  78.8 us       latency  =
> 78.8 us
> > > > tcp_lat:
> > > >     latency  =  82.8 us       latency  =  82.3 us       latency  =
> 116 us
> > > > tcp_lat:
> > > >     latency  =  94.8 us       latency  =  94.2 us       latency  =
> 134 us
> > > > tcp_lat:
> > > >     latency  =  167 us        latency  =  197 us        latency  =
> 172 us
> > > > udp_bw:
> > > >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =
> 403
> > > KB/sec
> > > >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =
> 400
> > KB/sec
> > > > udp_bw:
> > > >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =
> 810
> > > KB/sec
> > > >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =
> 807
> > KB/sec
> > > > udp_bw:
> > > >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =
> 1.63
> > > > MB/sec
> > > >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =
> 1.63
> > > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =
> 3.26
> > > > MB/sec
> > > >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =
> 2.82
> > > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =
> 6.45
> > > > MB/sec
> > > >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =
> 6.45
> > > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =
> 13
> > > > MB/sec
> > > >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =
> 13
> > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =
> 25.9
> > > > MB/sec
> > > >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =
> 25.7
> > > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =
>   52
> > > > MB/sec
> > > >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =
> 51.2
> > > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =
> 103
> > > > MB/sec
> > > >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =
> 100
> > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =
> 205
> > > > MB/sec
> > > >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =
> 202
> > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =
> 401
> > > > MB/sec
> > > >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =
> 358
> > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =
> 557
> > > > MB/sec
> > > >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =
> 362
> > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =
> 863
> > > > MB/sec
> > > >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =
> 584
> > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =
> 1.08
> > > > GB/sec
> > > >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =
>  793
> > > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =
> 1.19
> > > > GB/sec
> > > >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =
>  673
> > > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =
> 1.3
> > > GB/sec
> > > >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =
> 762
> > > MB/sec
> > > > udp_bw:
> > > >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0
> > > > bytes/sec
> > > >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0
> > bytes/sec
> > > > udp_lat:
> > > >     latency  =  26.7 us       latency  =  26.5 us       latency  =
> 26.4 us
> > > > udp_lat:
> > > >     latency  =  26.7 us       latency  =  26.5 us       latency  =
> 26.3 us
> > > > udp_lat:
> > > >     latency  =  26.7 us       latency  =  26.7 us       latency  =
> 26.3 us
> > > > udp_lat:
> > > >     latency  =  26.7 us       latency  =  26.6 us       latency  =
> 26.3 us
> > > > udp_lat:
> > > >     latency  =  26.7 us       latency  =  26.7 us       latency  =
> 26.7 us
> > > > udp_lat:
> > > >     latency  =  27 us         latency  =  26.7 us       latency  =
> 26.6 us
> > > > udp_lat:
> > > >     latency  =  27 us         latency  =  26.9 us       latency  =
> 26.7 us
> > > > udp_lat:
> > > >     latency  =  27.6 us       latency  =  27.4 us       latency  =
> 27.3 us
> > > > udp_lat:
> > > >     latency  =  28.1 us       latency  =  28 us         latency  =
> 28 us
> > > > udp_lat:
> > > >      latency  =  29.4 us      latency  =  29.2 us       latency  =
> 29.2 us
> > > > udp_lat:
> > > >     latency  =  31 us         latency  =  31 us         latency  =
> 30.8 us
> > > > udp_lat:
> > > >     latency  =  41.4 us       latency  =  41.4 us       latency  =
> 41.3 us
> > > > udp_lat:
> > > >     latency  =  41.6 us       latency  =  41.5 us       latency  =
> 41.5 us
> > > > udp_lat:
> > > >     latency  =  64.9 us       latency  =  65 us         latency  =
> 65 us
> > > > udp_lat:
> > > >     latency  =  72.3 us       latency  =  72 us         latency  =
> 72 us
> > > > udp_lat:
> > > >     latency  =  121 us        latency  =  122 us        latency  =
> 122 us
> > > > udp_lat:
> > > >     latency  =  0 ns          latency  =  0 ns          latency  =
> 0 ns
> > > >
> > > >
> > > >  lib/netdev-dpdk.c | 84
> > > > +++++++++++++++++++++++++++++++++++++++++++++++++++++--
> > > >  1 file changed, 82 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > > > index ea17b97..d27d615 100644
> > > > --- a/lib/netdev-dpdk.c
> > > > +++ b/lib/netdev-dpdk.c
> > > > @@ -28,6 +28,7 @@
> > > >  #include <rte_errno.h>
> > > >  #include <rte_eth_ring.h>
> > > >  #include <rte_ethdev.h>
> > > > +#include <rte_ip.h>
> > > >  #include <rte_malloc.h>
> > > >  #include <rte_mbuf.h>
> > > >  #include <rte_meter.h>
> > > > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> > > > *rxq)
> > > >      rte_free(rx);
> > > >  }
> > > >
> > > > +static inline void
> > > > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
> > > > +                       uint8_t l4_proto, bool is_ipv4)
> > > > +{
> > > > +    void *l3hdr = (void *)(data + pkt->l3_ofs);
> > > > +
> > > > +    if (l4_proto == IPPROTO_TCP) {
> > > > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
> pkt-
> > > >l4_ofs);
> > > > +
> > > > +        pkt->mbuf.l2_len = pkt->l3_ofs;
> > > > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > > > +        tcp_hdr->tcp_csum = 0;
> > > > +        if (is_ipv4) {
> > > > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr,
> tcp_hdr);
> > > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
> > > > +        } else {
> > > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
> > > > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr,
> tcp_hdr);
> > > > +        }
> > > > +    } else if (l4_proto == IPPROTO_UDP) {
> > > > +        struct udp_header *udp_hdr = (struct udp_header *)(data +
> pkt-
> > > > >l4_ofs);
> > > > +        /* do not recalculate udp cksum if it was 0 */
> > > > +        if (udp_hdr->udp_csum != 0) {
> > > > +            pkt->mbuf.l2_len = pkt->l3_ofs;
> > > > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > > > +            udp_hdr->udp_csum = 0;
> > > > +            if (is_ipv4) {
> > > > +                /*do not calculate udp cksum if it was a fragment
> IP*/
> > > > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> > > > +                                      fragment_offset)) {
> > > > +                    return;
> > > > +                }
> > > > +
> > > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM |
> PKT_TX_IPV4;
> > > > +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr,
> > udp_hdr);
> > > > +            } else {
> > > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM |
> PKT_TX_IPV6;
> > > > +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr,
> > udp_hdr);
> > > > +            }
> > > > +        }
> > > > +    }
> > > > +}
> > > > +
> > > > +static inline void
> > > > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> > > > +{
> > > > +    int i;
> > > > +
> > > > +    for (i = 0; i < pkt_cnt; i++) {
> > > > +        ovs_be16 dl_type;
> > > > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> > > > +        const char *data = dp_packet_data(pkt);
> > > > +        void *l3hdr = (char *)(data + pkt->l3_ofs);
> > > > +
> > > > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX)
> {
> > > > +            continue;
> > > > +        }
> > > > +        /* This take a assumption that it should be a vhost packet
> if this
> > > > +         * packet was allocated by DPDK pool and try sending to
> pNic. */
> > > > +        if (pkt->source == DPBUF_DPDK &&
> > > > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> > > > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet
> need
> > > cksum
> > > > +            continue;
> > > > +        }
> > > The comments here could be formatted better. Suggest combining both
> > into
> > > one comment before the 'if'.
> > > Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.
> > >
> > > > +
> > > > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
> > > > +        if (dl_type == htons(ETH_TYPE_IP)) {
> > > > +            netdev_refill_l4_cksum(data, pkt,
> > > > +                                   ((struct ipv4_hdr
> *)l3hdr)->next_proto_id,
> > > > +                                   true);
> > > > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> > > > +            netdev_refill_l4_cksum(data, pkt,
> > > > +                                   ((struct ipv6_hdr
> *)l3hdr)->proto,
> > > > +                                   false);
> > > > +        }
> > > > +    }
> > > > +}
> > > > +
> > > >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
> ownership of
> > > >   * 'pkts', even in case of failure.
> > > >   *
> > > > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk
> > *dev,
> > > > int qid,
> > > >          return;
> > > >      }
> > > >
> > > > +    netdev_prepare_tx_csum(batch->packets, batch->count);
> > >
> > > Putting this here assumes we only prepare the csum for vhost -> dpdk or
> > > vhost -> ring cases. What about vhost -> vhost?
> > >
> > > > +
> > > >      if (OVS_UNLIKELY(concurrent_txq)) {
> > > >          qid = qid % dev->up.n_txq;
> > > >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> > > > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
> > > >      if (ovsthread_once_start(&once)) {
> > > >          rte_vhost_driver_callback_register(&virtio_net_device_ops);
> > > >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> > > > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
> > > > -                                  | 1ULL << VIRTIO_NET_F_CSUM);
> > > > +                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
> > > >          ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> > > >
> > > >          ovsthread_once_done(&once);
> > > > --
> > > > 1.8.3.1
> > > >
> > > > _______________________________________________
> > > > dev mailing list
> > > > dev@openvswitch.org
> > > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >
>
>
O Mahony, Billy Aug. 24, 2017, 9:07 a.m. | #9
Hi Gao,

Thanks for working on this. Lack of checksum offload is big difference between ovs and ovs-dpdk when using linux stack in the guest.
 
The thing that struck me was that rather than immediately calculating the L4 checksum in the host on vhost rx that the calculation should be delayed until it's known to be absolutely required to be done on the host. If the packet is for another VM a checksum is not required as the bits are not going over a physical medium. And if the packets is destined for a NIC then the checksum can be offloaded if the NIC supports it.

I'm not sure why doing the L4 sum in the guest should give a performance gain. The processing still has to be done. Maybe the guest code was compiled for an older architecture and is not using as efficient a set of instructions?

In any case the best advantage of having dpdk virtio device  support offload is if it can further offload to a NIC or avoid cksum entirely if the packet is destined for a local VM.

Thanks,
Billy. 


> -----Original Message-----

> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-

> bounces@openvswitch.org] On Behalf Of Gao Zhenyu

> Sent: Wednesday, August 23, 2017 4:12 PM

> To: Loftus, Ciara <ciara.loftus@intel.com>

> Cc: dev@openvswitch.org; users@dpdk.org

> Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX

> cksum in ovs-dpdk side

> 

> Yes, maintaining only one implementation is resonable.

> However making ovs-dpdk to support vhost tx-cksum first is doable as well.

> We can have it in ovs, and replace it with new DPDK API once ovs update its

> dpdk version which contains the tx-cksum implementation.

> 

> 

> Thanks

> Zhenyu Gao

> 

> 2017-08-23 21:59 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> 

> > >

> > > Hi Ciara

> > >

> > > You had a general concern below; can we conclude on that before

> > > going further ?

> > >

> > > Thanks Darrell

> > >

> > > “

> > > > On another note I have a general concern. I understand similar

> > functionality

> > > > is present in the DPDK vhost sample app. I wonder if it would be

> > feasible

> > > for

> > > > this to be implemented in the DPDK vhost library and leveraged

> > > > here,

> > > rather

> > > > than having two implementations in two separate code bases.

> >

> > This is something I'd like to see, although I wouldn't block on this

> > patch waiting for it.

> > Maybe we can have the initial implementation as it is (if it proves

> > beneficial), then move to a common DPDK API if/when it becomes

> available.

> >

> > I've cc'ed DPDK users list hoping for some input. To summarise:

> > From my understanding, the DPDK vhost sample application calculates TX

> > checksum for packets received from vHost ports with invalid/0 checksums:

> > http://dpdk.org/browse/dpdk/tree/examples/vhost/main.c#n910

> > The patch being discussed in this thread (also here:

> > https://patchwork.ozlabs.org/patch/802070/) it seems does something

> > very similar.

> > Wondering on the feasibility of putting this functionality in a

> > rte_vhost library call such that we don't have two separate

> implementations?

> >

> > Thanks,

> > Ciara

> >

> > > >

> > > > I have some other comments inline.

> > > >

> > > > Thanks,

> > > > Ciara

> > > “

> > >

> > >

> > >

> > > From: Gao Zhenyu <sysugaozhenyu@gmail.com>

> > > Date: Wednesday, August 16, 2017 at 6:38 AM

> > > To: "Loftus, Ciara" <ciara.loftus@intel.com>

> > > Cc: "blp@ovn.org" <blp@ovn.org>, "Chandran, Sugesh"

> > > <sugesh.chandran@intel.com>, "ktraynor@redhat.com"

> > > <ktraynor@redhat.com>, Darrell Ball <dball@vmware.com>,

> > > "dev@openvswitch.org" <dev@openvswitch.org>

> > > Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX

> > > cksum in ovs-dpdk side

> > >

> > > Hi Loftus,

> > >    I had submitted a new version, please see

> > > https://patchwork.ozlabs.org/patch/802070/

> > >    It move the cksum to vhost receive side.

> > > Thanks

> > > Zhenyu Gao

> > >

> > > 2017-08-10 12:35 GMT+08:00 Gao Zhenyu <sysugaozhenyu@gmail.com>:

> > > I see, for flows in phy-phy setup, they should not be calculate cksum.

> > > I will revise my patch to do the cksum for vhost port only. I will

> > > send

> > a new

> > > patch next week.

> > >

> > > Thanks

> > > Zhenyu Gao

> > >

> > > 2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> > > >

> > > > Hi Loftus,

> > > >

> > > > Thanks for testing and the comments!

> > > > Can you show more details about your phy-vm-phy,phy-phy setup and

> > > > testing steps? Then I can reproduce it to see if I can solve this

> > > > pps

> > problem.

> > >

> > > You're welcome. I forgot to mention my tests were with 64B packets.

> > >

> > > For phy-phy the setup is a single host with 2 dpdk physical ports

> > > and 1

> > flow

> > > rule port1 -> port2.

> > > See figure 3 here:

> > > https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-

> > > opnfv-04#section-4

> > >

> > > For the phy-vm-phy the setup is a single host with 2 dpdk physical

> > > ports

> > and 2

> > > vhostuser ports with flow rules:

> > > Dpdk1 -> vhost 1 & vhost2 -> dpdk2

> > > IP rules are set up in the VM to route packets from vhost1 to vhost 2.

> > > See figure 4 in the link above.

> > >

> > > >

> > > > BTW, how about throughput, did you saw improvment?

> > >

> > > By throughput if you mean 0% packet loss, I did not test this.

> > >

> > > Thanks,

> > > Ciara

> > >

> > > >

> > > > I would like to implement vhost->vhost part.

> > > >

> > > > Thanks

> > > > Zhenyu Gao

> > > >

> > > > 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:

> > > > >

> > > > > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx

> > cksum.

> > > > > So L4 packets's cksum were calculated in VM side but performance

> > > > > is

> > not

> > > > > good.

> > > > > Implementing tcp/udp tx cksum in ovs-dpdk side improves

> > > > > throughput

> > > and

> > > > > makes virtio-net frontend-driver support NETIF_F_SG as well

> > > > >

> > > > > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>

> > > > > ---

> > > > >

> > > > > Here is some performance number:

> > > > >

> > > > > Setup:

> > > > >

> > > > >  qperf client

> > > > > +---------+

> > > > > |   VM    |

> > > > > +---------+

> > > > >      |

> > > > >      |                          qperf server

> > > > > +--------------+              +------------+

> > > > > | vswitch+dpdk |              | bare-metal |

> > > > > +--------------+              +------------+

> > > > >        |                            |

> > > > >        |                            |

> > > > >       pNic---------PhysicalSwitch----

> > > > >

> > > > > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K

> > eth0 tx

> > > > on'

> > > > > in VM side.

> > > > >                       It offload cksum job to ovs-dpdk side.

> > > > >

> > > > > do cksum in VM: Applied this patch and execute 'ethtool -K eth0

> > > > > tx

> > off' in

> > > > VM

> > > > > side.

> > > > >                 VM calculate cksum for tcp/udp packets.

> > > > >

> > > > > We can see huge improvment in TCP throughput if we leverage

> > > > > ovs-dpdk cksum.

> > > > Hi Zhenyu,

> > > >

> > > > Thanks for the patch. I tested some alternative use cases and

> > > unfortunately I

> > > > see a degradation for phy-phy and phy-vm-phy topologies.

> > > > Here are my results:

> > > >

> > > > phy-vm-phy:

> > > > without patch: 0.871Mpps

> > > > with patch (offload=on): 0.877Mpps with patch (offload=off):

> > > > 0.891Mpps

> > > >

> > > > phy-phy:

> > > > without patch: 13.581Mpps

> > > > with patch: 13.055Mpps

> > > >

> > > > The half a million pps drop for the second test case is concerning

> > > > to

> > me but

> > > > not surprising since we're adding extra complexity to

> > netdev_dpdk_send()

> > > > Could this be avoided? Would it make sense to put this

> > > > functionality somewhere else eg. vhost receive?

> > > >

> > > > On another note I have a general concern. I understand similar

> > functionality

> > > > is present in the DPDK vhost sample app. I wonder if it would be

> > feasible

> > > for

> > > > this to be implemented in the DPDK vhost library and leveraged

> > > > here,

> > > rather

> > > > than having two implementations in two separate code bases.

> > > >

> > > > I have some other comments inline.

> > > >

> > > > Thanks,

> > > > Ciara

> > > >

> > > > >

> > > > > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2

> > > > > host-qperf-

> > > server01

> > > > > tcp_bw tcp_lat udp_bw udp_lat

> > > > >   do cksum in ovs-dpdk          do cksum in VM             without

> > this patch

> > > > > tcp_bw:

> > > > >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488

> > MB/sec

> > > > > tcp_bw:

> > > > >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483

> > MB/sec

> > > > > tcp_lat:

> > > > >     latency  =  29 us         latency  =  29.2 us       latency  =

> > 29.6 us

> > > > > tcp_lat:

> > > > >     latency  =  28.9 us       latency  =  29.3 us       latency  =

> > 29.5 us

> > > > > tcp_lat:

> > > > >     latency  =  29 us         latency  =  29.3 us       latency  =

> > 29.6 us

> > > > > tcp_lat:

> > > > >     latency  =  29 us         latency  =  29.4 us       latency  =

> > 29.5 us

> > > > > tcp_lat:

> > > > >     latency  =  29 us         latency  =  29.2 us       latency  =

> > 29.6 us

> > > > > tcp_lat:

> > > > >     latency  =  29.1 us       latency  =  29.3 us       latency  =

> > 29.7 us

> > > > > tcp_lat:

> > > > >     latency  =  29.4 us       latency  =  29.6 us       latency  =

> > 30 us

> > > > > tcp_lat:

> > > > >     latency  =  29.8 us       latency  =  30.1 us       latency  =

> > 30.2 us

> > > > > tcp_lat:

> > > > >     latency  =  30.9 us       latency  =  30.9 us       latency  =

> > 31 us

> > > > > tcp_lat:

> > > > >     latency  =  46.9 us       latency  =  46.2 us       latency  =

> > 32.2 us

> > > > > tcp_lat:

> > > > >     latency  =  51.5 us       latency  =  52.6 us       latency  =

> > 34.5 us

> > > > > tcp_lat:

> > > > >     latency  =  43.9 us       latency  =  43.8 us       latency  =

> > 43.6 us

> > > > > tcp_lat:

> > > > >      latency  =  47.6 us      latency  =  48 us         latency  =

> > 48.1 us

> > > > > tcp_lat:

> > > > >     latency  =  77.7 us       latency  =  78.8 us       latency  =

> > 78.8 us

> > > > > tcp_lat:

> > > > >     latency  =  82.8 us       latency  =  82.3 us       latency  =

> > 116 us

> > > > > tcp_lat:

> > > > >     latency  =  94.8 us       latency  =  94.2 us       latency  =

> > 134 us

> > > > > tcp_lat:

> > > > >     latency  =  167 us        latency  =  197 us        latency  =

> > 172 us

> > > > > udp_bw:

> > > > >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =

> > 403

> > > > KB/sec

> > > > >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =

> > 400

> > > KB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =

> > 810

> > > > KB/sec

> > > > >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =

> > 807

> > > KB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =

> > 1.63

> > > > > MB/sec

> > > > >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =

> > 1.63

> > > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =

> > 3.26

> > > > > MB/sec

> > > > >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =

> > 2.82

> > > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =

> > 6.45

> > > > > MB/sec

> > > > >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =

> > 6.45

> > > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =

> > 13

> > > > > MB/sec

> > > > >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =

> > 13

> > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =

> > 25.9

> > > > > MB/sec

> > > > >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =

> > 25.7

> > > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =

> >   52

> > > > > MB/sec

> > > > >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =

> > 51.2

> > > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =

> > 103

> > > > > MB/sec

> > > > >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =

> > 100

> > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =

> > 205

> > > > > MB/sec

> > > > >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =

> > 202

> > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =

> > 401

> > > > > MB/sec

> > > > >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =

> > 358

> > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =

> > 557

> > > > > MB/sec

> > > > >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =

> > 362

> > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =

> > 863

> > > > > MB/sec

> > > > >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =

> > 584

> > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =

> > 1.08

> > > > > GB/sec

> > > > >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =

> >  793

> > > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =

> > 1.19

> > > > > GB/sec

> > > > >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =

> >  673

> > > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =

> > 1.3

> > > > GB/sec

> > > > >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =

> > 762

> > > > MB/sec

> > > > > udp_bw:

> > > > >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0

> > > > > bytes/sec

> > > > >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0

> > > bytes/sec

> > > > > udp_lat:

> > > > >     latency  =  26.7 us       latency  =  26.5 us       latency  =

> > 26.4 us

> > > > > udp_lat:

> > > > >     latency  =  26.7 us       latency  =  26.5 us       latency  =

> > 26.3 us

> > > > > udp_lat:

> > > > >     latency  =  26.7 us       latency  =  26.7 us       latency  =

> > 26.3 us

> > > > > udp_lat:

> > > > >     latency  =  26.7 us       latency  =  26.6 us       latency  =

> > 26.3 us

> > > > > udp_lat:

> > > > >     latency  =  26.7 us       latency  =  26.7 us       latency  =

> > 26.7 us

> > > > > udp_lat:

> > > > >     latency  =  27 us         latency  =  26.7 us       latency  =

> > 26.6 us

> > > > > udp_lat:

> > > > >     latency  =  27 us         latency  =  26.9 us       latency  =

> > 26.7 us

> > > > > udp_lat:

> > > > >     latency  =  27.6 us       latency  =  27.4 us       latency  =

> > 27.3 us

> > > > > udp_lat:

> > > > >     latency  =  28.1 us       latency  =  28 us         latency  =

> > 28 us

> > > > > udp_lat:

> > > > >      latency  =  29.4 us      latency  =  29.2 us       latency  =

> > 29.2 us

> > > > > udp_lat:

> > > > >     latency  =  31 us         latency  =  31 us         latency  =

> > 30.8 us

> > > > > udp_lat:

> > > > >     latency  =  41.4 us       latency  =  41.4 us       latency  =

> > 41.3 us

> > > > > udp_lat:

> > > > >     latency  =  41.6 us       latency  =  41.5 us       latency  =

> > 41.5 us

> > > > > udp_lat:

> > > > >     latency  =  64.9 us       latency  =  65 us         latency  =

> > 65 us

> > > > > udp_lat:

> > > > >     latency  =  72.3 us       latency  =  72 us         latency  =

> > 72 us

> > > > > udp_lat:

> > > > >     latency  =  121 us        latency  =  122 us        latency  =

> > 122 us

> > > > > udp_lat:

> > > > >     latency  =  0 ns          latency  =  0 ns          latency  =

> > 0 ns

> > > > >

> > > > >

> > > > >  lib/netdev-dpdk.c | 84

> > > > >

> +++++++++++++++++++++++++++++++++++++++++++++++++++++--

> > > > >  1 file changed, 82 insertions(+), 2 deletions(-)

> > > > >

> > > > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index

> > > > > ea17b97..d27d615 100644

> > > > > --- a/lib/netdev-dpdk.c

> > > > > +++ b/lib/netdev-dpdk.c

> > > > > @@ -28,6 +28,7 @@

> > > > >  #include <rte_errno.h>

> > > > >  #include <rte_eth_ring.h>

> > > > >  #include <rte_ethdev.h>

> > > > > +#include <rte_ip.h>

> > > > >  #include <rte_malloc.h>

> > > > >  #include <rte_mbuf.h>

> > > > >  #include <rte_meter.h>

> > > > > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct

> netdev_rxq

> > > > > *rxq)

> > > > >      rte_free(rx);

> > > > >  }

> > > > >

> > > > > +static inline void

> > > > > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,

> > > > > +                       uint8_t l4_proto, bool is_ipv4) {

> > > > > +    void *l3hdr = (void *)(data + pkt->l3_ofs);

> > > > > +

> > > > > +    if (l4_proto == IPPROTO_TCP) {

> > > > > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data

> > > > > + +

> > pkt-

> > > > >l4_ofs);

> > > > > +

> > > > > +        pkt->mbuf.l2_len = pkt->l3_ofs;

> > > > > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;

> > > > > +        tcp_hdr->tcp_csum = 0;

> > > > > +        if (is_ipv4) {

> > > > > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr,

> > tcp_hdr);

> > > > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;

> > > > > +        } else {

> > > > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;

> > > > > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr,

> > tcp_hdr);

> > > > > +        }

> > > > > +    } else if (l4_proto == IPPROTO_UDP) {

> > > > > +        struct udp_header *udp_hdr = (struct udp_header *)(data

> > > > > + +

> > pkt-

> > > > > >l4_ofs);

> > > > > +        /* do not recalculate udp cksum if it was 0 */

> > > > > +        if (udp_hdr->udp_csum != 0) {

> > > > > +            pkt->mbuf.l2_len = pkt->l3_ofs;

> > > > > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;

> > > > > +            udp_hdr->udp_csum = 0;

> > > > > +            if (is_ipv4) {

> > > > > +                /*do not calculate udp cksum if it was a

> > > > > + fragment

> > IP*/

> > > > > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->

> > > > > +                                      fragment_offset)) {

> > > > > +                    return;

> > > > > +                }

> > > > > +

> > > > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM |

> > PKT_TX_IPV4;

> > > > > +                udp_hdr->udp_csum =

> > > > > + rte_ipv4_udptcp_cksum(l3hdr,

> > > udp_hdr);

> > > > > +            } else {

> > > > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM |

> > PKT_TX_IPV6;

> > > > > +                udp_hdr->udp_csum =

> > > > > + rte_ipv6_udptcp_cksum(l3hdr,

> > > udp_hdr);

> > > > > +            }

> > > > > +        }

> > > > > +    }

> > > > > +}

> > > > > +

> > > > > +static inline void

> > > > > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt) {

> > > > > +    int i;

> > > > > +

> > > > > +    for (i = 0; i < pkt_cnt; i++) {

> > > > > +        ovs_be16 dl_type;

> > > > > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];

> > > > > +        const char *data = dp_packet_data(pkt);

> > > > > +        void *l3hdr = (char *)(data + pkt->l3_ofs);

> > > > > +

> > > > > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs ==

> > > > > + UINT16_MAX)

> > {

> > > > > +            continue;

> > > > > +        }

> > > > > +        /* This take a assumption that it should be a vhost

> > > > > + packet

> > if this

> > > > > +         * packet was allocated by DPDK pool and try sending to

> > pNic. */

> > > > > +        if (pkt->source == DPBUF_DPDK &&

> > > > > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {

> > > > > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4

> > > > > + packet

> > need

> > > > cksum

> > > > > +            continue;

> > > > > +        }

> > > > The comments here could be formatted better. Suggest combining

> > > > both

> > > into

> > > > one comment before the 'if'.

> > > > Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.

> > > >

> > > > > +

> > > > > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);

> > > > > +        if (dl_type == htons(ETH_TYPE_IP)) {

> > > > > +            netdev_refill_l4_cksum(data, pkt,

> > > > > +                                   ((struct ipv4_hdr

> > *)l3hdr)->next_proto_id,

> > > > > +                                   true);

> > > > > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {

> > > > > +            netdev_refill_l4_cksum(data, pkt,

> > > > > +                                   ((struct ipv6_hdr

> > *)l3hdr)->proto,

> > > > > +                                   false);

> > > > > +        }

> > > > > +    }

> > > > > +}

> > > > > +

> > > > >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.

> > > > > Takes

> > ownership of

> > > > >   * 'pkts', even in case of failure.

> > > > >   *

> > > > > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk

> > > *dev,

> > > > > int qid,

> > > > >          return;

> > > > >      }

> > > > >

> > > > > +    netdev_prepare_tx_csum(batch->packets, batch->count);

> > > >

> > > > Putting this here assumes we only prepare the csum for vhost ->

> > > > dpdk or vhost -> ring cases. What about vhost -> vhost?

> > > >

> > > > > +

> > > > >      if (OVS_UNLIKELY(concurrent_txq)) {

> > > > >          qid = qid % dev->up.n_txq;

> > > > >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> > > > > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)

> > > > >      if (ovsthread_once_start(&once)) {

> > > > >          rte_vhost_driver_callback_register(&virtio_net_device_ops);

> > > > >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> > > > > -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6

> > > > > -                                  | 1ULL << VIRTIO_NET_F_CSUM);

> > > > > +                                  | 1ULL <<

> > > > > + VIRTIO_NET_F_HOST_TSO6);

> > > > >          ovs_thread_create("vhost_thread", start_vhost_loop,

> > > > > NULL);

> > > > >

> > > > >          ovsthread_once_done(&once);

> > > > > --

> > > > > 1.8.3.1

> > > > >

> > > > > _______________________________________________

> > > > > dev mailing list

> > > > > dev@openvswitch.org

> > > > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev

> > >

> >

> >

> _______________________________________________

> dev mailing list

> dev@openvswitch.org

> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Gao Zhenyu Aug. 24, 2017, 3:02 p.m. | #10
Thanks for the comments!

Yes, the best way is calculating cksum if destination need cksum.
But in current ovs-dpdk process,  it is hard to tell whether this whole
batch packets need cksum or not when delivering to destination.
If we check(check PKT_TX_L4_MASK and has l4 header) packets one by one will
introduce regression in some usecases. (In the previous email, Ciara give a
testing on my first patch and see about 4% regression in pure forwarding
packet testing )

About offlording to physical nic, I had make some testing on it and it
doesn't show significant improvment but disable dpdk tx vectorization.(may
not good for small packets) I prefer to implement software cksum first then
count hardware offloading later.

The VM I use for testing is centos7,kernel version
is 3.10.0-514.16.1.el7.x86_64. Supporting cksum has a additional benefit,
the vhost-net can enable NETIF_F_SG (enable scatter-gather feature).

2017-08-24 17:07 GMT+08:00 O Mahony, Billy <billy.o.mahony@intel.com>:

> Hi Gao,
>
> Thanks for working on this. Lack of checksum offload is big difference
> between ovs and ovs-dpdk when using linux stack in the guest.
>
> The thing that struck me was that rather than immediately calculating the
> L4 checksum in the host on vhost rx that the calculation should be delayed
> until it's known to be absolutely required to be done on the host. If the
> packet is for another VM a checksum is not required as the bits are not
> going over a physical medium. And if the packets is destined for a NIC then
> the checksum can be offloaded if the NIC supports it.
>
> I'm not sure why doing the L4 sum in the guest should give a performance
> gain. The processing still has to be done. Maybe the guest code was
> compiled for an older architecture and is not using as efficient a set of
> instructions?
>
> In any case the best advantage of having dpdk virtio device  support
> offload is if it can further offload to a NIC or avoid cksum entirely if
> the packet is destined for a local VM.
>
> Thanks,
> Billy.
>
>
> > -----Original Message-----
> > From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-
> > bounces@openvswitch.org] On Behalf Of Gao Zhenyu
> > Sent: Wednesday, August 23, 2017 4:12 PM
> > To: Loftus, Ciara <ciara.loftus@intel.com>
> > Cc: dev@openvswitch.org; users@dpdk.org
> > Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX
> > cksum in ovs-dpdk side
> >
> > Yes, maintaining only one implementation is resonable.
> > However making ovs-dpdk to support vhost tx-cksum first is doable as
> well.
> > We can have it in ovs, and replace it with new DPDK API once ovs update
> its
> > dpdk version which contains the tx-cksum implementation.
> >
> >
> > Thanks
> > Zhenyu Gao
> >
> > 2017-08-23 21:59 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
> >
> > > >
> > > > Hi Ciara
> > > >
> > > > You had a general concern below; can we conclude on that before
> > > > going further ?
> > > >
> > > > Thanks Darrell
> > > >
> > > > “
> > > > > On another note I have a general concern. I understand similar
> > > functionality
> > > > > is present in the DPDK vhost sample app. I wonder if it would be
> > > feasible
> > > > for
> > > > > this to be implemented in the DPDK vhost library and leveraged
> > > > > here,
> > > > rather
> > > > > than having two implementations in two separate code bases.
> > >
> > > This is something I'd like to see, although I wouldn't block on this
> > > patch waiting for it.
> > > Maybe we can have the initial implementation as it is (if it proves
> > > beneficial), then move to a common DPDK API if/when it becomes
> > available.
> > >
> > > I've cc'ed DPDK users list hoping for some input. To summarise:
> > > From my understanding, the DPDK vhost sample application calculates TX
> > > checksum for packets received from vHost ports with invalid/0
> checksums:
> > > http://dpdk.org/browse/dpdk/tree/examples/vhost/main.c#n910
> > > The patch being discussed in this thread (also here:
> > > https://patchwork.ozlabs.org/patch/802070/) it seems does something
> > > very similar.
> > > Wondering on the feasibility of putting this functionality in a
> > > rte_vhost library call such that we don't have two separate
> > implementations?
> > >
> > > Thanks,
> > > Ciara
> > >
> > > > >
> > > > > I have some other comments inline.
> > > > >
> > > > > Thanks,
> > > > > Ciara
> > > > “
> > > >
> > > >
> > > >
> > > > From: Gao Zhenyu <sysugaozhenyu@gmail.com>
> > > > Date: Wednesday, August 16, 2017 at 6:38 AM
> > > > To: "Loftus, Ciara" <ciara.loftus@intel.com>
> > > > Cc: "blp@ovn.org" <blp@ovn.org>, "Chandran, Sugesh"
> > > > <sugesh.chandran@intel.com>, "ktraynor@redhat.com"
> > > > <ktraynor@redhat.com>, Darrell Ball <dball@vmware.com>,
> > > > "dev@openvswitch.org" <dev@openvswitch.org>
> > > > Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX
> > > > cksum in ovs-dpdk side
> > > >
> > > > Hi Loftus,
> > > >    I had submitted a new version, please see
> > > > https://patchwork.ozlabs.org/patch/802070/
> > > >    It move the cksum to vhost receive side.
> > > > Thanks
> > > > Zhenyu Gao
> > > >
> > > > 2017-08-10 12:35 GMT+08:00 Gao Zhenyu <sysugaozhenyu@gmail.com>:
> > > > I see, for flows in phy-phy setup, they should not be calculate
> cksum.
> > > > I will revise my patch to do the cksum for vhost port only. I will
> > > > send
> > > a new
> > > > patch next week.
> > > >
> > > > Thanks
> > > > Zhenyu Gao
> > > >
> > > > 2017-08-08 17:53 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
> > > > >
> > > > > Hi Loftus,
> > > > >
> > > > > Thanks for testing and the comments!
> > > > > Can you show more details about your phy-vm-phy,phy-phy setup and
> > > > > testing steps? Then I can reproduce it to see if I can solve this
> > > > > pps
> > > problem.
> > > >
> > > > You're welcome. I forgot to mention my tests were with 64B packets.
> > > >
> > > > For phy-phy the setup is a single host with 2 dpdk physical ports
> > > > and 1
> > > flow
> > > > rule port1 -> port2.
> > > > See figure 3 here:
> > > > https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-
> > > > opnfv-04#section-4
> > > >
> > > > For the phy-vm-phy the setup is a single host with 2 dpdk physical
> > > > ports
> > > and 2
> > > > vhostuser ports with flow rules:
> > > > Dpdk1 -> vhost 1 & vhost2 -> dpdk2
> > > > IP rules are set up in the VM to route packets from vhost1 to vhost
> 2.
> > > > See figure 4 in the link above.
> > > >
> > > > >
> > > > > BTW, how about throughput, did you saw improvment?
> > > >
> > > > By throughput if you mean 0% packet loss, I did not test this.
> > > >
> > > > Thanks,
> > > > Ciara
> > > >
> > > > >
> > > > > I would like to implement vhost->vhost part.
> > > > >
> > > > > Thanks
> > > > > Zhenyu Gao
> > > > >
> > > > > 2017-08-04 22:52 GMT+08:00 Loftus, Ciara <ciara.loftus@intel.com>:
> > > > > >
> > > > > > Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx
> > > cksum.
> > > > > > So L4 packets's cksum were calculated in VM side but performance
> > > > > > is
> > > not
> > > > > > good.
> > > > > > Implementing tcp/udp tx cksum in ovs-dpdk side improves
> > > > > > throughput
> > > > and
> > > > > > makes virtio-net frontend-driver support NETIF_F_SG as well
> > > > > >
> > > > > > Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> > > > > > ---
> > > > > >
> > > > > > Here is some performance number:
> > > > > >
> > > > > > Setup:
> > > > > >
> > > > > >  qperf client
> > > > > > +---------+
> > > > > > |   VM    |
> > > > > > +---------+
> > > > > >      |
> > > > > >      |                          qperf server
> > > > > > +--------------+              +------------+
> > > > > > | vswitch+dpdk |              | bare-metal |
> > > > > > +--------------+              +------------+
> > > > > >        |                            |
> > > > > >        |                            |
> > > > > >       pNic---------PhysicalSwitch----
> > > > > >
> > > > > > do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K
> > > eth0 tx
> > > > > on'
> > > > > > in VM side.
> > > > > >                       It offload cksum job to ovs-dpdk side.
> > > > > >
> > > > > > do cksum in VM: Applied this patch and execute 'ethtool -K eth0
> > > > > > tx
> > > off' in
> > > > > VM
> > > > > > side.
> > > > > >                 VM calculate cksum for tcp/udp packets.
> > > > > >
> > > > > > We can see huge improvment in TCP throughput if we leverage
> > > > > > ovs-dpdk cksum.
> > > > > Hi Zhenyu,
> > > > >
> > > > > Thanks for the patch. I tested some alternative use cases and
> > > > unfortunately I
> > > > > see a degradation for phy-phy and phy-vm-phy topologies.
> > > > > Here are my results:
> > > > >
> > > > > phy-vm-phy:
> > > > > without patch: 0.871Mpps
> > > > > with patch (offload=on): 0.877Mpps with patch (offload=off):
> > > > > 0.891Mpps
> > > > >
> > > > > phy-phy:
> > > > > without patch: 13.581Mpps
> > > > > with patch: 13.055Mpps
> > > > >
> > > > > The half a million pps drop for the second test case is concerning
> > > > > to
> > > me but
> > > > > not surprising since we're adding extra complexity to
> > > netdev_dpdk_send()
> > > > > Could this be avoided? Would it make sense to put this
> > > > > functionality somewhere else eg. vhost receive?
> > > > >
> > > > > On another note I have a general concern. I understand similar
> > > functionality
> > > > > is present in the DPDK vhost sample app. I wonder if it would be
> > > feasible
> > > > for
> > > > > this to be implemented in the DPDK vhost library and leveraged
> > > > > here,
> > > > rather
> > > > > than having two implementations in two separate code bases.
> > > > >
> > > > > I have some other comments inline.
> > > > >
> > > > > Thanks,
> > > > > Ciara
> > > > >
> > > > > >
> > > > > > [root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2
> > > > > > host-qperf-
> > > > server01
> > > > > > tcp_bw tcp_lat udp_bw udp_lat
> > > > > >   do cksum in ovs-dpdk          do cksum in VM
>  without
> > > this patch
> > > > > > tcp_bw:
> > > > > >     bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =
> 1.95
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =
> 3.98
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =
> 8.19
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =
> 15.7
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =
> 29.7
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =
> 54.9
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =
> 95.1
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =
> 149
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =
> 216
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =
> 275
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =
> 321
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =
> 361
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =
> 419
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =
> 457
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =
> 480
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =
> 488
> > > MB/sec
> > > > > > tcp_bw:
> > > > > >     bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =
> 483
> > > MB/sec
> > > > > > tcp_lat:
> > > > > >     latency  =  29 us         latency  =  29.2 us       latency
> =
> > > 29.6 us
> > > > > > tcp_lat:
> > > > > >     latency  =  28.9 us       latency  =  29.3 us       latency
> =
> > > 29.5 us
> > > > > > tcp_lat:
> > > > > >     latency  =  29 us         latency  =  29.3 us       latency
> =
> > > 29.6 us
> > > > > > tcp_lat:
> > > > > >     latency  =  29 us         latency  =  29.4 us       latency
> =
> > > 29.5 us
> > > > > > tcp_lat:
> > > > > >     latency  =  29 us         latency  =  29.2 us       latency
> =
> > > 29.6 us
> > > > > > tcp_lat:
> > > > > >     latency  =  29.1 us       latency  =  29.3 us       latency
> =
> > > 29.7 us
> > > > > > tcp_lat:
> > > > > >     latency  =  29.4 us       latency  =  29.6 us       latency
> =
> > > 30 us
> > > > > > tcp_lat:
> > > > > >     latency  =  29.8 us       latency  =  30.1 us       latency
> =
> > > 30.2 us
> > > > > > tcp_lat:
> > > > > >     latency  =  30.9 us       latency  =  30.9 us       latency
> =
> > > 31 us
> > > > > > tcp_lat:
> > > > > >     latency  =  46.9 us       latency  =  46.2 us       latency
> =
> > > 32.2 us
> > > > > > tcp_lat:
> > > > > >     latency  =  51.5 us       latency  =  52.6 us       latency
> =
> > > 34.5 us
> > > > > > tcp_lat:
> > > > > >     latency  =  43.9 us       latency  =  43.8 us       latency
> =
> > > 43.6 us
> > > > > > tcp_lat:
> > > > > >      latency  =  47.6 us      latency  =  48 us         latency
> =
> > > 48.1 us
> > > > > > tcp_lat:
> > > > > >     latency  =  77.7 us       latency  =  78.8 us       latency
> =
> > > 78.8 us
> > > > > > tcp_lat:
> > > > > >     latency  =  82.8 us       latency  =  82.3 us       latency
> =
> > > 116 us
> > > > > > tcp_lat:
> > > > > >     latency  =  94.8 us       latency  =  94.2 us       latency
> =
> > > 134 us
> > > > > > tcp_lat:
> > > > > >     latency  =  167 us        latency  =  197 us        latency
> =
> > > 172 us
> > > > > > udp_bw:
> > > > > >     send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw
> =
> > > 403
> > > > > KB/sec
> > > > > >     recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw
> =
> > > 400
> > > > KB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw
> =
> > > 810
> > > > > KB/sec
> > > > > >     recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw
> =
> > > 807
> > > > KB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw
> =
> > > 1.63
> > > > > > MB/sec
> > > > > >     recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw
> =
> > > 1.63
> > > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw
> =
> > > 3.26
> > > > > > MB/sec
> > > > > >     recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw
> =
> > > 2.82
> > > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw
> =
> > > 6.45
> > > > > > MB/sec
> > > > > >     recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw
> =
> > > 6.45
> > > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw
> =
> > > 13
> > > > > > MB/sec
> > > > > >     recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw
> =
> > > 13
> > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw
> =
> > > 25.9
> > > > > > MB/sec
> > > > > >     recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw
> =
> > > 25.7
> > > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw
> =
> > >   52
> > > > > > MB/sec
> > > > > >     recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw
> =
> > > 51.2
> > > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw
> =
> > > 103
> > > > > > MB/sec
> > > > > >     recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw
> =
> > > 100
> > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw
> =
> > > 205
> > > > > > MB/sec
> > > > > >     recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw
> =
> > > 202
> > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw
> =
> > > 401
> > > > > > MB/sec
> > > > > >     recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw
> =
> > > 358
> > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw
> =
> > > 557
> > > > > > MB/sec
> > > > > >     recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw
> =
> > > 362
> > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw
> =
> > > 863
> > > > > > MB/sec
> > > > > >     recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw
> =
> > > 584
> > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw
> =
> > > 1.08
> > > > > > GB/sec
> > > > > >     recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw
> =
> > >  793
> > > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw
> =
> > > 1.19
> > > > > > GB/sec
> > > > > >     recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw
> =
> > >  673
> > > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw
> =
> > > 1.3
> > > > > GB/sec
> > > > > >     recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw
> =
> > > 762
> > > > > MB/sec
> > > > > > udp_bw:
> > > > > >     send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw
> =  0
> > > > > > bytes/sec
> > > > > >     recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw
> =  0
> > > > bytes/sec
> > > > > > udp_lat:
> > > > > >     latency  =  26.7 us       latency  =  26.5 us       latency
> =
> > > 26.4 us
> > > > > > udp_lat:
> > > > > >     latency  =  26.7 us       latency  =  26.5 us       latency
> =
> > > 26.3 us
> > > > > > udp_lat:
> > > > > >     latency  =  26.7 us       latency  =  26.7 us       latency
> =
> > > 26.3 us
> > > > > > udp_lat:
> > > > > >     latency  =  26.7 us       latency  =  26.6 us       latency
> =
> > > 26.3 us
> > > > > > udp_lat:
> > > > > >     latency  =  26.7 us       latency  =  26.7 us       latency
> =
> > > 26.7 us
> > > > > > udp_lat:
> > > > > >     latency  =  27 us         latency  =  26.7 us       latency
> =
> > > 26.6 us
> > > > > > udp_lat:
> > > > > >     latency  =  27 us         latency  =  26.9 us       latency
> =
> > > 26.7 us
> > > > > > udp_lat:
> > > > > >     latency  =  27.6 us       latency  =  27.4 us       latency
> =
> > > 27.3 us
> > > > > > udp_lat:
> > > > > >     latency  =  28.1 us       latency  =  28 us         latency
> =
> > > 28 us
> > > > > > udp_lat:
> > > > > >      latency  =  29.4 us      latency  =  29.2 us       latency
> =
> > > 29.2 us
> > > > > > udp_lat:
> > > > > >     latency  =  31 us         latency  =  31 us         latency
> =
> > > 30.8 us
> > > > > > udp_lat:
> > > > > >     latency  =  41.4 us       latency  =  41.4 us       latency
> =
> > > 41.3 us
> > > > > > udp_lat:
> > > > > >     latency  =  41.6 us       latency  =  41.5 us       latency
> =
> > > 41.5 us
> > > > > > udp_lat:
> > > > > >     latency  =  64.9 us       latency  =  65 us         latency
> =
> > > 65 us
> > > > > > udp_lat:
> > > > > >     latency  =  72.3 us       latency  =  72 us         latency
> =
> > > 72 us
> > > > > > udp_lat:
> > > > > >     latency  =  121 us        latency  =  122 us        latency
> =
> > > 122 us
> > > > > > udp_lat:
> > > > > >     latency  =  0 ns          latency  =  0 ns          latency
> =
> > > 0 ns
> > > > > >
> > > > > >
> > > > > >  lib/netdev-dpdk.c | 84
> > > > > >
> > +++++++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > > >  1 file changed, 82 insertions(+), 2 deletions(-)
> > > > > >
> > > > > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> > > > > > ea17b97..d27d615 100644
> > > > > > --- a/lib/netdev-dpdk.c
> > > > > > +++ b/lib/netdev-dpdk.c
> > > > > > @@ -28,6 +28,7 @@
> > > > > >  #include <rte_errno.h>
> > > > > >  #include <rte_eth_ring.h>
> > > > > >  #include <rte_ethdev.h>
> > > > > > +#include <rte_ip.h>
> > > > > >  #include <rte_malloc.h>
> > > > > >  #include <rte_mbuf.h>
> > > > > >  #include <rte_meter.h>
> > > > > > @@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct
> > netdev_rxq
> > > > > > *rxq)
> > > > > >      rte_free(rx);
> > > > > >  }
> > > > > >
> > > > > > +static inline void
> > > > > > +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
> > > > > > +                       uint8_t l4_proto, bool is_ipv4) {
> > > > > > +    void *l3hdr = (void *)(data + pkt->l3_ofs);
> > > > > > +
> > > > > > +    if (l4_proto == IPPROTO_TCP) {
> > > > > > +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data
> > > > > > + +
> > > pkt-
> > > > > >l4_ofs);
> > > > > > +
> > > > > > +        pkt->mbuf.l2_len = pkt->l3_ofs;
> > > > > > +        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > > > > > +        tcp_hdr->tcp_csum = 0;
> > > > > > +        if (is_ipv4) {
> > > > > > +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr,
> > > tcp_hdr);
> > > > > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM |
> PKT_TX_IPV4;
> > > > > > +        } else {
> > > > > > +            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM |
> PKT_TX_IPV6;
> > > > > > +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr,
> > > tcp_hdr);
> > > > > > +        }
> > > > > > +    } else if (l4_proto == IPPROTO_UDP) {
> > > > > > +        struct udp_header *udp_hdr = (struct udp_header *)(data
> > > > > > + +
> > > pkt-
> > > > > > >l4_ofs);
> > > > > > +        /* do not recalculate udp cksum if it was 0 */
> > > > > > +        if (udp_hdr->udp_csum != 0) {
> > > > > > +            pkt->mbuf.l2_len = pkt->l3_ofs;
> > > > > > +            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
> > > > > > +            udp_hdr->udp_csum = 0;
> > > > > > +            if (is_ipv4) {
> > > > > > +                /*do not calculate udp cksum if it was a
> > > > > > + fragment
> > > IP*/
> > > > > > +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> > > > > > +                                      fragment_offset)) {
> > > > > > +                    return;
> > > > > > +                }
> > > > > > +
> > > > > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM |
> > > PKT_TX_IPV4;
> > > > > > +                udp_hdr->udp_csum =
> > > > > > + rte_ipv4_udptcp_cksum(l3hdr,
> > > > udp_hdr);
> > > > > > +            } else {
> > > > > > +                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM |
> > > PKT_TX_IPV6;
> > > > > > +                udp_hdr->udp_csum =
> > > > > > + rte_ipv6_udptcp_cksum(l3hdr,
> > > > udp_hdr);
> > > > > > +            }
> > > > > > +        }
> > > > > > +    }
> > > > > > +}
> > > > > > +
> > > > > > +static inline void
> > > > > > +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt) {
> > > > > > +    int i;
> > > > > > +
> > > > > > +    for (i = 0; i < pkt_cnt; i++) {
> > > > > > +        ovs_be16 dl_type;
> > > > > > +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> > > > > > +        const char *data = dp_packet_data(pkt);
> > > > > > +        void *l3hdr = (char *)(data + pkt->l3_ofs);
> > > > > > +
> > > > > > +        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs ==
> > > > > > + UINT16_MAX)
> > > {
> > > > > > +            continue;
> > > > > > +        }
> > > > > > +        /* This take a assumption that it should be a vhost
> > > > > > + packet
> > > if this
> > > > > > +         * packet was allocated by DPDK pool and try sending to
> > > pNic. */
> > > > > > +        if (pkt->source == DPBUF_DPDK &&
> > > > > > +            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> > > > > > +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4
> > > > > > + packet
> > > need
> > > > > cksum
> > > > > > +            continue;
> > > > > > +        }
> > > > > The comments here could be formatted better. Suggest combining
> > > > > both
> > > > into
> > > > > one comment before the 'if'.
> > > > > Not sure the term 'pNIC' is widely used. Suggest using 'dpdk port'.
> > > > >
> > > > > > +
> > > > > > +        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
> > > > > > +        if (dl_type == htons(ETH_TYPE_IP)) {
> > > > > > +            netdev_refill_l4_cksum(data, pkt,
> > > > > > +                                   ((struct ipv4_hdr
> > > *)l3hdr)->next_proto_id,
> > > > > > +                                   true);
> > > > > > +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> > > > > > +            netdev_refill_l4_cksum(data, pkt,
> > > > > > +                                   ((struct ipv6_hdr
> > > *)l3hdr)->proto,
> > > > > > +                                   false);
> > > > > > +        }
> > > > > > +    }
> > > > > > +}
> > > > > > +
> > > > > >  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.
> > > > > > Takes
> > > ownership of
> > > > > >   * 'pkts', even in case of failure.
> > > > > >   *
> > > > > > @@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk
> > > > *dev,
> > > > > > int qid,
> > > > > >          return;
> > > > > >      }
> > > > > >
> > > > > > +    netdev_prepare_tx_csum(batch->packets, batch->count);
> > > > >
> > > > > Putting this here assumes we only prepare the csum for vhost ->
> > > > > dpdk or vhost -> ring cases. What about vhost -> vhost?
> > > > >
> > > > > > +
> > > > > >      if (OVS_UNLIKELY(concurrent_txq)) {
> > > > > >          qid = qid % dev->up.n_txq;
> > > > > >          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> > > > > > @@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
> > > > > >      if (ovsthread_once_start(&once)) {
> > > > > >          rte_vhost_driver_callback_register(&virtio_net_device_
> ops);
> > > > > >          rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> > > > > > -                                  | 1ULL <<
> VIRTIO_NET_F_HOST_TSO6
> > > > > > -                                  | 1ULL << VIRTIO_NET_F_CSUM);
> > > > > > +                                  | 1ULL <<
> > > > > > + VIRTIO_NET_F_HOST_TSO6);
> > > > > >          ovs_thread_create("vhost_thread", start_vhost_loop,
> > > > > > NULL);
> > > > > >
> > > > > >          ovsthread_once_done(&once);
> > > > > > --
> > > > > > 1.8.3.1
> > > > > >
> > > > > > _______________________________________________
> > > > > > dev mailing list
> > > > > > dev@openvswitch.org
> > > > > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> > > >
> > >
> > >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ea17b97..d27d615 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -28,6 +28,7 @@ 
 #include <rte_errno.h>
 #include <rte_eth_ring.h>
 #include <rte_ethdev.h>
+#include <rte_ip.h>
 #include <rte_malloc.h>
 #include <rte_mbuf.h>
 #include <rte_meter.h>
@@ -1392,6 +1393,84 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+static inline void
+netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
+                       uint8_t l4_proto, bool is_ipv4)
+{
+    void *l3hdr = (void *)(data + pkt->l3_ofs);
+
+    if (l4_proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt->l4_ofs);
+
+        pkt->mbuf.l2_len = pkt->l3_ofs;
+        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
+        tcp_hdr->tcp_csum = 0;
+        if (is_ipv4) {
+            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
+            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
+        } else {
+            pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
+            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
+        }
+    } else if (l4_proto == IPPROTO_UDP) {
+        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt->l4_ofs);
+        /* do not recalculate udp cksum if it was 0 */
+        if (udp_hdr->udp_csum != 0) {
+            pkt->mbuf.l2_len = pkt->l3_ofs;
+            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
+            udp_hdr->udp_csum = 0;
+            if (is_ipv4) {
+                /*do not calculate udp cksum if it was a fragment IP*/
+                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
+                                      fragment_offset)) {
+                    return;
+                }
+
+                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
+                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
+            } else {
+                pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
+                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
+            }
+        }
+    }
+}
+
+static inline void
+netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
+{
+    int i;
+
+    for (i = 0; i < pkt_cnt; i++) {
+        ovs_be16 dl_type;
+        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
+        const char *data = dp_packet_data(pkt);
+        void *l3hdr = (char *)(data + pkt->l3_ofs);
+
+        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
+            continue;
+        }
+        /* This take a assumption that it should be a vhost packet if this
+         * packet was allocated by DPDK pool and try sending to pNic. */
+        if (pkt->source == DPBUF_DPDK &&
+            !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
+            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need cksum
+            continue;
+        }
+
+        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
+        if (dl_type == htons(ETH_TYPE_IP)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,
+                                   true);
+        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv6_hdr *)l3hdr)->proto,
+                                   false);
+        }
+    }
+}
+
 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
  * 'pkts', even in case of failure.
  *
@@ -1833,6 +1912,8 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
         return;
     }
 
+    netdev_prepare_tx_csum(batch->packets, batch->count);
+
     if (OVS_UNLIKELY(concurrent_txq)) {
         qid = qid % dev->up.n_txq;
         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
@@ -2741,8 +2822,7 @@  netdev_dpdk_vhost_class_init(void)
     if (ovsthread_once_start(&once)) {
         rte_vhost_driver_callback_register(&virtio_net_device_ops);
         rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
-                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                  | 1ULL << VIRTIO_NET_F_CSUM);
+                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
         ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
 
         ovsthread_once_done(&once);